gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2016 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "diagnostic.h"
40 #include "insn-attr.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "stor-layout.h"
44 #include "calls.h"
45 #include "varasm.h"
46 #include "output.h"
47 #include "flags.h"
48 #include "explow.h"
49 #include "expr.h"
50 #include "reload.h"
51 #include "langhooks.h"
52 #include "opts.h"
53 #include "params.h"
54 #include "gimplify.h"
55 #include "dwarf2.h"
56 #include "gimple-iterator.h"
57 #include "tree-vectorizer.h"
58 #include "aarch64-cost-tables.h"
59 #include "dumpfile.h"
60 #include "builtins.h"
61 #include "rtl-iter.h"
62 #include "tm-constrs.h"
63 #include "sched-int.h"
64 #include "cortex-a57-fma-steering.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
68 /* This file should be included last. */
69 #include "target-def.h"
71 /* Defined for convenience. */
72 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
74 /* Classifies an address.
76 ADDRESS_REG_IMM
77 A simple base register plus immediate offset.
79 ADDRESS_REG_WB
80 A base register indexed by immediate offset with writeback.
82 ADDRESS_REG_REG
83 A base register indexed by (optionally scaled) register.
85 ADDRESS_REG_UXTW
86 A base register indexed by (optionally scaled) zero-extended register.
88 ADDRESS_REG_SXTW
89 A base register indexed by (optionally scaled) sign-extended register.
91 ADDRESS_LO_SUM
92 A LO_SUM rtx with a base register and "LO12" symbol relocation.
94 ADDRESS_SYMBOLIC:
95 A constant symbolic address, in pc-relative literal pool. */
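/* For illustration only, typical (not exhaustive) assembly forms that fall
   into each of these classes are:

     ADDRESS_REG_IMM      ldr  x0, [x1, #16]
     ADDRESS_REG_WB       ldr  x0, [x1, #16]!    or    ldr  x0, [x1], #16
     ADDRESS_REG_REG      ldr  x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW     ldr  x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW     ldr  x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM       ldr  x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC     ldr  x0, .Lliteral_pool_entry

   Register and label names are purely illustrative.  */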
97 enum aarch64_address_type {
98 ADDRESS_REG_IMM,
99 ADDRESS_REG_WB,
100 ADDRESS_REG_REG,
101 ADDRESS_REG_UXTW,
102 ADDRESS_REG_SXTW,
103 ADDRESS_LO_SUM,
104 ADDRESS_SYMBOLIC
105 };
107 struct aarch64_address_info {
108 enum aarch64_address_type type;
109 rtx base;
110 rtx offset;
111 int shift;
112 enum aarch64_symbol_type symbol_type;
113 };
115 struct simd_immediate_info
116 {
117 rtx value;
118 int shift;
119 int element_width;
120 bool mvn;
121 bool msl;
122 };
124 /* The current code model. */
125 enum aarch64_code_model aarch64_cmodel;
127 #ifdef HAVE_AS_TLS
128 #undef TARGET_HAVE_TLS
129 #define TARGET_HAVE_TLS 1
130 #endif
132 static bool aarch64_composite_type_p (const_tree, machine_mode);
133 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
134 const_tree,
135 machine_mode *, int *,
136 bool *);
137 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
139 static void aarch64_override_options_after_change (void);
140 static bool aarch64_vector_mode_supported_p (machine_mode);
141 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
142 const unsigned char *sel);
143 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
145 /* Major revision number of the ARM Architecture implemented by the target. */
146 unsigned aarch64_architecture_version;
148 /* The processor for which instructions should be scheduled. */
149 enum aarch64_processor aarch64_tune = cortexa53;
151 /* Mask to specify which instruction scheduling options should be used. */
152 unsigned long aarch64_tune_flags = 0;
154 /* Global flag for PC relative loads. */
155 bool aarch64_pcrelative_literal_loads;
157 /* Support for command line parsing of boolean flags in the tuning
158 structures. */
159 struct aarch64_flag_desc
160 {
161 const char* name;
162 unsigned int flag;
163 };
165 #define AARCH64_FUSION_PAIR(name, internal_name) \
166 { name, AARCH64_FUSE_##internal_name },
167 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
168 {
169 { "none", AARCH64_FUSE_NOTHING },
170 #include "aarch64-fusion-pairs.def"
171 { "all", AARCH64_FUSE_ALL },
172 { NULL, AARCH64_FUSE_NOTHING }
173 };
174 #undef AARCH64_FUSION_PAIR
176 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
177 { name, AARCH64_EXTRA_TUNE_##internal_name },
178 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
179 {
180 { "none", AARCH64_EXTRA_TUNE_NONE },
181 #include "aarch64-tuning-flags.def"
182 { "all", AARCH64_EXTRA_TUNE_ALL },
183 { NULL, AARCH64_EXTRA_TUNE_NONE }
184 };
185 #undef AARCH64_EXTRA_TUNING_OPTION
187 /* Tuning parameters. */
189 static const struct cpu_addrcost_table generic_addrcost_table =
192 0, /* hi */
193 0, /* si */
194 0, /* di */
195 0, /* ti */
197 0, /* pre_modify */
198 0, /* post_modify */
199 0, /* register_offset */
200 0, /* register_sextend */
201 0, /* register_zextend */
202 0 /* imm_offset */
205 static const struct cpu_addrcost_table cortexa57_addrcost_table =
208 1, /* hi */
209 0, /* si */
210 0, /* di */
211 1, /* ti */
213 0, /* pre_modify */
214 0, /* post_modify */
215 0, /* register_offset */
216 0, /* register_sextend */
217 0, /* register_zextend */
218 0, /* imm_offset */
221 static const struct cpu_addrcost_table exynosm1_addrcost_table =
224 0, /* hi */
225 0, /* si */
226 0, /* di */
227 2, /* ti */
229 0, /* pre_modify */
230 0, /* post_modify */
231 1, /* register_offset */
232 1, /* register_sextend */
233 2, /* register_zextend */
234 0, /* imm_offset */
237 static const struct cpu_addrcost_table xgene1_addrcost_table =
240 1, /* hi */
241 0, /* si */
242 0, /* di */
243 1, /* ti */
245 1, /* pre_modify */
246 0, /* post_modify */
247 0, /* register_offset */
248 1, /* register_sextend */
249 1, /* register_zextend */
250 0, /* imm_offset */
253 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
256 1, /* hi */
257 0, /* si */
258 0, /* di */
259 1, /* ti */
261 0, /* pre_modify */
262 0, /* post_modify */
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
266 0 /* imm_offset */
269 static const struct cpu_addrcost_table vulcan_addrcost_table =
272 0, /* hi */
273 0, /* si */
274 0, /* di */
275 2, /* ti */
277 0, /* pre_modify */
278 0, /* post_modify */
279 2, /* register_offset */
280 3, /* register_sextend */
281 3, /* register_zextend */
282 0, /* imm_offset */
285 static const struct cpu_regmove_cost generic_regmove_cost =
287 1, /* GP2GP */
288 /* Avoid the use of slow int<->fp moves for spilling by setting
289 their cost higher than memmov_cost. */
290 5, /* GP2FP */
291 5, /* FP2GP */
292 2 /* FP2FP */
295 static const struct cpu_regmove_cost cortexa57_regmove_cost =
297 1, /* GP2GP */
298 /* Avoid the use of slow int<->fp moves for spilling by setting
299 their cost higher than memmov_cost. */
300 5, /* GP2FP */
301 5, /* FP2GP */
302 2 /* FP2FP */
305 static const struct cpu_regmove_cost cortexa53_regmove_cost =
307 1, /* GP2GP */
308 /* Avoid the use of slow int<->fp moves for spilling by setting
309 their cost higher than memmov_cost. */
310 5, /* GP2FP */
311 5, /* FP2GP */
312 2 /* FP2FP */
315 static const struct cpu_regmove_cost exynosm1_regmove_cost =
317 1, /* GP2GP */
318 /* Avoid the use of slow int<->fp moves for spilling by setting
319 their cost higher than memmov_cost (actual, 4 and 9). */
320 9, /* GP2FP */
321 9, /* FP2GP */
322 1 /* FP2FP */
325 static const struct cpu_regmove_cost thunderx_regmove_cost =
327 2, /* GP2GP */
328 2, /* GP2FP */
329 6, /* FP2GP */
330 4 /* FP2FP */
333 static const struct cpu_regmove_cost xgene1_regmove_cost =
335 1, /* GP2GP */
336 /* Avoid the use of slow int<->fp moves for spilling by setting
337 their cost higher than memmov_cost. */
338 8, /* GP2FP */
339 8, /* FP2GP */
340 2 /* FP2FP */
343 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
345 2, /* GP2GP */
346 /* Avoid the use of int<->fp moves for spilling. */
347 6, /* GP2FP */
348 6, /* FP2GP */
349 4 /* FP2FP */
352 static const struct cpu_regmove_cost vulcan_regmove_cost =
354 1, /* GP2GP */
355 /* Avoid the use of int<->fp moves for spilling. */
356 8, /* GP2FP */
357 8, /* FP2GP */
358 4 /* FP2FP */
361 /* Generic costs for vector insn classes. */
362 static const struct cpu_vector_cost generic_vector_cost =
364 1, /* scalar_stmt_cost */
365 1, /* scalar_load_cost */
366 1, /* scalar_store_cost */
367 1, /* vec_stmt_cost */
368 2, /* vec_permute_cost */
369 1, /* vec_to_scalar_cost */
370 1, /* scalar_to_vec_cost */
371 1, /* vec_align_load_cost */
372 1, /* vec_unalign_load_cost */
373 1, /* vec_unalign_store_cost */
374 1, /* vec_store_cost */
375 3, /* cond_taken_branch_cost */
376 1 /* cond_not_taken_branch_cost */
379 /* ThunderX costs for vector insn classes. */
380 static const struct cpu_vector_cost thunderx_vector_cost =
382 1, /* scalar_stmt_cost */
383 3, /* scalar_load_cost */
384 1, /* scalar_store_cost */
385 4, /* vec_stmt_cost */
386 4, /* vec_permute_cost */
387 2, /* vec_to_scalar_cost */
388 2, /* scalar_to_vec_cost */
389 3, /* vec_align_load_cost */
390 10, /* vec_unalign_load_cost */
391 10, /* vec_unalign_store_cost */
392 1, /* vec_store_cost */
393 3, /* cond_taken_branch_cost */
394 3 /* cond_not_taken_branch_cost */
397 /* Generic costs for vector insn classes. */
398 static const struct cpu_vector_cost cortexa57_vector_cost =
400 1, /* scalar_stmt_cost */
401 4, /* scalar_load_cost */
402 1, /* scalar_store_cost */
403 3, /* vec_stmt_cost */
404 3, /* vec_permute_cost */
405 8, /* vec_to_scalar_cost */
406 8, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
415 static const struct cpu_vector_cost exynosm1_vector_cost =
417 1, /* scalar_stmt_cost */
418 5, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 3, /* vec_stmt_cost */
421 3, /* vec_permute_cost */
422 3, /* vec_to_scalar_cost */
423 3, /* scalar_to_vec_cost */
424 5, /* vec_align_load_cost */
425 5, /* vec_unalign_load_cost */
426 1, /* vec_unalign_store_cost */
427 1, /* vec_store_cost */
428 1, /* cond_taken_branch_cost */
429 1 /* cond_not_taken_branch_cost */
432 /* Generic costs for vector insn classes. */
433 static const struct cpu_vector_cost xgene1_vector_cost =
435 1, /* scalar_stmt_cost */
436 5, /* scalar_load_cost */
437 1, /* scalar_store_cost */
438 2, /* vec_stmt_cost */
439 2, /* vec_permute_cost */
440 4, /* vec_to_scalar_cost */
441 4, /* scalar_to_vec_cost */
442 10, /* vec_align_load_cost */
443 10, /* vec_unalign_load_cost */
444 2, /* vec_unalign_store_cost */
445 2, /* vec_store_cost */
446 2, /* cond_taken_branch_cost */
447 1 /* cond_not_taken_branch_cost */
450 /* Costs for vector insn classes for Vulcan. */
451 static const struct cpu_vector_cost vulcan_vector_cost =
453 6, /* scalar_stmt_cost */
454 4, /* scalar_load_cost */
455 1, /* scalar_store_cost */
456 6, /* vec_stmt_cost */
457 3, /* vec_permute_cost */
458 6, /* vec_to_scalar_cost */
459 5, /* scalar_to_vec_cost */
460 8, /* vec_align_load_cost */
461 8, /* vec_unalign_load_cost */
462 4, /* vec_unalign_store_cost */
463 4, /* vec_store_cost */
464 2, /* cond_taken_branch_cost */
465 1 /* cond_not_taken_branch_cost */
468 /* Generic costs for branch instructions. */
469 static const struct cpu_branch_cost generic_branch_cost =
471 2, /* Predictable. */
472 2 /* Unpredictable. */
475 /* Branch costs for Cortex-A57. */
476 static const struct cpu_branch_cost cortexa57_branch_cost =
478 1, /* Predictable. */
479 3 /* Unpredictable. */
482 /* Branch costs for Vulcan. */
483 static const struct cpu_branch_cost vulcan_branch_cost =
485 1, /* Predictable. */
486 3 /* Unpredictable. */
489 /* Generic approximation modes. */
490 static const cpu_approx_modes generic_approx_modes =
492 AARCH64_APPROX_NONE, /* division */
493 AARCH64_APPROX_NONE, /* sqrt */
494 AARCH64_APPROX_NONE /* recip_sqrt */
497 /* Approximation modes for Exynos M1. */
498 static const cpu_approx_modes exynosm1_approx_modes =
500 AARCH64_APPROX_NONE, /* division */
501 AARCH64_APPROX_ALL, /* sqrt */
502 AARCH64_APPROX_ALL /* recip_sqrt */
505 /* Approximation modes for X-Gene 1. */
506 static const cpu_approx_modes xgene1_approx_modes =
508 AARCH64_APPROX_NONE, /* division */
509 AARCH64_APPROX_NONE, /* sqrt */
510 AARCH64_APPROX_ALL /* recip_sqrt */
513 static const struct tune_params generic_tunings =
515 &cortexa57_extra_costs,
516 &generic_addrcost_table,
517 &generic_regmove_cost,
518 &generic_vector_cost,
519 &generic_branch_cost,
520 &generic_approx_modes,
521 4, /* memmov_cost */
522 2, /* issue_rate */
523 AARCH64_FUSE_NOTHING, /* fusible_ops */
524 8, /* function_align. */
525 8, /* jump_align. */
526 4, /* loop_align. */
527 2, /* int_reassoc_width. */
528 4, /* fp_reassoc_width. */
529 1, /* vec_reassoc_width. */
530 2, /* min_div_recip_mul_sf. */
531 2, /* min_div_recip_mul_df. */
532 0, /* max_case_values. */
533 0, /* cache_line_size. */
534 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
535 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
538 static const struct tune_params cortexa35_tunings =
540 &cortexa53_extra_costs,
541 &generic_addrcost_table,
542 &cortexa53_regmove_cost,
543 &generic_vector_cost,
544 &cortexa57_branch_cost,
545 &generic_approx_modes,
546 4, /* memmov_cost */
547 1, /* issue_rate */
548 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
549 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
550 16, /* function_align. */
551 8, /* jump_align. */
552 8, /* loop_align. */
553 2, /* int_reassoc_width. */
554 4, /* fp_reassoc_width. */
555 1, /* vec_reassoc_width. */
556 2, /* min_div_recip_mul_sf. */
557 2, /* min_div_recip_mul_df. */
558 0, /* max_case_values. */
559 0, /* cache_line_size. */
560 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
561 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
564 static const struct tune_params cortexa53_tunings =
566 &cortexa53_extra_costs,
567 &generic_addrcost_table,
568 &cortexa53_regmove_cost,
569 &generic_vector_cost,
570 &cortexa57_branch_cost,
571 &generic_approx_modes,
572 4, /* memmov_cost */
573 2, /* issue_rate */
574 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
575 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
576 16, /* function_align. */
577 8, /* jump_align. */
578 8, /* loop_align. */
579 2, /* int_reassoc_width. */
580 4, /* fp_reassoc_width. */
581 1, /* vec_reassoc_width. */
582 2, /* min_div_recip_mul_sf. */
583 2, /* min_div_recip_mul_df. */
584 0, /* max_case_values. */
585 0, /* cache_line_size. */
586 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
587 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
590 static const struct tune_params cortexa57_tunings =
592 &cortexa57_extra_costs,
593 &cortexa57_addrcost_table,
594 &cortexa57_regmove_cost,
595 &cortexa57_vector_cost,
596 &cortexa57_branch_cost,
597 &generic_approx_modes,
598 4, /* memmov_cost */
599 3, /* issue_rate */
600 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
601 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
602 16, /* function_align. */
603 8, /* jump_align. */
604 8, /* loop_align. */
605 2, /* int_reassoc_width. */
606 4, /* fp_reassoc_width. */
607 1, /* vec_reassoc_width. */
608 2, /* min_div_recip_mul_sf. */
609 2, /* min_div_recip_mul_df. */
610 0, /* max_case_values. */
611 0, /* cache_line_size. */
612 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
613 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
616 static const struct tune_params cortexa72_tunings =
618 &cortexa57_extra_costs,
619 &cortexa57_addrcost_table,
620 &cortexa57_regmove_cost,
621 &cortexa57_vector_cost,
622 &cortexa57_branch_cost,
623 &generic_approx_modes,
624 4, /* memmov_cost */
625 3, /* issue_rate */
626 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
627 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
628 16, /* function_align. */
629 8, /* jump_align. */
630 8, /* loop_align. */
631 2, /* int_reassoc_width. */
632 4, /* fp_reassoc_width. */
633 1, /* vec_reassoc_width. */
634 2, /* min_div_recip_mul_sf. */
635 2, /* min_div_recip_mul_df. */
636 0, /* max_case_values. */
637 0, /* cache_line_size. */
638 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
639 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
642 static const struct tune_params cortexa73_tunings =
644 &cortexa57_extra_costs,
645 &cortexa57_addrcost_table,
646 &cortexa57_regmove_cost,
647 &cortexa57_vector_cost,
648 &cortexa57_branch_cost,
649 &generic_approx_modes,
650 4, /* memmov_cost. */
651 2, /* issue_rate. */
652 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
653 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
654 16, /* function_align. */
655 8, /* jump_align. */
656 8, /* loop_align. */
657 2, /* int_reassoc_width. */
658 4, /* fp_reassoc_width. */
659 1, /* vec_reassoc_width. */
660 2, /* min_div_recip_mul_sf. */
661 2, /* min_div_recip_mul_df. */
662 0, /* max_case_values. */
663 0, /* cache_line_size. */
664 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
665 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
668 static const struct tune_params exynosm1_tunings =
670 &exynosm1_extra_costs,
671 &exynosm1_addrcost_table,
672 &exynosm1_regmove_cost,
673 &exynosm1_vector_cost,
674 &generic_branch_cost,
675 &exynosm1_approx_modes,
676 4, /* memmov_cost */
677 3, /* issue_rate */
678 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
679 4, /* function_align. */
680 4, /* jump_align. */
681 4, /* loop_align. */
682 2, /* int_reassoc_width. */
683 4, /* fp_reassoc_width. */
684 1, /* vec_reassoc_width. */
685 2, /* min_div_recip_mul_sf. */
686 2, /* min_div_recip_mul_df. */
687 48, /* max_case_values. */
688 64, /* cache_line_size. */
689 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
690 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
693 static const struct tune_params thunderx_tunings =
695 &thunderx_extra_costs,
696 &generic_addrcost_table,
697 &thunderx_regmove_cost,
698 &thunderx_vector_cost,
699 &generic_branch_cost,
700 &generic_approx_modes,
701 6, /* memmov_cost */
702 2, /* issue_rate */
703 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
704 8, /* function_align. */
705 8, /* jump_align. */
706 8, /* loop_align. */
707 2, /* int_reassoc_width. */
708 4, /* fp_reassoc_width. */
709 1, /* vec_reassoc_width. */
710 2, /* min_div_recip_mul_sf. */
711 2, /* min_div_recip_mul_df. */
712 0, /* max_case_values. */
713 0, /* cache_line_size. */
714 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
715 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
718 static const struct tune_params xgene1_tunings =
720 &xgene1_extra_costs,
721 &xgene1_addrcost_table,
722 &xgene1_regmove_cost,
723 &xgene1_vector_cost,
724 &generic_branch_cost,
725 &xgene1_approx_modes,
726 6, /* memmov_cost */
727 4, /* issue_rate */
728 AARCH64_FUSE_NOTHING, /* fusible_ops */
729 16, /* function_align. */
730 8, /* jump_align. */
731 16, /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 0, /* cache_line_size. */
739 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
743 static const struct tune_params qdf24xx_tunings =
745 &qdf24xx_extra_costs,
746 &qdf24xx_addrcost_table,
747 &qdf24xx_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
751 4, /* memmov_cost */
752 4, /* issue_rate */
753 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
754 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
755 16, /* function_align. */
756 8, /* jump_align. */
757 16, /* loop_align. */
758 2, /* int_reassoc_width. */
759 4, /* fp_reassoc_width. */
760 1, /* vec_reassoc_width. */
761 2, /* min_div_recip_mul_sf. */
762 2, /* min_div_recip_mul_df. */
763 0, /* max_case_values. */
764 64, /* cache_line_size. */
765 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
769 static const struct tune_params vulcan_tunings =
771 &vulcan_extra_costs,
772 &vulcan_addrcost_table,
773 &vulcan_regmove_cost,
774 &vulcan_vector_cost,
775 &vulcan_branch_cost,
776 &generic_approx_modes,
777 4, /* memmov_cost. */
778 4, /* issue_rate. */
779 AARCH64_FUSE_NOTHING, /* fusible_ops. */
780 16, /* function_align. */
781 8, /* jump_align. */
782 16, /* loop_align. */
783 3, /* int_reassoc_width. */
784 2, /* fp_reassoc_width. */
785 2, /* vec_reassoc_width. */
786 2, /* min_div_recip_mul_sf. */
787 2, /* min_div_recip_mul_df. */
788 0, /* max_case_values. */
789 64, /* cache_line_size. */
790 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
791 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
794 /* Support for fine-grained override of the tuning structures. */
795 struct aarch64_tuning_override_function
797 const char* name;
798 void (*parse_override)(const char*, struct tune_params*);
801 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
802 static void aarch64_parse_tune_string (const char*, struct tune_params*);
804 static const struct aarch64_tuning_override_function
805 aarch64_tuning_override_functions[] =
807 { "fuse", aarch64_parse_fuse_string },
808 { "tune", aarch64_parse_tune_string },
809 { NULL, NULL }
812 /* A processor implementing AArch64. */
813 struct processor
815 const char *const name;
816 enum aarch64_processor ident;
817 enum aarch64_processor sched_core;
818 enum aarch64_arch arch;
819 unsigned architecture_version;
820 const unsigned long flags;
821 const struct tune_params *const tune;
824 /* Architectures implementing AArch64. */
825 static const struct processor all_architectures[] =
827 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
828 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
829 #include "aarch64-arches.def"
830 #undef AARCH64_ARCH
831 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
834 /* Processor cores implementing AArch64. */
835 static const struct processor all_cores[] =
837 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
838 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
839 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
840 FLAGS, &COSTS##_tunings},
841 #include "aarch64-cores.def"
842 #undef AARCH64_CORE
843 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
844 AARCH64_FL_FOR_ARCH8, &generic_tunings},
845 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
849 /* Target specification. These are populated by the -march, -mtune, -mcpu
850 handling code or by target attributes. */
851 static const struct processor *selected_arch;
852 static const struct processor *selected_cpu;
853 static const struct processor *selected_tune;
855 /* The current tuning set. */
856 struct tune_params aarch64_tune_params = generic_tunings;
858 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
860 /* An ISA extension in the co-processor and main instruction set space. */
861 struct aarch64_option_extension
863 const char *const name;
864 const unsigned long flags_on;
865 const unsigned long flags_off;
868 typedef enum aarch64_cond_code
870 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
871 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
872 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
874 aarch64_cc;
876 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
878 /* The condition codes of the processor, and the inverse function. */
879 static const char * const aarch64_condition_codes[] =
881 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
882 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
885 /* Generate code to enable conditional branches in functions over 1 MiB. */
886 const char *
887 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
888 const char * branch_format)
890 rtx_code_label * tmp_label = gen_label_rtx ();
891 char label_buf[256];
892 char buffer[128];
893 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
894 CODE_LABEL_NUMBER (tmp_label));
895 const char *label_ptr = targetm.strip_name_encoding (label_buf);
896 rtx dest_label = operands[pos_label];
897 operands[pos_label] = tmp_label;
899 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
900 output_asm_insn (buffer, operands);
902 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
903 operands[pos_label] = dest_label;
904 output_asm_insn (buffer, operands);
905 return "";
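/* A sketch of the resulting code (label names purely illustrative): a
   conditional branch whose target lies outside the +/-1 MiB range of B.cond
   is emitted as an inverted short branch around an unconditional branch,
   the caller supplying the already-inverted mnemonic via BRANCH_FORMAT:

     b.eq  .Lfar_target      becomes      b.ne  .Lbcond42
                                          b     .Lfar_target
                                      .Lbcond42:                      */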
908 void
909 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
911 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
912 if (TARGET_GENERAL_REGS_ONLY)
913 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
914 else
915 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
918 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
919 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
920 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
921 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
922 cost (in this case the best class is the lowest cost one). Using ALL_REGS
923 irrespective of its cost results in bad allocations with many redundant
924 int<->FP moves which are expensive on various cores.
925 To avoid this we don't allow ALL_REGS as the allocno class, but force a
926 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
927 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
928 Otherwise set the allocno class depending on the mode.
929 The result of this is that it is no longer inefficient to have a higher
930 memory move cost than the register move cost.
933 static reg_class_t
934 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
935 reg_class_t best_class)
937 enum machine_mode mode;
939 if (allocno_class != ALL_REGS)
940 return allocno_class;
942 if (best_class != ALL_REGS)
943 return best_class;
945 mode = PSEUDO_REGNO_MODE (regno);
946 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
949 static unsigned int
950 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
952 if (GET_MODE_UNIT_SIZE (mode) == 4)
953 return aarch64_tune_params.min_div_recip_mul_sf;
954 return aarch64_tune_params.min_div_recip_mul_df;
957 static int
958 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
959 enum machine_mode mode)
961 if (VECTOR_MODE_P (mode))
962 return aarch64_tune_params.vec_reassoc_width;
963 if (INTEGRAL_MODE_P (mode))
964 return aarch64_tune_params.int_reassoc_width;
965 if (FLOAT_MODE_P (mode))
966 return aarch64_tune_params.fp_reassoc_width;
967 return 1;
970 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
971 unsigned
972 aarch64_dbx_register_number (unsigned regno)
974 if (GP_REGNUM_P (regno))
975 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
976 else if (regno == SP_REGNUM)
977 return AARCH64_DWARF_SP;
978 else if (FP_REGNUM_P (regno))
979 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
981 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
982 equivalent DWARF register. */
983 return DWARF_FRAME_REGISTERS;
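/* For example, under the AArch64 DWARF register numbering this maps x0-x30
   to 0-30, sp to 31 and v0-v31 to 64-95, so x5 -> 5, sp -> 31 and v3 -> 67;
   anything else (e.g. the condition flags register) gets the out-of-range
   value DWARF_FRAME_REGISTERS.  */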
986 /* Return TRUE if MODE is any of the large INT modes. */
987 static bool
988 aarch64_vect_struct_mode_p (machine_mode mode)
990 return mode == OImode || mode == CImode || mode == XImode;
993 /* Return TRUE if MODE is any of the vector modes. */
994 static bool
995 aarch64_vector_mode_p (machine_mode mode)
997 return aarch64_vector_mode_supported_p (mode)
998 || aarch64_vect_struct_mode_p (mode);
1001 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1002 static bool
1003 aarch64_array_mode_supported_p (machine_mode mode,
1004 unsigned HOST_WIDE_INT nelems)
1006 if (TARGET_SIMD
1007 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1008 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1009 && (nelems >= 2 && nelems <= 4))
1010 return true;
1012 return false;
1015 /* Implement HARD_REGNO_NREGS. */
1018 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1020 switch (aarch64_regno_regclass (regno))
1022 case FP_REGS:
1023 case FP_LO_REGS:
1024 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1025 default:
1026 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1028 gcc_unreachable ();
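/* A worked example, assuming the usual AArch64 values UNITS_PER_WORD == 8
   and UNITS_PER_VREG == 16: TImode (16 bytes) needs 2 general registers but
   only 1 FP/SIMD register, while OImode (a 32-byte two-vector structure
   mode) needs 2 FP/SIMD registers.  */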
1031 /* Implement HARD_REGNO_MODE_OK. */
1034 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1036 if (GET_MODE_CLASS (mode) == MODE_CC)
1037 return regno == CC_REGNUM;
1039 if (regno == SP_REGNUM)
1040 /* The purpose of comparing with ptr_mode is to support the
1041 global register variable associated with the stack pointer
1042 register via the syntax of asm ("wsp") in ILP32. */
1043 return mode == Pmode || mode == ptr_mode;
1045 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1046 return mode == Pmode;
1048 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1049 return 1;
1051 if (FP_REGNUM_P (regno))
1053 if (aarch64_vect_struct_mode_p (mode))
1054 return
1055 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1056 else
1057 return 1;
1060 return 0;
1063 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1064 machine_mode
1065 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1066 machine_mode mode)
1068 /* Handle modes that fit within single registers. */
1069 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1071 if (GET_MODE_SIZE (mode) >= 4)
1072 return mode;
1073 else
1074 return SImode;
1076 /* Fall back to generic for multi-reg and very large modes. */
1077 else
1078 return choose_hard_reg_mode (regno, nregs, false);
1081 /* Return true if calls to DECL should be treated as
1082 long-calls (ie called via a register). */
1083 static bool
1084 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1086 return false;
1089 /* Return true if calls to symbol-ref SYM should be treated as
1090 long-calls (ie called via a register). */
1091 bool
1092 aarch64_is_long_call_p (rtx sym)
1094 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1097 /* Return true if calls to symbol-ref SYM should not go through
1098 plt stubs. */
1100 bool
1101 aarch64_is_noplt_call_p (rtx sym)
1103 const_tree decl = SYMBOL_REF_DECL (sym);
1105 if (flag_pic
1106 && decl
1107 && (!flag_plt
1108 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1109 && !targetm.binds_local_p (decl))
1110 return true;
1112 return false;
1115 /* Return true if the offsets to a zero/sign-extract operation
1116 represent an expression that matches an extend operation. The
1117 operands represent the parameters from
1119 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1120 bool
1121 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1122 rtx extract_imm)
1124 HOST_WIDE_INT mult_val, extract_val;
1126 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1127 return false;
1129 mult_val = INTVAL (mult_imm);
1130 extract_val = INTVAL (extract_imm);
1132 if (extract_val > 8
1133 && extract_val < GET_MODE_BITSIZE (mode)
1134 && exact_log2 (extract_val & ~7) > 0
1135 && (extract_val & 7) <= 4
1136 && mult_val == (1 << (extract_val & 7)))
1137 return true;
1139 return false;
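/* A worked example: MULT_IMM == 4 and EXTRACT_IMM == 34 in DImode passes
   every check above (34 > 8, 34 < 64, exact_log2 (32) > 0, (34 & 7) == 2
   which is <= 4, and 4 == 1 << 2).  Extracting the low 34 bits of
   (reg * 4) is equivalent to (zero_extend:DI (reg:SI)) << 2, i.e. the
   extended-register operand form "uxtw #2", as in

     add  x0, x1, w2, uxtw #2

   (register names illustrative).  */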
1142 /* Emit an insn that's a simple single-set. Both the operands must be
1143 known to be valid. */
1144 inline static rtx
1145 emit_set_insn (rtx x, rtx y)
1147 return emit_insn (gen_rtx_SET (x, y));
1150 /* X and Y are two things to compare using CODE. Emit the compare insn and
1151 return the rtx for register 0 in the proper mode. */
1153 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1155 machine_mode mode = SELECT_CC_MODE (code, x, y);
1156 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1158 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1159 return cc_reg;
1162 /* Build the SYMBOL_REF for __tls_get_addr. */
1164 static GTY(()) rtx tls_get_addr_libfunc;
1167 aarch64_tls_get_addr (void)
1169 if (!tls_get_addr_libfunc)
1170 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1171 return tls_get_addr_libfunc;
1174 /* Return the TLS model to use for ADDR. */
1176 static enum tls_model
1177 tls_symbolic_operand_type (rtx addr)
1179 enum tls_model tls_kind = TLS_MODEL_NONE;
1180 rtx sym, addend;
1182 if (GET_CODE (addr) == CONST)
1184 split_const (addr, &sym, &addend);
1185 if (GET_CODE (sym) == SYMBOL_REF)
1186 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1188 else if (GET_CODE (addr) == SYMBOL_REF)
1189 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1191 return tls_kind;
1194 /* We allow LO_SUMs in our legitimate addresses so that combine can
1195 take care of combining addresses where necessary, but for generation
1196 purposes we generate the address as:
1198 RTL Absolute
1199 tmp = hi (symbol_ref); adrp x1, foo
1200 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1203 PIC TLS
1204 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1205 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1206 bl __tls_get_addr
1209 Load TLS symbol, depending on TLS mechanism and TLS access model.
1211 Global Dynamic - Traditional TLS:
1212 adrp tmp, :tlsgd:imm
1213 add dest, tmp, #:tlsgd_lo12:imm
1214 bl __tls_get_addr
1216 Global Dynamic - TLS Descriptors:
1217 adrp dest, :tlsdesc:imm
1218 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1219 add dest, dest, #:tlsdesc_lo12:imm
1220 blr tmp
1221 mrs tp, tpidr_el0
1222 add dest, dest, tp
1224 Initial Exec:
1225 mrs tp, tpidr_el0
1226 adrp tmp, :gottprel:imm
1227 ldr dest, [tmp, #:gottprel_lo12:imm]
1228 add dest, dest, tp
1230 Local Exec:
1231 mrs tp, tpidr_el0
1232 add t0, tp, #:tprel_hi12:imm, lsl #12
1233 add t0, t0, #:tprel_lo12_nc:imm
1236 static void
1237 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1238 enum aarch64_symbol_type type)
1240 switch (type)
1242 case SYMBOL_SMALL_ABSOLUTE:
1244 /* In ILP32, the mode of dest can be either SImode or DImode. */
1245 rtx tmp_reg = dest;
1246 machine_mode mode = GET_MODE (dest);
1248 gcc_assert (mode == Pmode || mode == ptr_mode);
1250 if (can_create_pseudo_p ())
1251 tmp_reg = gen_reg_rtx (mode);
1253 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1254 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1255 return;
1258 case SYMBOL_TINY_ABSOLUTE:
1259 emit_insn (gen_rtx_SET (dest, imm));
1260 return;
1262 case SYMBOL_SMALL_GOT_28K:
1264 machine_mode mode = GET_MODE (dest);
1265 rtx gp_rtx = pic_offset_table_rtx;
1266 rtx insn;
1267 rtx mem;
1269 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1270 here before RTL expansion. Tree IVOPTs generates RTL patterns to
1271 compute rtx costs, in which case pic_offset_table_rtx is not yet
1272 initialized. In that case there is no need to generate the first
1273 adrp instruction, as the final cost of a global variable access is
1274 one instruction.
1275 if (gp_rtx != NULL)
1277 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1278 use the page base as the GOT base, the first page may be wasted;
1279 in the worst case only 28K of space is left for the GOT).
1281 The instruction sequence generated for accessing a global variable is:
1284 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1286 Only one instruction is needed, but we must initialize
1287 pic_offset_table_rtx properly.  We generate an initialization insn for
1288 every global access and rely on CSE to remove the redundant copies.
1290 The final instruction sequence for multiple global variable
1291 accesses will look like:
1293 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1295 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1296 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1297 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1298 ... */
1300 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1301 crtl->uses_pic_offset_table = 1;
1302 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1304 if (mode != GET_MODE (gp_rtx))
1305 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1308 if (mode == ptr_mode)
1310 if (mode == DImode)
1311 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1312 else
1313 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1315 mem = XVECEXP (SET_SRC (insn), 0, 0);
1317 else
1319 gcc_assert (mode == Pmode);
1321 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1322 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1325 /* The operand is expected to be MEM. Whenever the related insn
1326 pattern changed, above code which calculate mem should be
1327 updated. */
1328 gcc_assert (GET_CODE (mem) == MEM);
1329 MEM_READONLY_P (mem) = 1;
1330 MEM_NOTRAP_P (mem) = 1;
1331 emit_insn (insn);
1332 return;
1335 case SYMBOL_SMALL_GOT_4G:
1337 /* In ILP32, the mode of dest can be either SImode or DImode,
1338 while the got entry is always of SImode size. The mode of
1339 dest depends on how dest is used: if dest is assigned to a
1340 pointer (e.g. in the memory), it has SImode; it may have
1341 DImode if dest is dereferenced to access the memory.
1342 This is why we have to handle three different ldr_got_small
1343 patterns here (two patterns for ILP32). */
1345 rtx insn;
1346 rtx mem;
1347 rtx tmp_reg = dest;
1348 machine_mode mode = GET_MODE (dest);
1350 if (can_create_pseudo_p ())
1351 tmp_reg = gen_reg_rtx (mode);
1353 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1354 if (mode == ptr_mode)
1356 if (mode == DImode)
1357 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1358 else
1359 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1361 mem = XVECEXP (SET_SRC (insn), 0, 0);
1363 else
1365 gcc_assert (mode == Pmode);
1367 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1368 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1371 gcc_assert (GET_CODE (mem) == MEM);
1372 MEM_READONLY_P (mem) = 1;
1373 MEM_NOTRAP_P (mem) = 1;
1374 emit_insn (insn);
1375 return;
1378 case SYMBOL_SMALL_TLSGD:
1380 rtx_insn *insns;
1381 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1383 start_sequence ();
1384 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1385 insns = get_insns ();
1386 end_sequence ();
1388 RTL_CONST_CALL_P (insns) = 1;
1389 emit_libcall_block (insns, dest, result, imm);
1390 return;
1393 case SYMBOL_SMALL_TLSDESC:
1395 machine_mode mode = GET_MODE (dest);
1396 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1397 rtx tp;
1399 gcc_assert (mode == Pmode || mode == ptr_mode);
1401 /* In ILP32, the got entry is always of SImode size. Unlike
1402 small GOT, the dest is fixed at reg 0. */
1403 if (TARGET_ILP32)
1404 emit_insn (gen_tlsdesc_small_si (imm));
1405 else
1406 emit_insn (gen_tlsdesc_small_di (imm));
1407 tp = aarch64_load_tp (NULL);
1409 if (mode != Pmode)
1410 tp = gen_lowpart (mode, tp);
1412 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1413 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1414 return;
1417 case SYMBOL_SMALL_TLSIE:
1419 /* In ILP32, the mode of dest can be either SImode or DImode,
1420 while the got entry is always of SImode size. The mode of
1421 dest depends on how dest is used: if dest is assigned to a
1422 pointer (e.g. in the memory), it has SImode; it may have
1423 DImode if dest is dereferenced to access the memory.
1424 This is why we have to handle three different tlsie_small
1425 patterns here (two patterns for ILP32). */
1426 machine_mode mode = GET_MODE (dest);
1427 rtx tmp_reg = gen_reg_rtx (mode);
1428 rtx tp = aarch64_load_tp (NULL);
1430 if (mode == ptr_mode)
1432 if (mode == DImode)
1433 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1434 else
1436 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1437 tp = gen_lowpart (mode, tp);
1440 else
1442 gcc_assert (mode == Pmode);
1443 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1446 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1447 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1448 return;
1451 case SYMBOL_TLSLE12:
1452 case SYMBOL_TLSLE24:
1453 case SYMBOL_TLSLE32:
1454 case SYMBOL_TLSLE48:
1456 machine_mode mode = GET_MODE (dest);
1457 rtx tp = aarch64_load_tp (NULL);
1459 if (mode != Pmode)
1460 tp = gen_lowpart (mode, tp);
1462 switch (type)
1464 case SYMBOL_TLSLE12:
1465 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1466 (dest, tp, imm));
1467 break;
1468 case SYMBOL_TLSLE24:
1469 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1470 (dest, tp, imm));
1471 break;
1472 case SYMBOL_TLSLE32:
1473 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1474 (dest, imm));
1475 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1476 (dest, dest, tp));
1477 break;
1478 case SYMBOL_TLSLE48:
1479 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1480 (dest, imm));
1481 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1482 (dest, dest, tp));
1483 break;
1484 default:
1485 gcc_unreachable ();
1488 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1489 return;
1492 case SYMBOL_TINY_GOT:
1493 emit_insn (gen_ldr_got_tiny (dest, imm));
1494 return;
1496 case SYMBOL_TINY_TLSIE:
1498 machine_mode mode = GET_MODE (dest);
1499 rtx tp = aarch64_load_tp (NULL);
1501 if (mode == ptr_mode)
1503 if (mode == DImode)
1504 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1505 else
1507 tp = gen_lowpart (mode, tp);
1508 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1511 else
1513 gcc_assert (mode == Pmode);
1514 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1517 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1518 return;
1521 default:
1522 gcc_unreachable ();
1526 /* Emit a move from SRC to DEST. Assume that the move expanders can
1527 handle all moves if !can_create_pseudo_p (). The distinction is
1528 important because, unlike emit_move_insn, the move expanders know
1529 how to force Pmode objects into the constant pool even when the
1530 constant pool address is not itself legitimate. */
1531 static rtx
1532 aarch64_emit_move (rtx dest, rtx src)
1534 return (can_create_pseudo_p ()
1535 ? emit_move_insn (dest, src)
1536 : emit_move_insn_1 (dest, src));
1539 /* Split a 128-bit move operation into two 64-bit move operations,
1540 taking care to handle partial overlap of register to register
1541 copies. Special cases are needed when moving between GP regs and
1542 FP regs. SRC can be a register, constant or memory; DST a register
1543 or memory. If either operand is memory it must not have any side
1544 effects. */
1545 void
1546 aarch64_split_128bit_move (rtx dst, rtx src)
1548 rtx dst_lo, dst_hi;
1549 rtx src_lo, src_hi;
1551 machine_mode mode = GET_MODE (dst);
1553 gcc_assert (mode == TImode || mode == TFmode);
1554 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1555 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1557 if (REG_P (dst) && REG_P (src))
1559 int src_regno = REGNO (src);
1560 int dst_regno = REGNO (dst);
1562 /* Handle FP <-> GP regs. */
1563 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1565 src_lo = gen_lowpart (word_mode, src);
1566 src_hi = gen_highpart (word_mode, src);
1568 if (mode == TImode)
1570 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1571 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1573 else
1575 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1576 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1578 return;
1580 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1582 dst_lo = gen_lowpart (word_mode, dst);
1583 dst_hi = gen_highpart (word_mode, dst);
1585 if (mode == TImode)
1587 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1588 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1590 else
1592 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1593 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1595 return;
1599 dst_lo = gen_lowpart (word_mode, dst);
1600 dst_hi = gen_highpart (word_mode, dst);
1601 src_lo = gen_lowpart (word_mode, src);
1602 src_hi = gen_highpart_mode (word_mode, mode, src);
1604 /* At most one pairing may overlap. */
1605 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1607 aarch64_emit_move (dst_hi, src_hi);
1608 aarch64_emit_move (dst_lo, src_lo);
1610 else
1612 aarch64_emit_move (dst_lo, src_lo);
1613 aarch64_emit_move (dst_hi, src_hi);
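/* As an illustration of the overlap handling above, consider copying a
   TImode value from x0/x1 (low part in x0) into x1/x2: dst_lo (x1) overlaps
   src_hi (x1), so the high halves are moved first (x2 = x1) and the low
   halves second (x1 = x0); doing it in the other order would clobber x1
   before its value had been copied.  */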
1617 bool
1618 aarch64_split_128bit_move_p (rtx dst, rtx src)
1620 return (! REG_P (src)
1621 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1624 /* Split a complex SIMD combine. */
1626 void
1627 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1629 machine_mode src_mode = GET_MODE (src1);
1630 machine_mode dst_mode = GET_MODE (dst);
1632 gcc_assert (VECTOR_MODE_P (dst_mode));
1634 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1636 rtx (*gen) (rtx, rtx, rtx);
1638 switch (src_mode)
1640 case V8QImode:
1641 gen = gen_aarch64_simd_combinev8qi;
1642 break;
1643 case V4HImode:
1644 gen = gen_aarch64_simd_combinev4hi;
1645 break;
1646 case V2SImode:
1647 gen = gen_aarch64_simd_combinev2si;
1648 break;
1649 case V4HFmode:
1650 gen = gen_aarch64_simd_combinev4hf;
1651 break;
1652 case V2SFmode:
1653 gen = gen_aarch64_simd_combinev2sf;
1654 break;
1655 case DImode:
1656 gen = gen_aarch64_simd_combinedi;
1657 break;
1658 case DFmode:
1659 gen = gen_aarch64_simd_combinedf;
1660 break;
1661 default:
1662 gcc_unreachable ();
1665 emit_insn (gen (dst, src1, src2));
1666 return;
1670 /* Split a complex SIMD move. */
1672 void
1673 aarch64_split_simd_move (rtx dst, rtx src)
1675 machine_mode src_mode = GET_MODE (src);
1676 machine_mode dst_mode = GET_MODE (dst);
1678 gcc_assert (VECTOR_MODE_P (dst_mode));
1680 if (REG_P (dst) && REG_P (src))
1682 rtx (*gen) (rtx, rtx);
1684 gcc_assert (VECTOR_MODE_P (src_mode));
1686 switch (src_mode)
1688 case V16QImode:
1689 gen = gen_aarch64_split_simd_movv16qi;
1690 break;
1691 case V8HImode:
1692 gen = gen_aarch64_split_simd_movv8hi;
1693 break;
1694 case V4SImode:
1695 gen = gen_aarch64_split_simd_movv4si;
1696 break;
1697 case V2DImode:
1698 gen = gen_aarch64_split_simd_movv2di;
1699 break;
1700 case V8HFmode:
1701 gen = gen_aarch64_split_simd_movv8hf;
1702 break;
1703 case V4SFmode:
1704 gen = gen_aarch64_split_simd_movv4sf;
1705 break;
1706 case V2DFmode:
1707 gen = gen_aarch64_split_simd_movv2df;
1708 break;
1709 default:
1710 gcc_unreachable ();
1713 emit_insn (gen (dst, src));
1714 return;
1718 bool
1719 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1720 machine_mode ymode, rtx y)
1722 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1723 gcc_assert (r != NULL);
1724 return rtx_equal_p (x, r);
1728 static rtx
1729 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1731 if (can_create_pseudo_p ())
1732 return force_reg (mode, value);
1733 else
1735 x = aarch64_emit_move (x, value);
1736 return x;
1741 static rtx
1742 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1744 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1746 rtx high;
1747 /* Load the full offset into a register. This
1748 might be improvable in the future. */
1749 high = GEN_INT (offset);
1750 offset = 0;
1751 high = aarch64_force_temporary (mode, temp, high);
1752 reg = aarch64_force_temporary (mode, temp,
1753 gen_rtx_PLUS (mode, high, reg));
1755 return plus_constant (mode, reg, offset);
1758 static int
1759 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1760 machine_mode mode)
1762 int i;
1763 unsigned HOST_WIDE_INT val, val2, mask;
1764 int one_match, zero_match;
1765 int num_insns;
1767 val = INTVAL (imm);
1769 if (aarch64_move_imm (val, mode))
1771 if (generate)
1772 emit_insn (gen_rtx_SET (dest, imm));
1773 return 1;
1776 if ((val >> 32) == 0 || mode == SImode)
1778 if (generate)
1780 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1781 if (mode == SImode)
1782 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1783 GEN_INT ((val >> 16) & 0xffff)));
1784 else
1785 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1786 GEN_INT ((val >> 16) & 0xffff)));
1788 return 2;
1791 /* Remaining cases are all for DImode. */
1793 mask = 0xffff;
1794 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1795 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1796 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1797 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1799 if (zero_match != 2 && one_match != 2)
1801 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1802 For a 64-bit bitmask try whether changing 16 bits to all ones or
1803 zeroes creates a valid bitmask. To check any repeated bitmask,
1804 try using 16 bits from the other 32-bit half of val. */
1806 for (i = 0; i < 64; i += 16, mask <<= 16)
1808 val2 = val & ~mask;
1809 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1810 break;
1811 val2 = val | mask;
1812 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1813 break;
1814 val2 = val2 & ~mask;
1815 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1816 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1817 break;
1819 if (i != 64)
1821 if (generate)
1823 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1824 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1825 GEN_INT ((val >> i) & 0xffff)));
1827 return 2;
1831 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1832 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1833 otherwise skip zero bits. */
1835 num_insns = 1;
1836 mask = 0xffff;
1837 val2 = one_match > zero_match ? ~val : val;
1838 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1840 if (generate)
1841 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1842 ? (val | ~(mask << i))
1843 : (val & (mask << i)))));
1844 for (i += 16; i < 64; i += 16)
1846 if ((val2 & (mask << i)) == 0)
1847 continue;
1848 if (generate)
1849 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1850 GEN_INT ((val >> i) & 0xffff)));
1851 num_insns ++;
1854 return num_insns;
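/* A worked example (destination register chosen arbitrarily): for the
   DImode constant 0x0000cafe0000beef two of the four 16-bit halfwords are
   zero, so the code above emits one MOV for the lowest non-zero halfword
   followed by one MOVK for the remaining one:

     mov   x0, #0xbeef
     movk  x0, #0xcafe, lsl #32

   giving a count of 2 instructions.  */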
1858 void
1859 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1861 machine_mode mode = GET_MODE (dest);
1863 gcc_assert (mode == SImode || mode == DImode);
1865 /* Check on what type of symbol it is. */
1866 if (GET_CODE (imm) == SYMBOL_REF
1867 || GET_CODE (imm) == LABEL_REF
1868 || GET_CODE (imm) == CONST)
1870 rtx mem, base, offset;
1871 enum aarch64_symbol_type sty;
1873 /* If we have (const (plus symbol offset)), separate out the offset
1874 before we start classifying the symbol. */
1875 split_const (imm, &base, &offset);
1877 sty = aarch64_classify_symbol (base, offset);
1878 switch (sty)
1880 case SYMBOL_FORCE_TO_MEM:
1881 if (offset != const0_rtx
1882 && targetm.cannot_force_const_mem (mode, imm))
1884 gcc_assert (can_create_pseudo_p ());
1885 base = aarch64_force_temporary (mode, dest, base);
1886 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1887 aarch64_emit_move (dest, base);
1888 return;
1891 mem = force_const_mem (ptr_mode, imm);
1892 gcc_assert (mem);
1894 /* If we aren't generating PC relative literals, then
1895 we need to expand the literal pool access carefully.
1896 This is something that needs to be done in a number
1897 of places, so could well live as a separate function. */
1898 if (!aarch64_pcrelative_literal_loads)
1900 gcc_assert (can_create_pseudo_p ());
1901 base = gen_reg_rtx (ptr_mode);
1902 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1903 mem = gen_rtx_MEM (ptr_mode, base);
1906 if (mode != ptr_mode)
1907 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1909 emit_insn (gen_rtx_SET (dest, mem));
1911 return;
1913 case SYMBOL_SMALL_TLSGD:
1914 case SYMBOL_SMALL_TLSDESC:
1915 case SYMBOL_SMALL_TLSIE:
1916 case SYMBOL_SMALL_GOT_28K:
1917 case SYMBOL_SMALL_GOT_4G:
1918 case SYMBOL_TINY_GOT:
1919 case SYMBOL_TINY_TLSIE:
1920 if (offset != const0_rtx)
1922 gcc_assert(can_create_pseudo_p ());
1923 base = aarch64_force_temporary (mode, dest, base);
1924 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1925 aarch64_emit_move (dest, base);
1926 return;
1928 /* FALLTHRU */
1930 case SYMBOL_SMALL_ABSOLUTE:
1931 case SYMBOL_TINY_ABSOLUTE:
1932 case SYMBOL_TLSLE12:
1933 case SYMBOL_TLSLE24:
1934 case SYMBOL_TLSLE32:
1935 case SYMBOL_TLSLE48:
1936 aarch64_load_symref_appropriately (dest, imm, sty);
1937 return;
1939 default:
1940 gcc_unreachable ();
1944 if (!CONST_INT_P (imm))
1946 if (GET_CODE (imm) == HIGH)
1947 emit_insn (gen_rtx_SET (dest, imm));
1948 else
1950 rtx mem = force_const_mem (mode, imm);
1951 gcc_assert (mem);
1952 emit_insn (gen_rtx_SET (dest, mem));
1955 return;
1958 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1961 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold
1962 an intermediate value if necessary.
1964 This function is sometimes used to adjust the stack pointer, so we must
1965 ensure that it can never cause transient stack deallocation by writing an
1966 invalid value into REGNUM. */
1968 static void
1969 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
1970 HOST_WIDE_INT delta, bool frame_related_p)
1972 HOST_WIDE_INT mdelta = abs_hwi (delta);
1973 rtx this_rtx = gen_rtx_REG (mode, regnum);
1974 rtx_insn *insn;
1976 /* Do nothing if mdelta is zero. */
1977 if (!mdelta)
1978 return;
1980 /* We only need a single instruction if the offset fits into add/sub. */
1981 if (aarch64_uimm12_shift (mdelta))
1983 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
1984 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1985 return;
1988 /* We need two add/sub instructions, each one performing part of the
1989 calculation. Don't do this if the addend can be loaded into a register
1990 with a single instruction; in that case we prefer a move to a scratch
1991 register followed by an addition. */
1992 if (mdelta < 0x1000000 && !aarch64_move_imm (delta, mode))
1994 HOST_WIDE_INT low_off = mdelta & 0xfff;
1996 low_off = delta < 0 ? -low_off : low_off;
1997 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
1998 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1999 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2000 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2001 return;
2004 /* Otherwise use generic function to handle all other situations. */
2005 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2006 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode);
2007 insn = emit_insn (gen_add2_insn (this_rtx, scratch_rtx));
2008 if (frame_related_p)
2010 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2011 rtx adj = plus_constant (mode, this_rtx, delta);
2012 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
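/* A worked example: DELTA == 0x123456 does not fit a (possibly shifted)
   12-bit immediate and cannot be built with a single MOV, but it is below
   0x1000000, so the two-addition path above is used:

     add  x0, x0, #0x456
     add  x0, x0, #0x123, lsl #12

   (register chosen arbitrarily).  Because both additions have the same
   sign, every intermediate value of the register stays between the old and
   new values, which is what keeps stack-pointer adjustments safe.  */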
2016 static bool
2017 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2018 tree exp ATTRIBUTE_UNUSED)
2020 /* Currently, always true. */
2021 return true;
2024 /* Implement TARGET_PASS_BY_REFERENCE. */
2026 static bool
2027 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2028 machine_mode mode,
2029 const_tree type,
2030 bool named ATTRIBUTE_UNUSED)
2032 HOST_WIDE_INT size;
2033 machine_mode dummymode;
2034 int nregs;
2036 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2037 size = (mode == BLKmode && type)
2038 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2040 /* Aggregates are passed by reference based on their size. */
2041 if (type && AGGREGATE_TYPE_P (type))
2043 size = int_size_in_bytes (type);
2046 /* Variable sized arguments are always passed by reference. */
2047 if (size < 0)
2048 return true;
2050 /* Can this be a candidate to be passed in fp/simd register(s)? */
2051 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2052 &dummymode, &nregs,
2053 NULL))
2054 return false;
2056 /* Arguments which are variable sized or larger than 2 registers are
2057      passed by reference unless they are a homogeneous floating-point
2058 aggregate. */
2059 return size > 2 * UNITS_PER_WORD;
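/* For example: a struct of four doubles (32 bytes) is an HFA and hence a
   SIMD/FP candidate, so the function above returns false and it is passed by
   value in vector registers despite being larger than two GP registers; a
   struct of three 64-bit integers (24 bytes) is not a candidate and exceeds
   2 * UNITS_PER_WORD, so it is passed by reference.  */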
2062 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2063 static bool
2064 aarch64_return_in_msb (const_tree valtype)
2066 machine_mode dummy_mode;
2067 int dummy_int;
2069 /* Never happens in little-endian mode. */
2070 if (!BYTES_BIG_ENDIAN)
2071 return false;
2073 /* Only composite types smaller than or equal to 16 bytes can
2074 be potentially returned in registers. */
2075 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2076 || int_size_in_bytes (valtype) <= 0
2077 || int_size_in_bytes (valtype) > 16)
2078 return false;
2080 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2081 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2082 is always passed/returned in the least significant bits of fp/simd
2083 register(s). */
2084 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2085 &dummy_mode, &dummy_int, NULL))
2086 return false;
2088 return true;
2091 /* Implement TARGET_FUNCTION_VALUE.
2092 Define how to find the value returned by a function. */
2094 static rtx
2095 aarch64_function_value (const_tree type, const_tree func,
2096 bool outgoing ATTRIBUTE_UNUSED)
2098 machine_mode mode;
2099 int unsignedp;
2100 int count;
2101 machine_mode ag_mode;
2103 mode = TYPE_MODE (type);
2104 if (INTEGRAL_TYPE_P (type))
2105 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2107 if (aarch64_return_in_msb (type))
2109 HOST_WIDE_INT size = int_size_in_bytes (type);
2111 if (size % UNITS_PER_WORD != 0)
2113 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2114 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2118 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2119 &ag_mode, &count, NULL))
2121 if (!aarch64_composite_type_p (type, mode))
2123 gcc_assert (count == 1 && mode == ag_mode);
2124 return gen_rtx_REG (mode, V0_REGNUM);
2126 else
2128 int i;
2129 rtx par;
2131 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2132 for (i = 0; i < count; i++)
2134 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2135 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2136 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2137 XVECEXP (par, 0, i) = tmp;
2139 return par;
2142 else
2143 return gen_rtx_REG (mode, R0_REGNUM);
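/* For instance, a return type of "struct { float a, b; }" is an HFA with
   count == 2 and ag_mode == SFmode, so the composite branch above builds a
   PARALLEL of (reg:SF v0) at offset 0 and (reg:SF v1) at offset 4, while a
   plain "long" is promoted and returned directly in x0.  */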
2146 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2147 Return true if REGNO is the number of a hard register in which the values
2148 of called function may come back. */
2150 static bool
2151 aarch64_function_value_regno_p (const unsigned int regno)
2153   /* A maximum of 16 bytes can be returned in the general registers.  Examples
2154 of 16-byte return values are: 128-bit integers and 16-byte small
2155 structures (excluding homogeneous floating-point aggregates). */
2156 if (regno == R0_REGNUM || regno == R1_REGNUM)
2157 return true;
2159 /* Up to four fp/simd registers can return a function value, e.g. a
2160 homogeneous floating-point aggregate having four members. */
2161 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2162 return TARGET_FLOAT;
2164 return false;
2167 /* Implement TARGET_RETURN_IN_MEMORY.
2169 If the type T of the result of a function is such that
2170 void func (T arg)
2171 would require that arg be passed as a value in a register (or set of
2172 registers) according to the parameter passing rules, then the result
2173 is returned in the same registers as would be used for such an
2174 argument. */
2176 static bool
2177 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2179 HOST_WIDE_INT size;
2180 machine_mode ag_mode;
2181 int count;
2183 if (!AGGREGATE_TYPE_P (type)
2184 && TREE_CODE (type) != COMPLEX_TYPE
2185 && TREE_CODE (type) != VECTOR_TYPE)
2186     /* Simple scalar types are always returned in registers.  */
2187 return false;
2189 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2190 type,
2191 &ag_mode,
2192 &count,
2193 NULL))
2194 return false;
2196   /* Types larger than 2 registers are returned in memory.  */
2197 size = int_size_in_bytes (type);
2198 return (size < 0 || size > 2 * UNITS_PER_WORD);
2201 static bool
2202 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2203 const_tree type, int *nregs)
2205 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2206 return aarch64_vfp_is_call_or_return_candidate (mode,
2207 type,
2208 &pcum->aapcs_vfp_rmode,
2209 nregs,
2210 NULL);
2213 /* Given MODE and TYPE of a function argument, return the alignment in
2214 bits. The idea is to suppress any stronger alignment requested by
2215 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2216 This is a helper function for local use only. */
2218 static unsigned int
2219 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2221 if (!type)
2222 return GET_MODE_ALIGNMENT (mode);
2223 if (integer_zerop (TYPE_SIZE (type)))
2224 return 0;
2226 gcc_assert (TYPE_MODE (type) == mode);
2228 if (!AGGREGATE_TYPE_P (type))
2229 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2231 if (TREE_CODE (type) == ARRAY_TYPE)
2232 return TYPE_ALIGN (TREE_TYPE (type));
2234 unsigned int alignment = 0;
2236 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2237 alignment = std::max (alignment, DECL_ALIGN (field));
2239 return alignment;
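/* For example, "struct { int a; double b; }" yields 64 here (the alignment of
   the double member), whereas a type containing an __int128 member yields
   128; the latter is the 16-byte-alignment case that triggers rule C.8 in
   aarch64_layout_arg below.  */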
2242 /* Layout a function argument according to the AAPCS64 rules. The rule
2243 numbers refer to the rule numbers in the AAPCS64. */
2245 static void
2246 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2247 const_tree type,
2248 bool named ATTRIBUTE_UNUSED)
2250 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2251 int ncrn, nvrn, nregs;
2252 bool allocate_ncrn, allocate_nvrn;
2253 HOST_WIDE_INT size;
2255 /* We need to do this once per argument. */
2256 if (pcum->aapcs_arg_processed)
2257 return;
2259 pcum->aapcs_arg_processed = true;
2261 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2262 size
2263 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2264 UNITS_PER_WORD);
2266 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2267 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2268 mode,
2269 type,
2270 &nregs);
2272   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2273 The following code thus handles passing by SIMD/FP registers first. */
2275 nvrn = pcum->aapcs_nvrn;
2277   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2278      and homogeneous short-vector aggregates (HVA).  */
2279 if (allocate_nvrn)
2281 if (!TARGET_FLOAT)
2282 aarch64_err_no_fpadvsimd (mode, "argument");
2284 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2286 pcum->aapcs_nextnvrn = nvrn + nregs;
2287 if (!aarch64_composite_type_p (type, mode))
2289 gcc_assert (nregs == 1);
2290 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2292 else
2294 rtx par;
2295 int i;
2296 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2297 for (i = 0; i < nregs; i++)
2299 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2300 V0_REGNUM + nvrn + i);
2301 tmp = gen_rtx_EXPR_LIST
2302 (VOIDmode, tmp,
2303 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2304 XVECEXP (par, 0, i) = tmp;
2306 pcum->aapcs_reg = par;
2308 return;
2310 else
2312 /* C.3 NSRN is set to 8. */
2313 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2314 goto on_stack;
2318 ncrn = pcum->aapcs_ncrn;
2319 nregs = size / UNITS_PER_WORD;
2321   /* C6 - C9, though the sign and zero extension semantics are
2322      handled elsewhere.  This is the case where the argument fits
2323      entirely in general registers.  */
2324 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2326 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2328 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2330 /* C.8 if the argument has an alignment of 16 then the NGRN is
2331 rounded up to the next even number. */
2332 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2334 ++ncrn;
2335 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2337 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2338 A reg is still generated for it, but the caller should be smart
2339 enough not to use it. */
2340 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2342 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2344 else
2346 rtx par;
2347 int i;
2349 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2350 for (i = 0; i < nregs; i++)
2352 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2353 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2354 GEN_INT (i * UNITS_PER_WORD));
2355 XVECEXP (par, 0, i) = tmp;
2357 pcum->aapcs_reg = par;
2360 pcum->aapcs_nextncrn = ncrn + nregs;
2361 return;
2364 /* C.11 */
2365 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2367   /* The argument is passed on the stack; record the needed number of words for
2368 this argument and align the total size if necessary. */
2369 on_stack:
2370 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2371 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2372 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2373 16 / UNITS_PER_WORD);
2374 return;
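/* As an example of rule C.8 above: for a call such as
   "void f (int a, __int128 b)" the int occupies w0, leaving NGRN == 1; since
   __int128 requires 16-byte alignment, NGRN is rounded up to 2, so B is
   passed in the register pair x2/x3 and x1 is left unused.  */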
2377 /* Implement TARGET_FUNCTION_ARG. */
2379 static rtx
2380 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2381 const_tree type, bool named)
2383 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2384 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2386 if (mode == VOIDmode)
2387 return NULL_RTX;
2389 aarch64_layout_arg (pcum_v, mode, type, named);
2390 return pcum->aapcs_reg;
2393 void
2394 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2395 const_tree fntype ATTRIBUTE_UNUSED,
2396 rtx libname ATTRIBUTE_UNUSED,
2397 const_tree fndecl ATTRIBUTE_UNUSED,
2398 unsigned n_named ATTRIBUTE_UNUSED)
2400 pcum->aapcs_ncrn = 0;
2401 pcum->aapcs_nvrn = 0;
2402 pcum->aapcs_nextncrn = 0;
2403 pcum->aapcs_nextnvrn = 0;
2404 pcum->pcs_variant = ARM_PCS_AAPCS64;
2405 pcum->aapcs_reg = NULL_RTX;
2406 pcum->aapcs_arg_processed = false;
2407 pcum->aapcs_stack_words = 0;
2408 pcum->aapcs_stack_size = 0;
2410 if (!TARGET_FLOAT
2411 && fndecl && TREE_PUBLIC (fndecl)
2412 && fntype && fntype != error_mark_node)
2414 const_tree type = TREE_TYPE (fntype);
2415 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2416 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2417 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2418 &mode, &nregs, NULL))
2419 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2421 return;
2424 static void
2425 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2426 machine_mode mode,
2427 const_tree type,
2428 bool named)
2430 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2431 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2433 aarch64_layout_arg (pcum_v, mode, type, named);
2434 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2435 != (pcum->aapcs_stack_words != 0));
2436 pcum->aapcs_arg_processed = false;
2437 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2438 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2439 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2440 pcum->aapcs_stack_words = 0;
2441 pcum->aapcs_reg = NULL_RTX;
2445 bool
2446 aarch64_function_arg_regno_p (unsigned regno)
2448 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2449 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2452 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2453 PARM_BOUNDARY bits of alignment, but will be given anything up
2454 to STACK_BOUNDARY bits if the type requires it. This makes sure
2455 that both before and after the layout of each argument, the Next
2456 Stacked Argument Address (NSAA) will have a minimum alignment of
2457 8 bytes. */
2459 static unsigned int
2460 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2462 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2464 if (alignment < PARM_BOUNDARY)
2465 alignment = PARM_BOUNDARY;
2466 if (alignment > STACK_BOUNDARY)
2467 alignment = STACK_BOUNDARY;
2468 return alignment;
2471 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2473 Return true if an argument passed on the stack should be padded upwards,
2474 i.e. if the least-significant byte of the stack slot has useful data.
2476 Small aggregate types are placed in the lowest memory address.
2478 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2480 bool
2481 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2483 /* On little-endian targets, the least significant byte of every stack
2484 argument is passed at the lowest byte address of the stack slot. */
2485 if (!BYTES_BIG_ENDIAN)
2486 return true;
2488 /* Otherwise, integral, floating-point and pointer types are padded downward:
2489 the least significant byte of a stack argument is passed at the highest
2490 byte address of the stack slot. */
2491 if (type
2492 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2493 || POINTER_TYPE_P (type))
2494 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2495 return false;
2497 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2498 return true;
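/* For example, on big-endian a 3-byte struct passed on the stack is padded
   upward (its bytes occupy the low addresses of the 8-byte slot), whereas a
   stack-passed "int" is padded downward (its 4 bytes occupy the high
   addresses of the slot).  On little-endian everything is padded upward.  */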
2501 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2503 It specifies padding for the last (may also be the only)
2504    element of a block move between registers and memory.  Assuming
2505    the block is in memory, padding upward means that
2506    the last element is padded after its most significant byte,
2507    while with downward padding the last element is padded at
2508    its least significant byte side.
2510 Small aggregates and small complex types are always padded
2511 upwards.
2513 We don't need to worry about homogeneous floating-point or
2514 short-vector aggregates; their move is not affected by the
2515 padding direction determined here. Regardless of endianness,
2516 each element of such an aggregate is put in the least
2517 significant bits of a fp/simd register.
2519 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2520 register has useful data, and return the opposite if the most
2521 significant byte does. */
2523 bool
2524 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2525 bool first ATTRIBUTE_UNUSED)
2528 /* Small composite types are always padded upward. */
2529 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2531 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2532 : GET_MODE_SIZE (mode));
2533 if (size < 2 * UNITS_PER_WORD)
2534 return true;
2537 /* Otherwise, use the default padding. */
2538 return !BYTES_BIG_ENDIAN;
2541 static machine_mode
2542 aarch64_libgcc_cmp_return_mode (void)
2544 return SImode;
2547 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2549 /* We use the 12-bit shifted immediate arithmetic instructions so values
2550    must be a multiple of (1 << 12), i.e. 4096.  */
2551 #define ARITH_FACTOR 4096
2553 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2554 #error Cannot use simple address calculation for stack probing
2555 #endif
2557 /* The pair of scratch registers used for stack probing. */
2558 #define PROBE_STACK_FIRST_REG 9
2559 #define PROBE_STACK_SECOND_REG 10
2561 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2562 inclusive. These are offsets from the current stack pointer. */
2564 static void
2565 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2567 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2569 /* See the same assertion on PROBE_INTERVAL above. */
2570 gcc_assert ((first % ARITH_FACTOR) == 0);
2572 /* See if we have a constant small number of probes to generate. If so,
2573 that's the easy case. */
2574 if (size <= PROBE_INTERVAL)
2576 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2578 emit_set_insn (reg1,
2579 plus_constant (ptr_mode,
2580 stack_pointer_rtx, -(first + base)));
2581 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2584 /* The run-time loop is made up of 8 insns in the generic case while the
2585      compile-time loop is made up of 4+2*(n-2) insns for n intervals.  */
2586 else if (size <= 4 * PROBE_INTERVAL)
2588 HOST_WIDE_INT i, rem;
2590 emit_set_insn (reg1,
2591 plus_constant (ptr_mode,
2592 stack_pointer_rtx,
2593 -(first + PROBE_INTERVAL)));
2594 emit_stack_probe (reg1);
2596 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2597 it exceeds SIZE. If only two probes are needed, this will not
2598 generate any code. Then probe at FIRST + SIZE. */
2599 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2601 emit_set_insn (reg1,
2602 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2603 emit_stack_probe (reg1);
2606 rem = size - (i - PROBE_INTERVAL);
2607 if (rem > 256)
2609 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2611 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2612 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2614 else
2615 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2618 /* Otherwise, do the same as above, but in a loop. Note that we must be
2619 extra careful with variables wrapping around because we might be at
2620 the very top (or the very bottom) of the address space and we have
2621 to be able to handle this case properly; in particular, we use an
2622 equality test for the loop condition. */
2623 else
2625 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2627 /* Step 1: round SIZE to the previous multiple of the interval. */
2629 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2632 /* Step 2: compute initial and final value of the loop counter. */
2634 /* TEST_ADDR = SP + FIRST. */
2635 emit_set_insn (reg1,
2636 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2638 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2639 emit_set_insn (reg2,
2640 plus_constant (ptr_mode, stack_pointer_rtx,
2641 -(first + rounded_size)));
2644 /* Step 3: the loop
2648 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2649 probe at TEST_ADDR
2651 while (TEST_ADDR != LAST_ADDR)
2653 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2654 until it is equal to ROUNDED_SIZE. */
2656 if (ptr_mode == DImode)
2657 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2658 else
2659 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2662 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2663 that SIZE is equal to ROUNDED_SIZE. */
2665 if (size != rounded_size)
2667 HOST_WIDE_INT rem = size - rounded_size;
2669 if (rem > 256)
2671 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2673 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2674 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2676 else
2677 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2681 /* Make sure nothing is scheduled before we are done. */
2682 emit_insn (gen_blockage ());
2685 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2686 absolute addresses. */
2688 const char *
2689 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2691 static int labelno = 0;
2692 char loop_lab[32];
2693 rtx xops[2];
2695 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2697 /* Loop. */
2698 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2700 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2701 xops[0] = reg1;
2702 xops[1] = GEN_INT (PROBE_INTERVAL);
2703 output_asm_insn ("sub\t%0, %0, %1", xops);
2705 /* Probe at TEST_ADDR. */
2706 output_asm_insn ("str\txzr, [%0]", xops);
2708 /* Test if TEST_ADDR == LAST_ADDR. */
2709 xops[1] = reg2;
2710 output_asm_insn ("cmp\t%0, %1", xops);
2712 /* Branch. */
2713 fputs ("\tb.ne\t", asm_out_file);
2714 assemble_name_raw (asm_out_file, loop_lab);
2715 fputc ('\n', asm_out_file);
2717 return "";
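/* Assuming the default PROBE_INTERVAL of 4096 and the x9/x10 scratch pair
   defined above, the routine therefore emits a probe loop of roughly this
   form:

   .LPSRL0:
       sub     x9, x9, #4096
       str     xzr, [x9]
       cmp     x9, x10
       b.ne    .LPSRL0
   */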
2720 static bool
2721 aarch64_frame_pointer_required (void)
2723 /* In aarch64_override_options_after_change
2724 flag_omit_leaf_frame_pointer turns off the frame pointer by
2725 default. Turn it back on now if we've not got a leaf
2726 function. */
2727 if (flag_omit_leaf_frame_pointer
2728 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2729 return true;
2731 return false;
2734 /* Mark the registers that need to be saved by the callee and calculate
2735 the size of the callee-saved registers area and frame record (both FP
2736 and LR may be omitted). */
2737 static void
2738 aarch64_layout_frame (void)
2740 HOST_WIDE_INT offset = 0;
2741 int regno;
2743 if (reload_completed && cfun->machine->frame.laid_out)
2744 return;
2746 #define SLOT_NOT_REQUIRED (-2)
2747 #define SLOT_REQUIRED (-1)
2749 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2750 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2752 /* First mark all the registers that really need to be saved... */
2753 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2754 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2756 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2757 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2759 /* ... that includes the eh data registers (if needed)... */
2760 if (crtl->calls_eh_return)
2761 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2762 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2763 = SLOT_REQUIRED;
2765 /* ... and any callee saved register that dataflow says is live. */
2766 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2767 if (df_regs_ever_live_p (regno)
2768 && (regno == R30_REGNUM
2769 || !call_used_regs[regno]))
2770 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2772 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2773 if (df_regs_ever_live_p (regno)
2774 && !call_used_regs[regno])
2775 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2777 if (frame_pointer_needed)
2779 /* FP and LR are placed in the linkage record. */
2780 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2781 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2782 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2783 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2784 offset += 2 * UNITS_PER_WORD;
2787 /* Now assign stack slots for them. */
2788 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2789 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2791 cfun->machine->frame.reg_offset[regno] = offset;
2792 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2793 cfun->machine->frame.wb_candidate1 = regno;
2794 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2795 cfun->machine->frame.wb_candidate2 = regno;
2796 offset += UNITS_PER_WORD;
2799 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2800 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2802 cfun->machine->frame.reg_offset[regno] = offset;
2803 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2804 cfun->machine->frame.wb_candidate1 = regno;
2805 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2806 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2807 cfun->machine->frame.wb_candidate2 = regno;
2808 offset += UNITS_PER_WORD;
2811 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2813 cfun->machine->frame.saved_regs_size = offset;
2815 HOST_WIDE_INT varargs_and_saved_regs_size
2816 = offset + cfun->machine->frame.saved_varargs_size;
2818 cfun->machine->frame.hard_fp_offset
2819 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2820 STACK_BOUNDARY / BITS_PER_UNIT);
2822 cfun->machine->frame.frame_size
2823 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2824 + crtl->outgoing_args_size,
2825 STACK_BOUNDARY / BITS_PER_UNIT);
2827 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2829 cfun->machine->frame.initial_adjust = 0;
2830 cfun->machine->frame.final_adjust = 0;
2831 cfun->machine->frame.callee_adjust = 0;
2832 cfun->machine->frame.callee_offset = 0;
2834 HOST_WIDE_INT max_push_offset = 0;
2835 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2836 max_push_offset = 512;
2837 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2838 max_push_offset = 256;
2840 if (cfun->machine->frame.frame_size < max_push_offset
2841 && crtl->outgoing_args_size == 0)
2843 /* Simple, small frame with no outgoing arguments:
2844 stp reg1, reg2, [sp, -frame_size]!
2845 stp reg3, reg4, [sp, 16] */
2846 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2848 else if ((crtl->outgoing_args_size
2849 + cfun->machine->frame.saved_regs_size < 512)
2850 && !(cfun->calls_alloca
2851 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2853 /* Frame with small outgoing arguments:
2854 sub sp, sp, frame_size
2855 stp reg1, reg2, [sp, outgoing_args_size]
2856 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2857 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2858 cfun->machine->frame.callee_offset
2859 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2861 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2863 /* Frame with large outgoing arguments but a small local area:
2864 stp reg1, reg2, [sp, -hard_fp_offset]!
2865 stp reg3, reg4, [sp, 16]
2866 sub sp, sp, outgoing_args_size */
2867 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2868 cfun->machine->frame.final_adjust
2869 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2871 else if (!frame_pointer_needed
2872 && varargs_and_saved_regs_size < max_push_offset)
2874 /* Frame with large local area and outgoing arguments (this pushes the
2875 callee-saves first, followed by the locals and outgoing area):
2876 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2877 stp reg3, reg4, [sp, 16]
2878 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2879 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2880 cfun->machine->frame.final_adjust
2881 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2882 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2883 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2885 else
2887 /* Frame with large local area and outgoing arguments using frame pointer:
2888 sub sp, sp, hard_fp_offset
2889 stp x29, x30, [sp, 0]
2890 add x29, sp, 0
2891 stp reg3, reg4, [sp, 16]
2892 sub sp, sp, outgoing_args_size */
2893 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2894 cfun->machine->frame.final_adjust
2895 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2898 cfun->machine->frame.laid_out = true;
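/* A concrete instance of the first ("simple, small frame") case above: a
   function that only needs to save x29/x30 and has 16 bytes of locals and no
   outgoing arguments ends up with frame_size == 32 < 512, so callee_adjust is
   set to 32 and the whole stack adjustment is folded into a single
       stp x29, x30, [sp, -32]!
   with no separate "sub sp, sp, ...".  */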
2901 static bool
2902 aarch64_register_saved_on_entry (int regno)
2904 return cfun->machine->frame.reg_offset[regno] >= 0;
2907 static unsigned
2908 aarch64_next_callee_save (unsigned regno, unsigned limit)
2910 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2911 regno ++;
2912 return regno;
2915 static void
2916 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2917 HOST_WIDE_INT adjustment)
2919 rtx base_rtx = stack_pointer_rtx;
2920 rtx insn, reg, mem;
2922 reg = gen_rtx_REG (mode, regno);
2923 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2924 plus_constant (Pmode, base_rtx, -adjustment));
2925 mem = gen_rtx_MEM (mode, mem);
2927 insn = emit_move_insn (mem, reg);
2928 RTX_FRAME_RELATED_P (insn) = 1;
2931 static rtx
2932 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2933 HOST_WIDE_INT adjustment)
2935 switch (mode)
2937 case DImode:
2938 return gen_storewb_pairdi_di (base, base, reg, reg2,
2939 GEN_INT (-adjustment),
2940 GEN_INT (UNITS_PER_WORD - adjustment));
2941 case DFmode:
2942 return gen_storewb_pairdf_di (base, base, reg, reg2,
2943 GEN_INT (-adjustment),
2944 GEN_INT (UNITS_PER_WORD - adjustment));
2945 default:
2946 gcc_unreachable ();
2950 static void
2951 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
2953 rtx_insn *insn;
2954 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
2956 if (regno2 == INVALID_REGNUM)
2957 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
2959 rtx reg1 = gen_rtx_REG (mode, regno1);
2960 rtx reg2 = gen_rtx_REG (mode, regno2);
2962 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2963 reg2, adjustment));
2964 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2965 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2966 RTX_FRAME_RELATED_P (insn) = 1;
2969 static rtx
2970 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2971 HOST_WIDE_INT adjustment)
2973 switch (mode)
2975 case DImode:
2976 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2977 GEN_INT (UNITS_PER_WORD));
2978 case DFmode:
2979 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2980 GEN_INT (UNITS_PER_WORD));
2981 default:
2982 gcc_unreachable ();
2986 static void
2987 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
2988 rtx *cfi_ops)
2990 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
2991 rtx reg1 = gen_rtx_REG (mode, regno1);
2993 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
2995 if (regno2 == INVALID_REGNUM)
2997 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
2998 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2999 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3001 else
3003 rtx reg2 = gen_rtx_REG (mode, regno2);
3004 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3005 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3006 reg2, adjustment));
3010 static rtx
3011 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3012 rtx reg2)
3014 switch (mode)
3016 case DImode:
3017 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3019 case DFmode:
3020 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3022 default:
3023 gcc_unreachable ();
3027 static rtx
3028 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3029 rtx mem2)
3031 switch (mode)
3033 case DImode:
3034 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3036 case DFmode:
3037 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3039 default:
3040 gcc_unreachable ();
3045 static void
3046 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3047 unsigned start, unsigned limit, bool skip_wb)
3049 rtx_insn *insn;
3050 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3051 ? gen_frame_mem : gen_rtx_MEM);
3052 unsigned regno;
3053 unsigned regno2;
3055 for (regno = aarch64_next_callee_save (start, limit);
3056 regno <= limit;
3057 regno = aarch64_next_callee_save (regno + 1, limit))
3059 rtx reg, mem;
3060 HOST_WIDE_INT offset;
3062 if (skip_wb
3063 && (regno == cfun->machine->frame.wb_candidate1
3064 || regno == cfun->machine->frame.wb_candidate2))
3065 continue;
3067 reg = gen_rtx_REG (mode, regno);
3068 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3069 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3070 offset));
3072 regno2 = aarch64_next_callee_save (regno + 1, limit);
3074 if (regno2 <= limit
3075 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3076 == cfun->machine->frame.reg_offset[regno2]))
3079 rtx reg2 = gen_rtx_REG (mode, regno2);
3080 rtx mem2;
3082 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3083 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3084 offset));
3085 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3086 reg2));
3088 /* The first part of a frame-related parallel insn is
3089 always assumed to be relevant to the frame
3090 	     calculations; subsequent parts are only
3091 frame-related if explicitly marked. */
3092 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3093 regno = regno2;
3095 else
3096 insn = emit_move_insn (mem, reg);
3098 RTX_FRAME_RELATED_P (insn) = 1;
3102 static void
3103 aarch64_restore_callee_saves (machine_mode mode,
3104 HOST_WIDE_INT start_offset, unsigned start,
3105 unsigned limit, bool skip_wb, rtx *cfi_ops)
3107 rtx base_rtx = stack_pointer_rtx;
3108 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3109 ? gen_frame_mem : gen_rtx_MEM);
3110 unsigned regno;
3111 unsigned regno2;
3112 HOST_WIDE_INT offset;
3114 for (regno = aarch64_next_callee_save (start, limit);
3115 regno <= limit;
3116 regno = aarch64_next_callee_save (regno + 1, limit))
3118 rtx reg, mem;
3120 if (skip_wb
3121 && (regno == cfun->machine->frame.wb_candidate1
3122 || regno == cfun->machine->frame.wb_candidate2))
3123 continue;
3125 reg = gen_rtx_REG (mode, regno);
3126 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3127 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3129 regno2 = aarch64_next_callee_save (regno + 1, limit);
3131 if (regno2 <= limit
3132 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3133 == cfun->machine->frame.reg_offset[regno2]))
3135 rtx reg2 = gen_rtx_REG (mode, regno2);
3136 rtx mem2;
3138 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3139 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3140 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3142 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3143 regno = regno2;
3145 else
3146 emit_move_insn (reg, mem);
3147 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3151 /* AArch64 stack frames generated by this compiler look like:
3153 +-------------------------------+
3155 | incoming stack arguments |
3157 +-------------------------------+
3158 | | <-- incoming stack pointer (aligned)
3159 | callee-allocated save area |
3160 | for register varargs |
3162 +-------------------------------+
3163 | local variables | <-- frame_pointer_rtx
3165 +-------------------------------+
3166 | padding0 | \
3167 +-------------------------------+ |
3168 | callee-saved registers | | frame.saved_regs_size
3169 +-------------------------------+ |
3170 | LR' | |
3171 +-------------------------------+ |
3172 | FP' | / <- hard_frame_pointer_rtx (aligned)
3173 +-------------------------------+
3174 | dynamic allocation |
3175 +-------------------------------+
3176 | padding |
3177 +-------------------------------+
3178 | outgoing stack arguments | <-- arg_pointer
3180 +-------------------------------+
3181 | | <-- stack_pointer_rtx (aligned)
3183 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3184 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3185 unchanged. */
3187 /* Generate the prologue instructions for entry into a function.
3188 Establish the stack frame by decreasing the stack pointer with a
3189 properly calculated size and, if necessary, create a frame record
3190 filled with the values of LR and previous frame pointer. The
3191 current FP is also set up if it is in use. */
3193 void
3194 aarch64_expand_prologue (void)
3196 aarch64_layout_frame ();
3198 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3199 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3200 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3201 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3202 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3203 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3204 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3205 rtx_insn *insn;
3207 if (flag_stack_usage_info)
3208 current_function_static_stack_size = frame_size;
3210 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3212 if (crtl->is_leaf && !cfun->calls_alloca)
3214 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3215 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3216 frame_size - STACK_CHECK_PROTECT);
3218 else if (frame_size > 0)
3219 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3222 aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -initial_adjust, true);
3224 if (callee_adjust != 0)
3225 aarch64_push_regs (reg1, reg2, callee_adjust);
3227 if (frame_pointer_needed)
3229 if (callee_adjust == 0)
3230 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3231 R30_REGNUM, false);
3232 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3233 stack_pointer_rtx,
3234 GEN_INT (callee_offset)));
3235 RTX_FRAME_RELATED_P (insn) = 1;
3236 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3239 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3240 callee_adjust != 0 || frame_pointer_needed);
3241 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3242 callee_adjust != 0 || frame_pointer_needed);
3243 aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, -final_adjust,
3244 !frame_pointer_needed);
3247 /* Return TRUE if we can use a simple_return insn.
3249 This function checks whether the callee saved stack is empty, which
3250    means no restore actions are needed.  The pro_and_epilogue pass uses
3251    this to check whether the shrink-wrapping optimization is feasible.  */
3253 bool
3254 aarch64_use_return_insn_p (void)
3256 if (!reload_completed)
3257 return false;
3259 if (crtl->profile)
3260 return false;
3262 aarch64_layout_frame ();
3264 return cfun->machine->frame.frame_size == 0;
3267 /* Generate the epilogue instructions for returning from a function.
3268 This is almost exactly the reverse of the prolog sequence, except
3269 that we need to insert barriers to avoid scheduling loads that read
3270 from a deallocated stack, and we optimize the unwind records by
3271 emitting them all together if possible. */
3272 void
3273 aarch64_expand_epilogue (bool for_sibcall)
3275 aarch64_layout_frame ();
3277 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3278 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3279 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3280 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3281 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3282 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3283 rtx cfi_ops = NULL;
3284 rtx_insn *insn;
3286   /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
3287 bool need_barrier_p = (get_frame_size ()
3288 + cfun->machine->frame.saved_varargs_size) != 0;
3290 /* Emit a barrier to prevent loads from a deallocated stack. */
3291 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca)
3293 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3294 need_barrier_p = false;
3297 /* Restore the stack pointer from the frame pointer if it may not
3298 be the same as the stack pointer. */
3299 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3301 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3302 hard_frame_pointer_rtx,
3303 GEN_INT (-callee_offset)));
3304 /* If writeback is used when restoring callee-saves, the CFA
3305 is restored on the instruction doing the writeback. */
3306 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3308 else
3309 aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, final_adjust, true);
3311 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3312 callee_adjust != 0, &cfi_ops);
3313 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3314 callee_adjust != 0, &cfi_ops);
3316 if (need_barrier_p)
3317 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3319 if (callee_adjust != 0)
3320 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3322 if (callee_adjust != 0 || initial_adjust > 65536)
3324 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3325 insn = get_last_insn ();
3326 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3327 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3328 RTX_FRAME_RELATED_P (insn) = 1;
3329 cfi_ops = NULL;
3332 aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, initial_adjust, true);
3334 if (cfi_ops)
3336 /* Emit delayed restores and reset the CFA to be SP. */
3337 insn = get_last_insn ();
3338 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3339 REG_NOTES (insn) = cfi_ops;
3340 RTX_FRAME_RELATED_P (insn) = 1;
3343 /* Stack adjustment for exception handler. */
3344 if (crtl->calls_eh_return)
3346 /* We need to unwind the stack by the offset computed by
3347 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3348 to be SP; letting the CFA move during this adjustment
3349 is just as correct as retaining the CFA from the body
3350 of the function. Therefore, do nothing special. */
3351 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3354 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3355 if (!for_sibcall)
3356 emit_jump_insn (ret_rtx);
3359 /* Return the place to copy the exception unwinding return address to.
3360    This will probably be a stack slot, but could (in theory) be the
3361    return register.  */
3363 aarch64_final_eh_return_addr (void)
3365 HOST_WIDE_INT fp_offset;
3367 aarch64_layout_frame ();
3369 fp_offset = cfun->machine->frame.frame_size
3370 - cfun->machine->frame.hard_fp_offset;
3372 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
3373 return gen_rtx_REG (DImode, LR_REGNUM);
3375 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
3376 result in a store to save LR introduced by builtin_eh_return () being
3377 incorrectly deleted because the alias is not detected.
3378 So in the calculation of the address to copy the exception unwinding
3379 return address to, we note 2 cases.
3380 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
3381 we return a SP-relative location since all the addresses are SP-relative
3382 in this case. This prevents the store from being optimized away.
3383 If the fp_offset is not 0, then the addresses will be FP-relative and
3384 therefore we return a FP-relative location. */
3386 if (frame_pointer_needed)
3388 if (fp_offset)
3389 return gen_frame_mem (DImode,
3390 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3391 else
3392 return gen_frame_mem (DImode,
3393 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
3396 /* If FP is not needed, we calculate the location of LR, which would be
3397 at the top of the saved registers block. */
3399 return gen_frame_mem (DImode,
3400 plus_constant (Pmode,
3401 stack_pointer_rtx,
3402 fp_offset
3403 + cfun->machine->frame.saved_regs_size
3404 - 2 * UNITS_PER_WORD));
3407 /* Output code to add DELTA to the first argument, and then jump
3408 to FUNCTION. Used for C++ multiple inheritance. */
3409 static void
3410 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3411 HOST_WIDE_INT delta,
3412 HOST_WIDE_INT vcall_offset,
3413 tree function)
3415 /* The this pointer is always in x0. Note that this differs from
3416      Arm where the this pointer may be bumped to r1 if r0 is required
3417 to return a pointer to an aggregate. On AArch64 a result value
3418 pointer will be in x8. */
3419 int this_regno = R0_REGNUM;
3420 rtx this_rtx, temp0, temp1, addr, funexp;
3421 rtx_insn *insn;
3423 reload_completed = 1;
3424 emit_note (NOTE_INSN_PROLOGUE_END);
3426 if (vcall_offset == 0)
3427 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta, false);
3428 else
3430 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3432 this_rtx = gen_rtx_REG (Pmode, this_regno);
3433 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3434 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3436 addr = this_rtx;
3437 if (delta != 0)
3439 if (delta >= -256 && delta < 256)
3440 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3441 plus_constant (Pmode, this_rtx, delta));
3442 else
3443 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta, false);
3446 if (Pmode == ptr_mode)
3447 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3448 else
3449 aarch64_emit_move (temp0,
3450 gen_rtx_ZERO_EXTEND (Pmode,
3451 gen_rtx_MEM (ptr_mode, addr)));
3453 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3454 addr = plus_constant (Pmode, temp0, vcall_offset);
3455 else
3457 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3458 Pmode);
3459 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3462 if (Pmode == ptr_mode)
3463 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3464 else
3465 aarch64_emit_move (temp1,
3466 gen_rtx_SIGN_EXTEND (Pmode,
3467 gen_rtx_MEM (ptr_mode, addr)));
3469 emit_insn (gen_add2_insn (this_rtx, temp1));
3472 /* Generate a tail call to the target function. */
3473 if (!TREE_USED (function))
3475 assemble_external (function);
3476 TREE_USED (function) = 1;
3478 funexp = XEXP (DECL_RTL (function), 0);
3479 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3480 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3481 SIBLING_CALL_P (insn) = 1;
3483 insn = get_insns ();
3484 shorten_branches (insn);
3485 final_start_function (insn, file, 1);
3486 final (insn, file, 1);
3487 final_end_function ();
3489 /* Stop pretending to be a post-reload pass. */
3490 reload_completed = 0;
3493 static bool
3494 aarch64_tls_referenced_p (rtx x)
3496 if (!TARGET_HAVE_TLS)
3497 return false;
3498 subrtx_iterator::array_type array;
3499 FOR_EACH_SUBRTX (iter, array, x, ALL)
3501 const_rtx x = *iter;
3502 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3503 return true;
3504 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3505 TLS offsets, not real symbol references. */
3506 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3507 iter.skip_subrtxes ();
3509 return false;
3513 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3514 a left shift of 0 or 12 bits. */
3515 bool
3516 aarch64_uimm12_shift (HOST_WIDE_INT val)
3518 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3519 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
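/* For example, 0xabc and 0xabc000 are both accepted by the predicate above
   (shift 0 and shift 12 respectively), while 0x1001 is rejected because its
   set bits straddle the two 12-bit fields.  */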
3524 /* Return true if val is an immediate that can be loaded into a
3525 register by a MOVZ instruction. */
3526 static bool
3527 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3529 if (GET_MODE_SIZE (mode) > 4)
3531 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3532 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3533 return 1;
3535 else
3537 /* Ignore sign extension. */
3538 val &= (HOST_WIDE_INT) 0xffffffff;
3540 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3541 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
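/* For example, 0x12340000 is a MOVZ immediate (0x1234 << 16), and for 64-bit
   modes so is 0xabcd00000000 (0xabcd << 32); 0x12345 is not, since its set
   bits do not fit within a single 16-bit field.  */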
3544 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3546 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3548 0x0000000100000001ull,
3549 0x0001000100010001ull,
3550 0x0101010101010101ull,
3551 0x1111111111111111ull,
3552 0x5555555555555555ull,
3556 /* Return true if val is a valid bitmask immediate. */
3558 bool
3559 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3561 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3562 int bits;
3564 /* Check for a single sequence of one bits and return quickly if so.
3565      The special cases of all ones and all zeroes return false.  */
3566 val = (unsigned HOST_WIDE_INT) val_in;
3567 tmp = val + (val & -val);
3569 if (tmp == (tmp & -tmp))
3570 return (val + 1) > 1;
3572 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3573 if (mode == SImode)
3574 val = (val << 32) | (val & 0xffffffff);
3576 /* Invert if the immediate doesn't start with a zero bit - this means we
3577 only need to search for sequences of one bits. */
3578 if (val & 1)
3579 val = ~val;
3581 /* Find the first set bit and set tmp to val with the first sequence of one
3582 bits removed. Return success if there is a single sequence of ones. */
3583 first_one = val & -val;
3584 tmp = val & (val + first_one);
3586 if (tmp == 0)
3587 return true;
3589 /* Find the next set bit and compute the difference in bit position. */
3590 next_one = tmp & -tmp;
3591 bits = clz_hwi (first_one) - clz_hwi (next_one);
3592 mask = val ^ tmp;
3594   /* Check that the bit position difference is a power of 2, and that the first
3595 sequence of one bits fits within 'bits' bits. */
3596 if ((mask >> bits) != 0 || bits != (bits & -bits))
3597 return false;
3599 /* Check the sequence of one bits is repeated 64/bits times. */
3600 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
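/* Worked examples for the predicate above: a single run of ones such as
   0x0000000000fff000 is accepted by the quick check at the top, while 0 and
   ~0 are rejected there.  A repeating pattern such as 0xff00ff00ff00ff00 is
   accepted by the general path: consecutive runs start 16 bits apart, the
   extracted run (mask == 0xff00) fits in 16 bits, and
   mask * bitmask_imm_mul[__builtin_clz (16) - 26], i.e.
   0xff00 * 0x0001000100010001ull, reproduces the value.  By contrast,
   0x00ff00ff00000000 fails the final comparison because the pattern does not
   repeat across all 64 bits.  */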
3604 /* Return true if val is an immediate that can be loaded into a
3605 register in a single instruction. */
3606 bool
3607 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3609 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3610 return 1;
3611 return aarch64_bitmask_imm (val, mode);
3614 static bool
3615 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3617 rtx base, offset;
3619 if (GET_CODE (x) == HIGH)
3620 return true;
3622 split_const (x, &base, &offset);
3623 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3625 if (aarch64_classify_symbol (base, offset)
3626 != SYMBOL_FORCE_TO_MEM)
3627 return true;
3628 else
3629 /* Avoid generating a 64-bit relocation in ILP32; leave
3630 to aarch64_expand_mov_immediate to handle it properly. */
3631 return mode != ptr_mode;
3634 return aarch64_tls_referenced_p (x);
3637 /* Implement TARGET_CASE_VALUES_THRESHOLD.
3638 The expansion for a table switch is quite expensive due to the number
3639    of instructions, the table lookup and the hard-to-predict indirect jump.
3640 When optimizing for speed, and -O3 enabled, use the per-core tuning if
3641 set, otherwise use tables for > 16 cases as a tradeoff between size and
3642 performance. When optimizing for size, use the default setting. */
3644 static unsigned int
3645 aarch64_case_values_threshold (void)
3647 /* Use the specified limit for the number of cases before using jump
3648 tables at higher optimization levels. */
3649 if (optimize > 2
3650 && selected_cpu->tune->max_case_values != 0)
3651 return selected_cpu->tune->max_case_values;
3652 else
3653 return optimize_size ? default_case_values_threshold () : 17;
3656 /* Return true if register REGNO is a valid index register.
3657 STRICT_P is true if REG_OK_STRICT is in effect. */
3659 bool
3660 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3662 if (!HARD_REGISTER_NUM_P (regno))
3664 if (!strict_p)
3665 return true;
3667 if (!reg_renumber)
3668 return false;
3670 regno = reg_renumber[regno];
3672 return GP_REGNUM_P (regno);
3675 /* Return true if register REGNO is a valid base register.
3676 STRICT_P is true if REG_OK_STRICT is in effect. */
3678 bool
3679 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3681 if (!HARD_REGISTER_NUM_P (regno))
3683 if (!strict_p)
3684 return true;
3686 if (!reg_renumber)
3687 return false;
3689 regno = reg_renumber[regno];
3692 /* The fake registers will be eliminated to either the stack or
3693 hard frame pointer, both of which are usually valid base registers.
3694 Reload deals with the cases where the eliminated form isn't valid. */
3695 return (GP_REGNUM_P (regno)
3696 || regno == SP_REGNUM
3697 || regno == FRAME_POINTER_REGNUM
3698 || regno == ARG_POINTER_REGNUM);
3701 /* Return true if X is a valid base register.
3702 STRICT_P is true if REG_OK_STRICT is in effect. */
3704 static bool
3705 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3707 if (!strict_p && GET_CODE (x) == SUBREG)
3708 x = SUBREG_REG (x);
3710 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3713 /* Return true if address offset is a valid index. If it is, fill in INFO
3714 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3716 static bool
3717 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3718 machine_mode mode, bool strict_p)
3720 enum aarch64_address_type type;
3721 rtx index;
3722 int shift;
3724 /* (reg:P) */
3725 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3726 && GET_MODE (x) == Pmode)
3728 type = ADDRESS_REG_REG;
3729 index = x;
3730 shift = 0;
3732 /* (sign_extend:DI (reg:SI)) */
3733 else if ((GET_CODE (x) == SIGN_EXTEND
3734 || GET_CODE (x) == ZERO_EXTEND)
3735 && GET_MODE (x) == DImode
3736 && GET_MODE (XEXP (x, 0)) == SImode)
3738 type = (GET_CODE (x) == SIGN_EXTEND)
3739 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3740 index = XEXP (x, 0);
3741 shift = 0;
3743 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3744 else if (GET_CODE (x) == MULT
3745 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3746 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3747 && GET_MODE (XEXP (x, 0)) == DImode
3748 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3749 && CONST_INT_P (XEXP (x, 1)))
3751 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3752 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3753 index = XEXP (XEXP (x, 0), 0);
3754 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3756 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3757 else if (GET_CODE (x) == ASHIFT
3758 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3759 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3760 && GET_MODE (XEXP (x, 0)) == DImode
3761 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3762 && CONST_INT_P (XEXP (x, 1)))
3764 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3765 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3766 index = XEXP (XEXP (x, 0), 0);
3767 shift = INTVAL (XEXP (x, 1));
3769 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3770 else if ((GET_CODE (x) == SIGN_EXTRACT
3771 || GET_CODE (x) == ZERO_EXTRACT)
3772 && GET_MODE (x) == DImode
3773 && GET_CODE (XEXP (x, 0)) == MULT
3774 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3775 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3777 type = (GET_CODE (x) == SIGN_EXTRACT)
3778 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3779 index = XEXP (XEXP (x, 0), 0);
3780 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3781 if (INTVAL (XEXP (x, 1)) != 32 + shift
3782 || INTVAL (XEXP (x, 2)) != 0)
3783 shift = -1;
3785 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3786 (const_int 0xffffffff<<shift)) */
3787 else if (GET_CODE (x) == AND
3788 && GET_MODE (x) == DImode
3789 && GET_CODE (XEXP (x, 0)) == MULT
3790 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3791 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3792 && CONST_INT_P (XEXP (x, 1)))
3794 type = ADDRESS_REG_UXTW;
3795 index = XEXP (XEXP (x, 0), 0);
3796 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3797 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3798 shift = -1;
3800 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3801 else if ((GET_CODE (x) == SIGN_EXTRACT
3802 || GET_CODE (x) == ZERO_EXTRACT)
3803 && GET_MODE (x) == DImode
3804 && GET_CODE (XEXP (x, 0)) == ASHIFT
3805 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3806 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3808 type = (GET_CODE (x) == SIGN_EXTRACT)
3809 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3810 index = XEXP (XEXP (x, 0), 0);
3811 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3812 if (INTVAL (XEXP (x, 1)) != 32 + shift
3813 || INTVAL (XEXP (x, 2)) != 0)
3814 shift = -1;
3816 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3817 (const_int 0xffffffff<<shift)) */
3818 else if (GET_CODE (x) == AND
3819 && GET_MODE (x) == DImode
3820 && GET_CODE (XEXP (x, 0)) == ASHIFT
3821 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3822 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3823 && CONST_INT_P (XEXP (x, 1)))
3825 type = ADDRESS_REG_UXTW;
3826 index = XEXP (XEXP (x, 0), 0);
3827 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3828 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3829 shift = -1;
3831 /* (mult:P (reg:P) (const_int scale)) */
3832 else if (GET_CODE (x) == MULT
3833 && GET_MODE (x) == Pmode
3834 && GET_MODE (XEXP (x, 0)) == Pmode
3835 && CONST_INT_P (XEXP (x, 1)))
3837 type = ADDRESS_REG_REG;
3838 index = XEXP (x, 0);
3839 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3841 /* (ashift:P (reg:P) (const_int shift)) */
3842 else if (GET_CODE (x) == ASHIFT
3843 && GET_MODE (x) == Pmode
3844 && GET_MODE (XEXP (x, 0)) == Pmode
3845 && CONST_INT_P (XEXP (x, 1)))
3847 type = ADDRESS_REG_REG;
3848 index = XEXP (x, 0);
3849 shift = INTVAL (XEXP (x, 1));
3851 else
3852 return false;
3854 if (GET_CODE (index) == SUBREG)
3855 index = SUBREG_REG (index);
3857 if ((shift == 0 ||
3858 (shift > 0 && shift <= 3
3859 && (1 << shift) == GET_MODE_SIZE (mode)))
3860 && REG_P (index)
3861 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3863 info->type = type;
3864 info->offset = index;
3865 info->shift = shift;
3866 return true;
3869 return false;
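/* The extended-index forms recognised above correspond to addressing modes
   such as

       ldr     w0, [x1, w2, sxtw #2]    // (mult (sign_extend reg) 4), SImode
       ldr     x0, [x1, x2, lsl #3]     // (ashift reg 3), DImode

   where the shift amount must match the access size, as checked at the end
   of the function.  */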
3872 bool
3873 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3875 return (offset >= -64 * GET_MODE_SIZE (mode)
3876 && offset < 64 * GET_MODE_SIZE (mode)
3877 && offset % GET_MODE_SIZE (mode) == 0);
3880 static inline bool
3881 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3882 HOST_WIDE_INT offset)
3884 return offset >= -256 && offset < 256;
3887 static inline bool
3888 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3890 return (offset >= 0
3891 && offset < 4096 * GET_MODE_SIZE (mode)
3892 && offset % GET_MODE_SIZE (mode) == 0);
3895 /* Return true if MODE is one of the modes for which we
3896 support LDP/STP operations. */
3898 static bool
3899 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3901 return mode == SImode || mode == DImode
3902 || mode == SFmode || mode == DFmode
3903 || (aarch64_vector_mode_supported_p (mode)
3904 && GET_MODE_SIZE (mode) == 8);
3907 /* Return true if REGNO is a virtual pointer register, or an eliminable
3908 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
3909 include stack_pointer or hard_frame_pointer. */
3910 static bool
3911 virt_or_elim_regno_p (unsigned regno)
3913 return ((regno >= FIRST_VIRTUAL_REGISTER
3914 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
3915 || regno == FRAME_POINTER_REGNUM
3916 || regno == ARG_POINTER_REGNUM);
3919 /* Return true if X is a valid address for machine mode MODE. If it is,
3920 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3921 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3923 static bool
3924 aarch64_classify_address (struct aarch64_address_info *info,
3925 rtx x, machine_mode mode,
3926 RTX_CODE outer_code, bool strict_p)
3928 enum rtx_code code = GET_CODE (x);
3929 rtx op0, op1;
3931 /* On BE, we use load/store pair for all large int mode load/stores. */
3932 bool load_store_pair_p = (outer_code == PARALLEL
3933 || (BYTES_BIG_ENDIAN
3934 && aarch64_vect_struct_mode_p (mode)));
3936 bool allow_reg_index_p =
3937 !load_store_pair_p
3938 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3939 && !aarch64_vect_struct_mode_p (mode);
3941 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3942 REG addressing. */
3943 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3944 && (code != POST_INC && code != REG))
3945 return false;
3947 switch (code)
3949 case REG:
3950 case SUBREG:
3951 info->type = ADDRESS_REG_IMM;
3952 info->base = x;
3953 info->offset = const0_rtx;
3954 return aarch64_base_register_rtx_p (x, strict_p);
3956 case PLUS:
3957 op0 = XEXP (x, 0);
3958 op1 = XEXP (x, 1);
3960 if (! strict_p
3961 && REG_P (op0)
3962 && virt_or_elim_regno_p (REGNO (op0))
3963 && CONST_INT_P (op1))
3965 info->type = ADDRESS_REG_IMM;
3966 info->base = op0;
3967 info->offset = op1;
3969 return true;
3972 if (GET_MODE_SIZE (mode) != 0
3973 && CONST_INT_P (op1)
3974 && aarch64_base_register_rtx_p (op0, strict_p))
3976 HOST_WIDE_INT offset = INTVAL (op1);
3978 info->type = ADDRESS_REG_IMM;
3979 info->base = op0;
3980 info->offset = op1;
3982 /* TImode and TFmode values are allowed in both pairs of X
3983 registers and individual Q registers. The available
3984 address modes are:
3985 X,X: 7-bit signed scaled offset
3986 Q: 9-bit signed offset
3987 We conservatively require an offset representable in either mode.
3988 When performing the check for pairs of X registers, i.e. LDP/STP,
3989 pass down DImode since that is the natural size of the LDP/STP
3990 instruction memory accesses. */
3991 if (mode == TImode || mode == TFmode)
3992 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
3993 && offset_9bit_signed_unscaled_p (mode, offset));
3995 /* A 7-bit offset check because OImode will emit an ldp/stp
3996 instruction (only big endian will get here).
3997 For ldp/stp instructions, the offset is scaled for the size of a
3998 single element of the pair. */
3999 if (mode == OImode)
4000 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4002 /* Three 9/12-bit offset checks because CImode will emit three
4003 ldr/str instructions (only big endian will get here). */
4004 if (mode == CImode)
4005 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4006 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4007 || offset_12bit_unsigned_scaled_p (V16QImode,
4008 offset + 32)));
4010 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4011 instructions (only big endian will get here). */
4012 if (mode == XImode)
4013 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4014 && aarch64_offset_7bit_signed_scaled_p (TImode,
4015 offset + 32));
4017 if (load_store_pair_p)
4018 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4019 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4020 else
4021 return (offset_9bit_signed_unscaled_p (mode, offset)
4022 || offset_12bit_unsigned_scaled_p (mode, offset));
4025 if (allow_reg_index_p)
4027 /* Look for base + (scaled/extended) index register. */
4028 if (aarch64_base_register_rtx_p (op0, strict_p)
4029 && aarch64_classify_index (info, op1, mode, strict_p))
4031 info->base = op0;
4032 return true;
4034 if (aarch64_base_register_rtx_p (op1, strict_p)
4035 && aarch64_classify_index (info, op0, mode, strict_p))
4037 info->base = op1;
4038 return true;
4042 return false;
4044 case POST_INC:
4045 case POST_DEC:
4046 case PRE_INC:
4047 case PRE_DEC:
4048 info->type = ADDRESS_REG_WB;
4049 info->base = XEXP (x, 0);
4050 info->offset = NULL_RTX;
4051 return aarch64_base_register_rtx_p (info->base, strict_p);
4053 case POST_MODIFY:
4054 case PRE_MODIFY:
4055 info->type = ADDRESS_REG_WB;
4056 info->base = XEXP (x, 0);
4057 if (GET_CODE (XEXP (x, 1)) == PLUS
4058 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4059 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4060 && aarch64_base_register_rtx_p (info->base, strict_p))
4062 HOST_WIDE_INT offset;
4063 info->offset = XEXP (XEXP (x, 1), 1);
4064 offset = INTVAL (info->offset);
4066 /* TImode and TFmode values are allowed in both pairs of X
4067 registers and individual Q registers. The available
4068 address modes are:
4069 X,X: 7-bit signed scaled offset
4070 Q: 9-bit signed offset
4071 We conservatively require an offset representable in either mode.  */
4073 if (mode == TImode || mode == TFmode)
4074 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4075 && offset_9bit_signed_unscaled_p (mode, offset));
4077 if (load_store_pair_p)
4078 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4079 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4080 else
4081 return offset_9bit_signed_unscaled_p (mode, offset);
4083 return false;
4085 case CONST:
4086 case SYMBOL_REF:
4087 case LABEL_REF:
4088 /* load literal: pc-relative constant pool entry. Only supported
4089 for SI mode or larger. */
4090 info->type = ADDRESS_SYMBOLIC;
4092 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4094 rtx sym, addend;
4096 split_const (x, &sym, &addend);
4097 return ((GET_CODE (sym) == LABEL_REF
4098 || (GET_CODE (sym) == SYMBOL_REF
4099 && CONSTANT_POOL_ADDRESS_P (sym)
4100 && aarch64_pcrelative_literal_loads)));
4102 return false;
4104 case LO_SUM:
4105 info->type = ADDRESS_LO_SUM;
4106 info->base = XEXP (x, 0);
4107 info->offset = XEXP (x, 1);
4108 if (allow_reg_index_p
4109 && aarch64_base_register_rtx_p (info->base, strict_p))
4111 rtx sym, offs;
4112 split_const (info->offset, &sym, &offs);
4113 if (GET_CODE (sym) == SYMBOL_REF
4114 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4116 /* The symbol and offset must be aligned to the access size. */
4117 unsigned int align;
4118 unsigned int ref_size;
4120 if (CONSTANT_POOL_ADDRESS_P (sym))
4121 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4122 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4124 tree exp = SYMBOL_REF_DECL (sym);
4125 align = TYPE_ALIGN (TREE_TYPE (exp));
4126 align = CONSTANT_ALIGNMENT (exp, align);
4128 else if (SYMBOL_REF_DECL (sym))
4129 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4130 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4131 && SYMBOL_REF_BLOCK (sym) != NULL)
4132 align = SYMBOL_REF_BLOCK (sym)->alignment;
4133 else
4134 align = BITS_PER_UNIT;
4136 ref_size = GET_MODE_SIZE (mode);
4137 if (ref_size == 0)
4138 ref_size = GET_MODE_SIZE (DImode);
4140 return ((INTVAL (offs) & (ref_size - 1)) == 0
4141 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4144 return false;
4146 default:
4147 return false;
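/* Return true if X is a symbolic address: a SYMBOL_REF or LABEL_REF,
   possibly with a constant offset.  */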
4151 bool
4152 aarch64_symbolic_address_p (rtx x)
4154 rtx offset;
4156 split_const (x, &x, &offset);
4157 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4160 /* Classify the base of symbolic expression X. */
4162 enum aarch64_symbol_type
4163 aarch64_classify_symbolic_expression (rtx x)
4165 rtx offset;
4167 split_const (x, &x, &offset);
4168 return aarch64_classify_symbol (x, offset);
4172 /* Return TRUE if X is a legitimate address for accessing memory in
4173 mode MODE. */
4174 static bool
4175 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4177 struct aarch64_address_info addr;
4179 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4182 /* Return TRUE if X is a legitimate address for accessing memory in
4183 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4184 pair operation. */
4185 bool
4186 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4187 RTX_CODE outer_code, bool strict_p)
4189 struct aarch64_address_info addr;
4191 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4194 /* Return TRUE if rtx X is immediate constant 0.0 */
4195 bool
4196 aarch64_float_const_zero_rtx_p (rtx x)
4198 if (GET_MODE (x) == VOIDmode)
4199 return false;
4201 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4202 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4203 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4206 /* Return the fixed registers used for condition codes. */
4208 static bool
4209 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4211 *p1 = CC_REGNUM;
4212 *p2 = INVALID_REGNUM;
4213 return true;
4216 /* Emit call insn with PAT and do aarch64-specific handling. */
4218 void
4219 aarch64_emit_call_insn (rtx pat)
4221 rtx insn = emit_call_insn (pat);
4223 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4224 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4225 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
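/* Return the mode for the condition-code register used to compare X
   with Y using comparison code CODE.  */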
4228 machine_mode
4229 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4231 /* All floating point compares return CCFP if it is an equality
4232 comparison, and CCFPE otherwise. */
4233 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4235 switch (code)
4237 case EQ:
4238 case NE:
4239 case UNORDERED:
4240 case ORDERED:
4241 case UNLT:
4242 case UNLE:
4243 case UNGT:
4244 case UNGE:
4245 case UNEQ:
4246 case LTGT:
4247 return CCFPmode;
4249 case LT:
4250 case LE:
4251 case GT:
4252 case GE:
4253 return CCFPEmode;
4255 default:
4256 gcc_unreachable ();
4260 /* Equality comparisons of short modes against zero can be performed
4261 using the TST instruction with the appropriate bitmask. */
4262 if (y == const0_rtx && REG_P (x)
4263 && (code == EQ || code == NE)
4264 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4265 return CC_NZmode;
4267 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4268 && y == const0_rtx
4269 && (code == EQ || code == NE || code == LT || code == GE)
4270 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4271 || GET_CODE (x) == NEG
4272 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4273 && CONST_INT_P (XEXP (x, 2)))))
4274 return CC_NZmode;
4276 /* A compare with a shifted operand. Because of canonicalization,
4277 the comparison will have to be swapped when we emit the assembly
4278 code. */
4279 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4280 && (REG_P (y) || GET_CODE (y) == SUBREG)
4281 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4282 || GET_CODE (x) == LSHIFTRT
4283 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4284 return CC_SWPmode;
4286 /* Similarly for a negated operand, but we can only do this for
4287 equalities. */
4288 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4289 && (REG_P (y) || GET_CODE (y) == SUBREG)
4290 && (code == EQ || code == NE)
4291 && GET_CODE (x) == NEG)
4292 return CC_Zmode;
4294 /* A test for unsigned overflow. */
4295 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4296 && code == NE
4297 && GET_CODE (x) == PLUS
4298 && GET_CODE (y) == ZERO_EXTEND)
4299 return CC_Cmode;
4301 /* For everything else, return CCmode. */
4302 return CCmode;
4305 static int
4306 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
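/* Return the AArch64 condition code (AARCH64_EQ etc.) that corresponds to
   comparison X, or -1 if no single condition code can represent it.  */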
4309 aarch64_get_condition_code (rtx x)
4311 machine_mode mode = GET_MODE (XEXP (x, 0));
4312 enum rtx_code comp_code = GET_CODE (x);
4314 if (GET_MODE_CLASS (mode) != MODE_CC)
4315 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4316 return aarch64_get_condition_code_1 (mode, comp_code);
4319 static int
4320 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4322 switch (mode)
4324 case CCFPmode:
4325 case CCFPEmode:
4326 switch (comp_code)
4328 case GE: return AARCH64_GE;
4329 case GT: return AARCH64_GT;
4330 case LE: return AARCH64_LS;
4331 case LT: return AARCH64_MI;
4332 case NE: return AARCH64_NE;
4333 case EQ: return AARCH64_EQ;
4334 case ORDERED: return AARCH64_VC;
4335 case UNORDERED: return AARCH64_VS;
4336 case UNLT: return AARCH64_LT;
4337 case UNLE: return AARCH64_LE;
4338 case UNGT: return AARCH64_HI;
4339 case UNGE: return AARCH64_PL;
4340 default: return -1;
4342 break;
4344 case CCmode:
4345 switch (comp_code)
4347 case NE: return AARCH64_NE;
4348 case EQ: return AARCH64_EQ;
4349 case GE: return AARCH64_GE;
4350 case GT: return AARCH64_GT;
4351 case LE: return AARCH64_LE;
4352 case LT: return AARCH64_LT;
4353 case GEU: return AARCH64_CS;
4354 case GTU: return AARCH64_HI;
4355 case LEU: return AARCH64_LS;
4356 case LTU: return AARCH64_CC;
4357 default: return -1;
4359 break;
4361 case CC_SWPmode:
4362 switch (comp_code)
4364 case NE: return AARCH64_NE;
4365 case EQ: return AARCH64_EQ;
4366 case GE: return AARCH64_LE;
4367 case GT: return AARCH64_LT;
4368 case LE: return AARCH64_GE;
4369 case LT: return AARCH64_GT;
4370 case GEU: return AARCH64_LS;
4371 case GTU: return AARCH64_CC;
4372 case LEU: return AARCH64_CS;
4373 case LTU: return AARCH64_HI;
4374 default: return -1;
4376 break;
4378 case CC_NZmode:
4379 switch (comp_code)
4381 case NE: return AARCH64_NE;
4382 case EQ: return AARCH64_EQ;
4383 case GE: return AARCH64_PL;
4384 case LT: return AARCH64_MI;
4385 default: return -1;
4387 break;
4389 case CC_Zmode:
4390 switch (comp_code)
4392 case NE: return AARCH64_NE;
4393 case EQ: return AARCH64_EQ;
4394 default: return -1;
4396 break;
4398 case CC_Cmode:
4399 switch (comp_code)
4401 case NE: return AARCH64_CS;
4402 case EQ: return AARCH64_CC;
4403 default: return -1;
4405 break;
4407 default:
4408 return -1;
4409 break;
4412 return -1;
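/* Return true if X is a CONST_VECTOR of integers whose elements are all
   identical and lie within [MINVAL, MAXVAL].  */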
4415 bool
4416 aarch64_const_vec_all_same_in_range_p (rtx x,
4417 HOST_WIDE_INT minval,
4418 HOST_WIDE_INT maxval)
4420 HOST_WIDE_INT firstval;
4421 int count, i;
4423 if (GET_CODE (x) != CONST_VECTOR
4424 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4425 return false;
4427 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4428 if (firstval < minval || firstval > maxval)
4429 return false;
4431 count = CONST_VECTOR_NUNITS (x);
4432 for (i = 1; i < count; i++)
4433 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4434 return false;
4436 return true;
4439 bool
4440 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4442 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4446 /* N Z C V. */
4447 #define AARCH64_CC_V 1
4448 #define AARCH64_CC_C (1 << 1)
4449 #define AARCH64_CC_Z (1 << 2)
4450 #define AARCH64_CC_N (1 << 3)
4452 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4453 static const int aarch64_nzcv_codes[] =
4455 0, /* EQ, Z == 1. */
4456 AARCH64_CC_Z, /* NE, Z == 0. */
4457 0, /* CS, C == 1. */
4458 AARCH64_CC_C, /* CC, C == 0. */
4459 0, /* MI, N == 1. */
4460 AARCH64_CC_N, /* PL, N == 0. */
4461 0, /* VS, V == 1. */
4462 AARCH64_CC_V, /* VC, V == 0. */
4463 0, /* HI, C == 1 && Z == 0. */
4464 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4465 AARCH64_CC_V, /* GE, N == V. */
4466 0, /* LT, N != V. */
4467 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4468 0, /* LE, !(Z == 0 && N == V). */
4469 0, /* AL, Any. */
4470 0 /* NV, Any. */
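/* Print operand X to file F, applying the aarch64-specific operand
   modifier CODE (see the cases below for the supported modifiers).  */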
4473 static void
4474 aarch64_print_operand (FILE *f, rtx x, int code)
4476 switch (code)
4478 /* An integer or symbol address without a preceding # sign. */
4479 case 'c':
4480 switch (GET_CODE (x))
4482 case CONST_INT:
4483 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4484 break;
4486 case SYMBOL_REF:
4487 output_addr_const (f, x);
4488 break;
4490 case CONST:
4491 if (GET_CODE (XEXP (x, 0)) == PLUS
4492 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4494 output_addr_const (f, x);
4495 break;
4497 /* Fall through. */
4499 default:
4500 output_operand_lossage ("Unsupported operand for code '%c'", code);
4502 break;
4504 case 'e':
4505 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4507 int n;
4509 if (!CONST_INT_P (x)
4510 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4512 output_operand_lossage ("invalid operand for '%%%c'", code);
4513 return;
4516 switch (n)
4518 case 3:
4519 fputc ('b', f);
4520 break;
4521 case 4:
4522 fputc ('h', f);
4523 break;
4524 case 5:
4525 fputc ('w', f);
4526 break;
4527 default:
4528 output_operand_lossage ("invalid operand for '%%%c'", code);
4529 return;
4532 break;
4534 case 'p':
4536 int n;
4538 /* Print N such that 2^N == X. */
4539 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4541 output_operand_lossage ("invalid operand for '%%%c'", code);
4542 return;
4545 asm_fprintf (f, "%d", n);
4547 break;
4549 case 'P':
4550 /* Print the number of non-zero bits in X (a const_int). */
4551 if (!CONST_INT_P (x))
4553 output_operand_lossage ("invalid operand for '%%%c'", code);
4554 return;
4557 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4558 break;
4560 case 'H':
4561 /* Print the higher numbered register of a pair (TImode) of regs. */
4562 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4564 output_operand_lossage ("invalid operand for '%%%c'", code);
4565 return;
4568 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4569 break;
4571 case 'M':
4572 case 'm':
4574 int cond_code;
4575 /* Print a condition (eq, ne, etc) or its inverse. */
4577 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4578 if (x == const_true_rtx)
4580 if (code == 'M')
4581 fputs ("nv", f);
4582 return;
4585 if (!COMPARISON_P (x))
4587 output_operand_lossage ("invalid operand for '%%%c'", code);
4588 return;
4591 cond_code = aarch64_get_condition_code (x);
4592 gcc_assert (cond_code >= 0);
4593 if (code == 'M')
4594 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4595 fputs (aarch64_condition_codes[cond_code], f);
4597 break;
4599 case 'b':
4600 case 'h':
4601 case 's':
4602 case 'd':
4603 case 'q':
4604 /* Print a scalar FP/SIMD register name. */
4605 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4607 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4608 return;
4610 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4611 break;
4613 case 'S':
4614 case 'T':
4615 case 'U':
4616 case 'V':
4617 /* Print the first FP/SIMD register name in a list. */
4618 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4620 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4621 return;
4623 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4624 break;
4626 case 'R':
4627 /* Print a scalar FP/SIMD register name + 1. */
4628 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4630 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4631 return;
4633 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4634 break;
4636 case 'X':
4637 /* Print bottom 16 bits of integer constant in hex. */
4638 if (!CONST_INT_P (x))
4640 output_operand_lossage ("invalid operand for '%%%c'", code);
4641 return;
4643 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4644 break;
4646 case 'w':
4647 case 'x':
4648 /* Print a general register name or the zero register (32-bit or
4649 64-bit). */
4650 if (x == const0_rtx
4651 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4653 asm_fprintf (f, "%czr", code);
4654 break;
4657 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4659 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4660 break;
4663 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4665 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4666 break;
4669 /* Fall through */
4671 case 0:
4672 /* Print a normal operand.  If it's a general register, then we
4673 assume DImode. */
4674 if (x == NULL)
4676 output_operand_lossage ("missing operand");
4677 return;
4680 switch (GET_CODE (x))
4682 case REG:
4683 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4684 break;
4686 case MEM:
4687 output_address (GET_MODE (x), XEXP (x, 0));
4688 break;
4690 case CONST:
4691 case LABEL_REF:
4692 case SYMBOL_REF:
4693 output_addr_const (asm_out_file, x);
4694 break;
4696 case CONST_INT:
4697 asm_fprintf (f, "%wd", INTVAL (x));
4698 break;
4700 case CONST_VECTOR:
4701 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4703 gcc_assert (
4704 aarch64_const_vec_all_same_in_range_p (x,
4705 HOST_WIDE_INT_MIN,
4706 HOST_WIDE_INT_MAX));
4707 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4709 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4711 fputc ('0', f);
4713 else
4714 gcc_unreachable ();
4715 break;
4717 case CONST_DOUBLE:
4718 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4719 be getting CONST_DOUBLEs holding integers. */
4720 gcc_assert (GET_MODE (x) != VOIDmode);
4721 if (aarch64_float_const_zero_rtx_p (x))
4723 fputc ('0', f);
4724 break;
4726 else if (aarch64_float_const_representable_p (x))
4728 #define buf_size 20
4729 char float_buf[buf_size] = {'\0'};
4730 real_to_decimal_for_mode (float_buf,
4731 CONST_DOUBLE_REAL_VALUE (x),
4732 buf_size, buf_size,
4733 1, GET_MODE (x));
4734 asm_fprintf (asm_out_file, "%s", float_buf);
4735 break;
4736 #undef buf_size
4738 output_operand_lossage ("invalid constant");
4739 return;
4740 default:
4741 output_operand_lossage ("invalid operand");
4742 return;
4744 break;
4746 case 'A':
4747 if (GET_CODE (x) == HIGH)
4748 x = XEXP (x, 0);
4750 switch (aarch64_classify_symbolic_expression (x))
4752 case SYMBOL_SMALL_GOT_4G:
4753 asm_fprintf (asm_out_file, ":got:");
4754 break;
4756 case SYMBOL_SMALL_TLSGD:
4757 asm_fprintf (asm_out_file, ":tlsgd:");
4758 break;
4760 case SYMBOL_SMALL_TLSDESC:
4761 asm_fprintf (asm_out_file, ":tlsdesc:");
4762 break;
4764 case SYMBOL_SMALL_TLSIE:
4765 asm_fprintf (asm_out_file, ":gottprel:");
4766 break;
4768 case SYMBOL_TLSLE24:
4769 asm_fprintf (asm_out_file, ":tprel:");
4770 break;
4772 case SYMBOL_TINY_GOT:
4773 gcc_unreachable ();
4774 break;
4776 default:
4777 break;
4779 output_addr_const (asm_out_file, x);
4780 break;
4782 case 'L':
4783 switch (aarch64_classify_symbolic_expression (x))
4785 case SYMBOL_SMALL_GOT_4G:
4786 asm_fprintf (asm_out_file, ":lo12:");
4787 break;
4789 case SYMBOL_SMALL_TLSGD:
4790 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4791 break;
4793 case SYMBOL_SMALL_TLSDESC:
4794 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4795 break;
4797 case SYMBOL_SMALL_TLSIE:
4798 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4799 break;
4801 case SYMBOL_TLSLE12:
4802 asm_fprintf (asm_out_file, ":tprel_lo12:");
4803 break;
4805 case SYMBOL_TLSLE24:
4806 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4807 break;
4809 case SYMBOL_TINY_GOT:
4810 asm_fprintf (asm_out_file, ":got:");
4811 break;
4813 case SYMBOL_TINY_TLSIE:
4814 asm_fprintf (asm_out_file, ":gottprel:");
4815 break;
4817 default:
4818 break;
4820 output_addr_const (asm_out_file, x);
4821 break;
4823 case 'G':
4825 switch (aarch64_classify_symbolic_expression (x))
4827 case SYMBOL_TLSLE24:
4828 asm_fprintf (asm_out_file, ":tprel_hi12:");
4829 break;
4830 default:
4831 break;
4833 output_addr_const (asm_out_file, x);
4834 break;
4836 case 'k':
4838 HOST_WIDE_INT cond_code;
4839 /* Print nzcv. */
4841 if (!CONST_INT_P (x))
4843 output_operand_lossage ("invalid operand for '%%%c'", code);
4844 return;
4847 cond_code = INTVAL (x);
4848 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
4849 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
4851 break;
4853 default:
4854 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4855 return;
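/* Print to F the address X, which is being used to access an object of
   mode MODE, in the syntax expected by the assembler.  */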
4859 static void
4860 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4862 struct aarch64_address_info addr;
4864 if (aarch64_classify_address (&addr, x, mode, MEM, true))
4865 switch (addr.type)
4867 case ADDRESS_REG_IMM:
4868 if (addr.offset == const0_rtx)
4869 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4870 else
4871 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4872 INTVAL (addr.offset));
4873 return;
4875 case ADDRESS_REG_REG:
4876 if (addr.shift == 0)
4877 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4878 reg_names [REGNO (addr.offset)]);
4879 else
4880 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4881 reg_names [REGNO (addr.offset)], addr.shift);
4882 return;
4884 case ADDRESS_REG_UXTW:
4885 if (addr.shift == 0)
4886 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4887 REGNO (addr.offset) - R0_REGNUM);
4888 else
4889 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4890 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4891 return;
4893 case ADDRESS_REG_SXTW:
4894 if (addr.shift == 0)
4895 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4896 REGNO (addr.offset) - R0_REGNUM);
4897 else
4898 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4899 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4900 return;
4902 case ADDRESS_REG_WB:
4903 switch (GET_CODE (x))
4905 case PRE_INC:
4906 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4907 GET_MODE_SIZE (mode));
4908 return;
4909 case POST_INC:
4910 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4911 GET_MODE_SIZE (mode));
4912 return;
4913 case PRE_DEC:
4914 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4915 GET_MODE_SIZE (mode));
4916 return;
4917 case POST_DEC:
4918 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4919 GET_MODE_SIZE (mode));
4920 return;
4921 case PRE_MODIFY:
4922 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4923 INTVAL (addr.offset));
4924 return;
4925 case POST_MODIFY:
4926 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4927 INTVAL (addr.offset));
4928 return;
4929 default:
4930 break;
4932 break;
4934 case ADDRESS_LO_SUM:
4935 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4936 output_addr_const (f, addr.offset);
4937 asm_fprintf (f, "]");
4938 return;
4940 case ADDRESS_SYMBOLIC:
4941 break;
4944 output_addr_const (f, x);
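/* Return true if X (or any sub-rtx of X) mentions a label, ignoring the
   LABEL_REFs found inside TLS unspecs.  */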
4947 bool
4948 aarch64_label_mentioned_p (rtx x)
4950 const char *fmt;
4951 int i;
4953 if (GET_CODE (x) == LABEL_REF)
4954 return true;
4956 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4957 referencing instruction, but they are constant offsets, not
4958 symbols. */
4959 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4960 return false;
4962 fmt = GET_RTX_FORMAT (GET_CODE (x));
4963 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4965 if (fmt[i] == 'E')
4967 int j;
4969 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4970 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4971 return 1;
4973 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4974 return 1;
4977 return 0;
4980 /* Implement REGNO_REG_CLASS. */
4982 enum reg_class
4983 aarch64_regno_regclass (unsigned regno)
4985 if (GP_REGNUM_P (regno))
4986 return GENERAL_REGS;
4988 if (regno == SP_REGNUM)
4989 return STACK_REG;
4991 if (regno == FRAME_POINTER_REGNUM
4992 || regno == ARG_POINTER_REGNUM)
4993 return POINTER_REGS;
4995 if (FP_REGNUM_P (regno))
4996 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4998 return NO_REGS;
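/* Attempt to legitimize address X, used to access an object of mode MODE;
   return the (possibly unchanged) replacement address.  */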
5001 static rtx
5002 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5004 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5005 where mask is selected by alignment and size of the offset.
5006 We try to pick as large a range for the offset as possible to
5007 maximize the chance of a CSE. However, for aligned addresses
5008 we limit the range to 4k so that structures with different sized
5009 elements are likely to use the same base. We need to be careful
5010 not to split a CONST for some forms of address expression, otherwise
5011 it will generate sub-optimal code. */
5013 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5015 rtx base = XEXP (x, 0);
5016 rtx offset_rtx = XEXP (x, 1);
5017 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5019 if (GET_CODE (base) == PLUS)
5021 rtx op0 = XEXP (base, 0);
5022 rtx op1 = XEXP (base, 1);
5024 /* Force any scaling into a temp for CSE. */
5025 op0 = force_reg (Pmode, op0);
5026 op1 = force_reg (Pmode, op1);
5028 /* Let the pointer register be in op0. */
5029 if (REG_POINTER (op1))
5030 std::swap (op0, op1);
5032 /* If the pointer is virtual or frame related, then we know that
5033 virtual register instantiation or register elimination is going
5034 to apply a second constant. We want the two constants folded
5035 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5036 if (virt_or_elim_regno_p (REGNO (op0)))
5038 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5039 NULL_RTX, true, OPTAB_DIRECT);
5040 return gen_rtx_PLUS (Pmode, base, op1);
5043 /* Otherwise, in order to encourage CSE (and thence loop strength
5044 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5045 base = expand_binop (Pmode, add_optab, op0, op1,
5046 NULL_RTX, true, OPTAB_DIRECT);
5047 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5050 /* Does it look like we'll need a load/store-pair operation? */
5051 HOST_WIDE_INT base_offset;
5052 if (GET_MODE_SIZE (mode) > 16
5053 || mode == TImode)
5054 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
5055 & ~((128 * GET_MODE_SIZE (mode)) - 1));
5056 /* For offsets that aren't a multiple of the access size, the limit is
5057 -256...255. */
5058 else if (offset & (GET_MODE_SIZE (mode) - 1))
5059 base_offset = (offset + 0x100) & ~0x1ff;
5060 else
5061 base_offset = offset & ~0xfff;
5063 if (base_offset != 0)
5065 base = plus_constant (Pmode, base, base_offset);
5066 base = force_operand (base, NULL_RTX);
5067 return plus_constant (Pmode, base, offset - base_offset);
5071 return x;
5074 /* Return the reload icode required for a constant pool in mode. */
5075 static enum insn_code
5076 aarch64_constant_pool_reload_icode (machine_mode mode)
5078 switch (mode)
5080 case SFmode:
5081 return CODE_FOR_aarch64_reload_movcpsfdi;
5083 case DFmode:
5084 return CODE_FOR_aarch64_reload_movcpdfdi;
5086 case TFmode:
5087 return CODE_FOR_aarch64_reload_movcptfdi;
5089 case V8QImode:
5090 return CODE_FOR_aarch64_reload_movcpv8qidi;
5092 case V16QImode:
5093 return CODE_FOR_aarch64_reload_movcpv16qidi;
5095 case V4HImode:
5096 return CODE_FOR_aarch64_reload_movcpv4hidi;
5098 case V8HImode:
5099 return CODE_FOR_aarch64_reload_movcpv8hidi;
5101 case V2SImode:
5102 return CODE_FOR_aarch64_reload_movcpv2sidi;
5104 case V4SImode:
5105 return CODE_FOR_aarch64_reload_movcpv4sidi;
5107 case V2DImode:
5108 return CODE_FOR_aarch64_reload_movcpv2didi;
5110 case V2DFmode:
5111 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5113 default:
5114 gcc_unreachable ();
5117 gcc_unreachable ();
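/* Return the register class required as an intermediate when moving X of
   mode MODE into a register of class RCLASS, recording in SRI any special
   reload insn to use.  Return NO_REGS when no intermediate class is needed.  */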
5119 static reg_class_t
5120 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5121 reg_class_t rclass,
5122 machine_mode mode,
5123 secondary_reload_info *sri)
5126 /* If we have to disable direct literal pool loads and stores because the
5127 function is too big, then we need a scratch register. */
5128 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5129 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5130 || targetm.vector_mode_supported_p (GET_MODE (x)))
5131 && !aarch64_pcrelative_literal_loads)
5133 sri->icode = aarch64_constant_pool_reload_icode (mode);
5134 return NO_REGS;
5137 /* Without the TARGET_SIMD instructions we cannot move a Q register
5138 to a Q register directly. We need a scratch. */
5139 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5140 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5141 && reg_class_subset_p (rclass, FP_REGS))
5143 if (mode == TFmode)
5144 sri->icode = CODE_FOR_aarch64_reload_movtf;
5145 else if (mode == TImode)
5146 sri->icode = CODE_FOR_aarch64_reload_movti;
5147 return NO_REGS;
5150 /* A TFmode or TImode memory access should be handled via FP_REGS
5151 because AArch64 has richer addressing modes for LDR/STR instructions
5152 than for LDP/STP instructions. */
5153 if (TARGET_FLOAT && rclass == GENERAL_REGS
5154 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5155 return FP_REGS;
5157 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5158 return GENERAL_REGS;
5160 return NO_REGS;
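/* Return true if eliminating register FROM in favour of register TO is
   allowed under the current frame-pointer settings.  */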
5163 static bool
5164 aarch64_can_eliminate (const int from, const int to)
5166 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5167 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5169 if (frame_pointer_needed)
5171 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5172 return true;
5173 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5174 return false;
5175 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5176 && !cfun->calls_alloca)
5177 return true;
5178 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5179 return true;
5181 return false;
5183 else
5185 /* If we decided that we didn't need a leaf frame pointer but then used
5186 LR in the function, then we'll want a frame pointer after all, so
5187 prevent this elimination to ensure a frame pointer is used. */
5188 if (to == STACK_POINTER_REGNUM
5189 && flag_omit_leaf_frame_pointer
5190 && df_regs_ever_live_p (LR_REGNUM))
5191 return false;
5194 return true;
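/* Return the offset between registers FROM and TO for the purposes of
   register elimination, based on the current frame layout.  */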
5197 HOST_WIDE_INT
5198 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5200 aarch64_layout_frame ();
5202 if (to == HARD_FRAME_POINTER_REGNUM)
5204 if (from == ARG_POINTER_REGNUM)
5205 return cfun->machine->frame.hard_fp_offset;
5207 if (from == FRAME_POINTER_REGNUM)
5208 return cfun->machine->frame.hard_fp_offset
5209 - cfun->machine->frame.locals_offset;
5212 if (to == STACK_POINTER_REGNUM)
5214 if (from == FRAME_POINTER_REGNUM)
5215 return cfun->machine->frame.frame_size
5216 - cfun->machine->frame.locals_offset;
5219 return cfun->machine->frame.frame_size;
5222 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5223 previous frame. */
5226 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5228 if (count != 0)
5229 return const0_rtx;
5230 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
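/* Output to F the assembler code template used for trampolines.  */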
5234 static void
5235 aarch64_asm_trampoline_template (FILE *f)
5237 if (TARGET_ILP32)
5239 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5240 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5242 else
5244 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5245 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5247 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5248 assemble_aligned_integer (4, const0_rtx);
5249 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5250 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
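/* Initialise trampoline M_TRAMP so that, when executed, it loads the static
   chain value CHAIN_VALUE and jumps to the address of FNDECL.  */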
5253 static void
5254 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5256 rtx fnaddr, mem, a_tramp;
5257 const int tramp_code_sz = 16;
5259 /* Don't need to copy the trailing D-words; we fill those in below. */
5260 emit_block_move (m_tramp, assemble_trampoline_template (),
5261 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5262 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5263 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5264 if (GET_MODE (fnaddr) != ptr_mode)
5265 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5266 emit_move_insn (mem, fnaddr);
5268 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5269 emit_move_insn (mem, chain_value);
5271 /* XXX We should really define a "clear_cache" pattern and use
5272 gen_clear_cache(). */
5273 a_tramp = XEXP (m_tramp, 0);
5274 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5275 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5276 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5277 ptr_mode);
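/* Return the number of hard registers of class REGCLASS needed to hold a
   value of mode MODE.  */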
5280 static unsigned char
5281 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5283 switch (regclass)
5285 case CALLER_SAVE_REGS:
5286 case POINTER_REGS:
5287 case GENERAL_REGS:
5288 case ALL_REGS:
5289 case FP_REGS:
5290 case FP_LO_REGS:
5291 return
5292 aarch64_vector_mode_p (mode)
5293 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5294 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5295 case STACK_REG:
5296 return 1;
5298 case NO_REGS:
5299 return 0;
5301 default:
5302 break;
5304 gcc_unreachable ();
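/* Return the preferred register class for reloading X into a register of
   class REGCLASS, or NO_REGS if such a reload cannot be supported.  */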
5307 static reg_class_t
5308 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5310 if (regclass == POINTER_REGS)
5311 return GENERAL_REGS;
5313 if (regclass == STACK_REG)
5315 if (REG_P(x)
5316 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5317 return regclass;
5319 return NO_REGS;
5322 /* If it's an integer immediate that MOVI can't handle, then
5323 FP_REGS is not an option, so we return NO_REGS instead. */
5324 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5325 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5326 return NO_REGS;
5328 /* Register elimination can result in a request for
5329 SP+constant->FP_REGS. We cannot support such operations, which
5330 use SP as source and an FP_REG as destination, so reject them
5331 outright here. */
5332 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5334 rtx lhs = XEXP (x, 0);
5336 /* Look through a possible SUBREG introduced by ILP32. */
5337 if (GET_CODE (lhs) == SUBREG)
5338 lhs = SUBREG_REG (lhs);
5340 gcc_assert (REG_P (lhs));
5341 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5342 POINTER_REGS));
5343 return NO_REGS;
5346 return regclass;
5349 void
5350 aarch64_asm_output_labelref (FILE* f, const char *name)
5352 asm_fprintf (f, "%U%s", name);
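/* Output the assembly for a constructor SYMBOL with priority PRIORITY,
   using a dedicated .init_array.NNNNN section for non-default priorities.  */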
5355 static void
5356 aarch64_elf_asm_constructor (rtx symbol, int priority)
5358 if (priority == DEFAULT_INIT_PRIORITY)
5359 default_ctor_section_asm_out_constructor (symbol, priority);
5360 else
5362 section *s;
5363 char buf[18];
5364 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5365 s = get_section (buf, SECTION_WRITE, NULL);
5366 switch_to_section (s);
5367 assemble_align (POINTER_SIZE);
5368 assemble_aligned_integer (POINTER_BYTES, symbol);
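/* As above, but for destructors, which go in .fini_array sections.  */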
5372 static void
5373 aarch64_elf_asm_destructor (rtx symbol, int priority)
5375 if (priority == DEFAULT_INIT_PRIORITY)
5376 default_dtor_section_asm_out_destructor (symbol, priority);
5377 else
5379 section *s;
5380 char buf[18];
5381 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5382 s = get_section (buf, SECTION_WRITE, NULL);
5383 switch_to_section (s);
5384 assemble_align (POINTER_SIZE);
5385 assemble_aligned_integer (POINTER_BYTES, symbol);
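/* Output the assembly for a casesi jump-table dispatch: load the table
   entry selected by the index operand, convert it to an address relative to
   the table's label and branch to it.  OPERANDS are the operands of the
   dispatch pattern.  */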
5389 const char*
5390 aarch64_output_casesi (rtx *operands)
5392 char buf[100];
5393 char label[100];
5394 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5395 int index;
5396 static const char *const patterns[4][2] =
5399 "ldrb\t%w3, [%0,%w1,uxtw]",
5400 "add\t%3, %4, %w3, sxtb #2"
5403 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5404 "add\t%3, %4, %w3, sxth #2"
5407 "ldr\t%w3, [%0,%w1,uxtw #2]",
5408 "add\t%3, %4, %w3, sxtw #2"
5410 /* We assume that DImode is only generated when not optimizing and
5411 that we don't really need 64-bit address offsets. That would
5412 imply an object file with 8GB of code in a single function! */
5414 "ldr\t%w3, [%0,%w1,uxtw #2]",
5415 "add\t%3, %4, %w3, sxtw #2"
5419 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5421 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5423 gcc_assert (index >= 0 && index <= 3);
5425 /* Need to implement table size reduction, by changing the code below. */
5426 output_asm_insn (patterns[index][0], operands);
5427 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5428 snprintf (buf, sizeof (buf),
5429 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5430 output_asm_insn (buf, operands);
5431 output_asm_insn (patterns[index][1], operands);
5432 output_asm_insn ("br\t%3", operands);
5433 assemble_label (asm_out_file, label);
5434 return "";
5438 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5439 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5440 operator. */
5443 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5445 if (shift >= 0 && shift <= 3)
5447 int size;
5448 for (size = 8; size <= 32; size *= 2)
5450 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5451 if (mask == bits << shift)
5452 return size;
5455 return 0;
5458 /* Constant pools are per-function only when PC-relative
5459 literal loads are enabled or we are in the large memory
5460 model. */
5462 static inline bool
5463 aarch64_can_use_per_function_literal_pools_p (void)
5465 return (aarch64_pcrelative_literal_loads
5466 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5469 static bool
5470 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5472 /* FIXME: In an ideal world this would work similarly
5473 to the logic in aarch64_select_rtx_section, but this
5474 breaks bootstrap in gccgo. For now we work around
5475 this by returning false here. */
5476 return false;
5479 /* Select appropriate section for constants depending
5480 on where we place literal pools. */
5482 static section *
5483 aarch64_select_rtx_section (machine_mode mode,
5484 rtx x,
5485 unsigned HOST_WIDE_INT align)
5487 if (aarch64_can_use_per_function_literal_pools_p ())
5488 return function_section (current_function_decl);
5490 return default_elf_select_rtx_section (mode, x, align);
5493 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5494 void
5495 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5496 HOST_WIDE_INT offset)
5498 /* When using per-function literal pools, we must ensure that any code
5499 section is aligned to the minimal instruction length, lest we get
5500 errors from the assembler re "unaligned instructions". */
5501 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5502 ASM_OUTPUT_ALIGN (f, 2);
5505 /* Costs. */
5507 /* Helper function for rtx cost calculation. Strip a shift expression
5508 from X. Returns the inner operand if successful, or the original
5509 expression on failure. */
5510 static rtx
5511 aarch64_strip_shift (rtx x)
5513 rtx op = x;
5515 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5516 we can convert both to ROR during final output. */
5517 if ((GET_CODE (op) == ASHIFT
5518 || GET_CODE (op) == ASHIFTRT
5519 || GET_CODE (op) == LSHIFTRT
5520 || GET_CODE (op) == ROTATERT
5521 || GET_CODE (op) == ROTATE)
5522 && CONST_INT_P (XEXP (op, 1)))
5523 return XEXP (op, 0);
5525 if (GET_CODE (op) == MULT
5526 && CONST_INT_P (XEXP (op, 1))
5527 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5528 return XEXP (op, 0);
5530 return x;
5533 /* Helper function for rtx cost calculation. Strip an extend
5534 expression from X. Returns the inner operand if successful, or the
5535 original expression on failure. We deal with a number of possible
5536 canonicalization variations here. */
5537 static rtx
5538 aarch64_strip_extend (rtx x)
5540 rtx op = x;
5542 /* Zero and sign extraction of a widened value. */
5543 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5544 && XEXP (op, 2) == const0_rtx
5545 && GET_CODE (XEXP (op, 0)) == MULT
5546 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5547 XEXP (op, 1)))
5548 return XEXP (XEXP (op, 0), 0);
5550 /* It can also be represented (for zero-extend) as an AND with an
5551 immediate. */
5552 if (GET_CODE (op) == AND
5553 && GET_CODE (XEXP (op, 0)) == MULT
5554 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5555 && CONST_INT_P (XEXP (op, 1))
5556 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5557 INTVAL (XEXP (op, 1))) != 0)
5558 return XEXP (XEXP (op, 0), 0);
5560 /* Now handle extended register, as this may also have an optional
5561 left shift by 1..4. */
5562 if (GET_CODE (op) == ASHIFT
5563 && CONST_INT_P (XEXP (op, 1))
5564 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5565 op = XEXP (op, 0);
5567 if (GET_CODE (op) == ZERO_EXTEND
5568 || GET_CODE (op) == SIGN_EXTEND)
5569 op = XEXP (op, 0);
5571 if (op != x)
5572 return op;
5574 return x;
5577 /* Return true iff CODE is a shift supported in combination
5578 with arithmetic instructions. */
5580 static bool
5581 aarch64_shift_p (enum rtx_code code)
5583 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5586 /* Helper function for rtx cost calculation. Calculate the cost of
5587 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5588 Return the calculated cost of the expression, recursing manually into
5589 operands where needed. */
5591 static int
5592 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5594 rtx op0, op1;
5595 const struct cpu_cost_table *extra_cost
5596 = aarch64_tune_params.insn_extra_cost;
5597 int cost = 0;
5598 bool compound_p = (outer == PLUS || outer == MINUS);
5599 machine_mode mode = GET_MODE (x);
5601 gcc_checking_assert (code == MULT);
5603 op0 = XEXP (x, 0);
5604 op1 = XEXP (x, 1);
5606 if (VECTOR_MODE_P (mode))
5607 mode = GET_MODE_INNER (mode);
5609 /* Integer multiply/fma. */
5610 if (GET_MODE_CLASS (mode) == MODE_INT)
5612 /* The multiply will be canonicalized as a shift, cost it as such. */
5613 if (aarch64_shift_p (GET_CODE (x))
5614 || (CONST_INT_P (op1)
5615 && exact_log2 (INTVAL (op1)) > 0))
5617 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5618 || GET_CODE (op0) == SIGN_EXTEND;
5619 if (speed)
5621 if (compound_p)
5623 if (REG_P (op1))
5624 /* ARITH + shift-by-register. */
5625 cost += extra_cost->alu.arith_shift_reg;
5626 else if (is_extend)
5627 /* ARITH + extended register. We don't have a cost field
5628 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5629 cost += extra_cost->alu.extend_arith;
5630 else
5631 /* ARITH + shift-by-immediate. */
5632 cost += extra_cost->alu.arith_shift;
5634 else
5635 /* LSL (immediate). */
5636 cost += extra_cost->alu.shift;
5639 /* Strip extends as we will have costed them in the case above. */
5640 if (is_extend)
5641 op0 = aarch64_strip_extend (op0);
5643 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5645 return cost;
5648 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5649 compound and let the below cases handle it. After all, MNEG is a
5650 special-case alias of MSUB. */
5651 if (GET_CODE (op0) == NEG)
5653 op0 = XEXP (op0, 0);
5654 compound_p = true;
5657 /* Integer multiplies or FMAs have zero/sign extending variants. */
5658 if ((GET_CODE (op0) == ZERO_EXTEND
5659 && GET_CODE (op1) == ZERO_EXTEND)
5660 || (GET_CODE (op0) == SIGN_EXTEND
5661 && GET_CODE (op1) == SIGN_EXTEND))
5663 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5664 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5666 if (speed)
5668 if (compound_p)
5669 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5670 cost += extra_cost->mult[0].extend_add;
5671 else
5672 /* MUL/SMULL/UMULL. */
5673 cost += extra_cost->mult[0].extend;
5676 return cost;
5679 /* This is either an integer multiply or a MADD. In both cases
5680 we want to recurse and cost the operands. */
5681 cost += rtx_cost (op0, mode, MULT, 0, speed);
5682 cost += rtx_cost (op1, mode, MULT, 1, speed);
5684 if (speed)
5686 if (compound_p)
5687 /* MADD/MSUB. */
5688 cost += extra_cost->mult[mode == DImode].add;
5689 else
5690 /* MUL. */
5691 cost += extra_cost->mult[mode == DImode].simple;
5694 return cost;
5696 else
5698 if (speed)
5700 /* Floating-point FMA/FMUL can also support negations of the
5701 operands, unless the rounding mode is upward or downward in
5702 which case FNMUL is different than FMUL with operand negation. */
5703 bool neg0 = GET_CODE (op0) == NEG;
5704 bool neg1 = GET_CODE (op1) == NEG;
5705 if (compound_p || !flag_rounding_math || (neg0 && neg1))
5707 if (neg0)
5708 op0 = XEXP (op0, 0);
5709 if (neg1)
5710 op1 = XEXP (op1, 0);
5713 if (compound_p)
5714 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5715 cost += extra_cost->fp[mode == DFmode].fma;
5716 else
5717 /* FMUL/FNMUL. */
5718 cost += extra_cost->fp[mode == DFmode].mult;
5721 cost += rtx_cost (op0, mode, MULT, 0, speed);
5722 cost += rtx_cost (op1, mode, MULT, 1, speed);
5723 return cost;
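/* Return the cost of address X when used to access an object of mode MODE,
   expressed in the units of the per-core address-cost tables.  */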
5727 static int
5728 aarch64_address_cost (rtx x,
5729 machine_mode mode,
5730 addr_space_t as ATTRIBUTE_UNUSED,
5731 bool speed)
5733 enum rtx_code c = GET_CODE (x);
5734 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5735 struct aarch64_address_info info;
5736 int cost = 0;
5737 info.shift = 0;
5739 if (!aarch64_classify_address (&info, x, mode, c, false))
5741 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5743 /* This is a CONST or SYMBOL ref which will be split
5744 in a different way depending on the code model in use.
5745 Cost it through the generic infrastructure. */
5746 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5747 /* Divide through by the cost of one instruction to
5748 bring it to the same units as the address costs. */
5749 cost_symbol_ref /= COSTS_N_INSNS (1);
5750 /* The cost is then the cost of preparing the address,
5751 followed by an immediate (possibly 0) offset. */
5752 return cost_symbol_ref + addr_cost->imm_offset;
5754 else
5756 /* This is most likely a jump table from a case
5757 statement. */
5758 return addr_cost->register_offset;
5762 switch (info.type)
5764 case ADDRESS_LO_SUM:
5765 case ADDRESS_SYMBOLIC:
5766 case ADDRESS_REG_IMM:
5767 cost += addr_cost->imm_offset;
5768 break;
5770 case ADDRESS_REG_WB:
5771 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5772 cost += addr_cost->pre_modify;
5773 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5774 cost += addr_cost->post_modify;
5775 else
5776 gcc_unreachable ();
5778 break;
5780 case ADDRESS_REG_REG:
5781 cost += addr_cost->register_offset;
5782 break;
5784 case ADDRESS_REG_SXTW:
5785 cost += addr_cost->register_sextend;
5786 break;
5788 case ADDRESS_REG_UXTW:
5789 cost += addr_cost->register_zextend;
5790 break;
5792 default:
5793 gcc_unreachable ();
5797 if (info.shift > 0)
5799 /* For the sake of calculating the cost of the shifted register
5800 component, we can treat same sized modes in the same way. */
5801 switch (GET_MODE_BITSIZE (mode))
5803 case 16:
5804 cost += addr_cost->addr_scale_costs.hi;
5805 break;
5807 case 32:
5808 cost += addr_cost->addr_scale_costs.si;
5809 break;
5811 case 64:
5812 cost += addr_cost->addr_scale_costs.di;
5813 break;
5815 /* We can't tell, or this is a 128-bit vector. */
5816 default:
5817 cost += addr_cost->addr_scale_costs.ti;
5818 break;
5822 return cost;
5825 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5826 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5827 to be taken. */
5830 aarch64_branch_cost (bool speed_p, bool predictable_p)
5832 /* When optimizing for speed, use the cost of unpredictable branches. */
5833 const struct cpu_branch_cost *branch_costs =
5834 aarch64_tune_params.branch_costs;
5836 if (!speed_p || predictable_p)
5837 return branch_costs->predictable;
5838 else
5839 return branch_costs->unpredictable;
5842 /* Return true if the RTX X in mode MODE is a zero or sign extract
5843 usable in an ADD or SUB (extended register) instruction. */
5844 static bool
5845 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5847 /* Catch add with a sign extract.
5848 This is add_<optab><mode>_multp2. */
5849 if (GET_CODE (x) == SIGN_EXTRACT
5850 || GET_CODE (x) == ZERO_EXTRACT)
5852 rtx op0 = XEXP (x, 0);
5853 rtx op1 = XEXP (x, 1);
5854 rtx op2 = XEXP (x, 2);
5856 if (GET_CODE (op0) == MULT
5857 && CONST_INT_P (op1)
5858 && op2 == const0_rtx
5859 && CONST_INT_P (XEXP (op0, 1))
5860 && aarch64_is_extend_from_extract (mode,
5861 XEXP (op0, 1),
5862 op1))
5864 return true;
5867 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5868 No shift. */
5869 else if (GET_CODE (x) == SIGN_EXTEND
5870 || GET_CODE (x) == ZERO_EXTEND)
5871 return REG_P (XEXP (x, 0));
5873 return false;
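/* Return true if UNSPEC code U is one of the FRINT rounding operations.  */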
5876 static bool
5877 aarch64_frint_unspec_p (unsigned int u)
5879 switch (u)
5881 case UNSPEC_FRINTZ:
5882 case UNSPEC_FRINTP:
5883 case UNSPEC_FRINTM:
5884 case UNSPEC_FRINTA:
5885 case UNSPEC_FRINTN:
5886 case UNSPEC_FRINTX:
5887 case UNSPEC_FRINTI:
5888 return true;
5890 default:
5891 return false;
5895 /* Return true iff X is an rtx that will match an extr instruction
5896 i.e. as described in the *extr<mode>5_insn family of patterns.
5897 OP0 and OP1 will be set to the operands of the shifts involved
5898 on success and will be NULL_RTX otherwise. */
5900 static bool
5901 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5903 rtx op0, op1;
5904 machine_mode mode = GET_MODE (x);
5906 *res_op0 = NULL_RTX;
5907 *res_op1 = NULL_RTX;
5909 if (GET_CODE (x) != IOR)
5910 return false;
5912 op0 = XEXP (x, 0);
5913 op1 = XEXP (x, 1);
5915 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5916 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5918 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5919 if (GET_CODE (op1) == ASHIFT)
5920 std::swap (op0, op1);
5922 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5923 return false;
5925 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5926 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5928 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5929 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5931 *res_op0 = XEXP (op0, 0);
5932 *res_op1 = XEXP (op1, 0);
5933 return true;
5937 return false;
5940 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5941 storing it in *COST. Result is true if the total cost of the operation
5942 has now been calculated. */
5943 static bool
5944 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5946 rtx inner;
5947 rtx comparator;
5948 enum rtx_code cmpcode;
5950 if (COMPARISON_P (op0))
5952 inner = XEXP (op0, 0);
5953 comparator = XEXP (op0, 1);
5954 cmpcode = GET_CODE (op0);
5956 else
5958 inner = op0;
5959 comparator = const0_rtx;
5960 cmpcode = NE;
5963 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5965 /* Conditional branch. */
5966 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5967 return true;
5968 else
5970 if (cmpcode == NE || cmpcode == EQ)
5972 if (comparator == const0_rtx)
5974 /* TBZ/TBNZ/CBZ/CBNZ. */
5975 if (GET_CODE (inner) == ZERO_EXTRACT)
5976 /* TBZ/TBNZ. */
5977 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
5978 ZERO_EXTRACT, 0, speed);
5979 else
5980 /* CBZ/CBNZ. */
5981 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
5983 return true;
5986 else if (cmpcode == LT || cmpcode == GE)
5988 /* TBZ/TBNZ. */
5989 if (comparator == const0_rtx)
5990 return true;
5994 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5996 /* CCMP. */
5997 if (GET_CODE (op1) == COMPARE)
5999 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6000 if (XEXP (op1, 1) == const0_rtx)
6001 *cost += 1;
6002 if (speed)
6004 machine_mode mode = GET_MODE (XEXP (op1, 0));
6005 const struct cpu_cost_table *extra_cost
6006 = aarch64_tune_params.insn_extra_cost;
6008 if (GET_MODE_CLASS (mode) == MODE_INT)
6009 *cost += extra_cost->alu.arith;
6010 else
6011 *cost += extra_cost->fp[mode == DFmode].compare;
6013 return true;
6016 /* It's a conditional operation based on the status flags,
6017 so it must be some flavor of CSEL. */
6019 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6020 if (GET_CODE (op1) == NEG
6021 || GET_CODE (op1) == NOT
6022 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6023 op1 = XEXP (op1, 0);
6024 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6026 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6027 op1 = XEXP (op1, 0);
6028 op2 = XEXP (op2, 0);
6031 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6032 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6033 return true;
6036 /* We don't know what this is, cost all operands. */
6037 return false;
6040 /* Check whether X is a bitfield operation of the form shift + extend that
6041 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6042 operand to which the bitfield operation is applied. Otherwise return
6043 NULL_RTX. */
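/* A minimal sketch of the shapes accepted (illustrative only):

     (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3)))   UBFX
     (sign_extend:SI (ashiftrt:HI (reg:HI x) (const_int 3)))   SBFX
     (zero_extend:SI (ashift:HI (reg:HI x) (const_int 3)))     UBFIZ

   In each case the operand returned would be (reg:HI x).  */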
6045 static rtx
6046 aarch64_extend_bitfield_pattern_p (rtx x)
6048 rtx_code outer_code = GET_CODE (x);
6049 machine_mode outer_mode = GET_MODE (x);
6051 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6052 && outer_mode != SImode && outer_mode != DImode)
6053 return NULL_RTX;
6055 rtx inner = XEXP (x, 0);
6056 rtx_code inner_code = GET_CODE (inner);
6057 machine_mode inner_mode = GET_MODE (inner);
6058 rtx op = NULL_RTX;
6060 switch (inner_code)
6062 case ASHIFT:
6063 if (CONST_INT_P (XEXP (inner, 1))
6064 && (inner_mode == QImode || inner_mode == HImode))
6065 op = XEXP (inner, 0);
6066 break;
6067 case LSHIFTRT:
6068 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6069 && (inner_mode == QImode || inner_mode == HImode))
6070 op = XEXP (inner, 0);
6071 break;
6072 case ASHIFTRT:
6073 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6074 && (inner_mode == QImode || inner_mode == HImode))
6075 op = XEXP (inner, 0);
6076 break;
6077 default:
6078 break;
6081 return op;
6084 /* Return true if the mask and a shift amount from an RTX of the form
6085 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6086 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
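/* Worked example (a sketch, SImode assumed): with SHFT_AMNT == 3 and
   MASK == 0x1f8 we have (MASK >> 3) + 1 == 0x40, a power of two, and the
   low three bits of MASK clear, so (x << 3) & 0x1f8 can become a single
   "ubfiz w0, w1, #3, #6".  */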
6088 bool
6089 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6091 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6092 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6093 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6094 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6097 /* Calculate the cost of calculating X, storing it in *COST. Result
6098 is true if the total cost of the operation has now been calculated. */
6099 static bool
6100 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6101 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6103 rtx op0, op1, op2;
6104 const struct cpu_cost_table *extra_cost
6105 = aarch64_tune_params.insn_extra_cost;
6106 int code = GET_CODE (x);
6108 /* By default, assume that everything has equivalent cost to the
6109 cheapest instruction. Any additional costs are applied as a delta
6110 above this default. */
6111 *cost = COSTS_N_INSNS (1);
6113 switch (code)
6115 case SET:
6116 /* The cost depends entirely on the operands to SET. */
6117 *cost = 0;
6118 op0 = SET_DEST (x);
6119 op1 = SET_SRC (x);
6121 switch (GET_CODE (op0))
6123 case MEM:
6124 if (speed)
6126 rtx address = XEXP (op0, 0);
6127 if (VECTOR_MODE_P (mode))
6128 *cost += extra_cost->ldst.storev;
6129 else if (GET_MODE_CLASS (mode) == MODE_INT)
6130 *cost += extra_cost->ldst.store;
6131 else if (mode == SFmode)
6132 *cost += extra_cost->ldst.storef;
6133 else if (mode == DFmode)
6134 *cost += extra_cost->ldst.stored;
6136 *cost +=
6137 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6138 0, speed));
6141 *cost += rtx_cost (op1, mode, SET, 1, speed);
6142 return true;
6144 case SUBREG:
6145 if (! REG_P (SUBREG_REG (op0)))
6146 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6148 /* Fall through. */
6149 case REG:
6150 /* The cost is one per vector-register copied. */
6151 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6153 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6154 / GET_MODE_SIZE (V4SImode);
6155 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6157 /* const0_rtx is in general free, but we will use an
6158 instruction to set a register to 0. */
6159 else if (REG_P (op1) || op1 == const0_rtx)
6161 /* The cost is 1 per register copied. */
6162 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6163 / UNITS_PER_WORD;
6164 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6166 else
6167 /* Cost is just the cost of the RHS of the set. */
6168 *cost += rtx_cost (op1, mode, SET, 1, speed);
6169 return true;
6171 case ZERO_EXTRACT:
6172 case SIGN_EXTRACT:
6173 /* Bit-field insertion. Strip any redundant widening of
6174 the RHS to meet the width of the target. */
6175 if (GET_CODE (op1) == SUBREG)
6176 op1 = SUBREG_REG (op1);
6177 if ((GET_CODE (op1) == ZERO_EXTEND
6178 || GET_CODE (op1) == SIGN_EXTEND)
6179 && CONST_INT_P (XEXP (op0, 1))
6180 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6181 >= INTVAL (XEXP (op0, 1))))
6182 op1 = XEXP (op1, 0);
6184 if (CONST_INT_P (op1))
6186 /* MOV immediate is assumed to always be cheap. */
6187 *cost = COSTS_N_INSNS (1);
6189 else
6191 /* BFM. */
6192 if (speed)
6193 *cost += extra_cost->alu.bfi;
6194 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6197 return true;
6199 default:
6200 /* We can't make sense of this, assume default cost. */
6201 *cost = COSTS_N_INSNS (1);
6202 return false;
6204 return false;
6206 case CONST_INT:
6207 /* If an instruction can incorporate a constant within the
6208 instruction, the instruction's expression avoids calling
6209 rtx_cost() on the constant. If rtx_cost() is called on a
6210 constant, then it is usually because the constant must be
6211 moved into a register by one or more instructions.
6213 The exception is constant 0, which can be expressed
6214 as XZR/WZR and is therefore free. The one case where constant 0 must
6215 still be costed is (set (reg) (const0_rtx)), since the move itself takes
6216 an instruction. However, we can catch that when we cost the SET, so
6217 we don't need to consider that here. */
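      /* As a rough illustration (not an exact model): a 16-bit constant
	 such as 0x1234 is a single MOVZ, whereas a wider constant such as
	 0x123456789abc typically needs a MOVZ plus two MOVKs and is
	 therefore costed as roughly three instructions.  */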
6218 if (x == const0_rtx)
6219 *cost = 0;
6220 else
6222 /* To an approximation, building any other constant is
6223 proportional in cost to the number of instructions
6224 required to build that constant. This is true whether we
6225 are compiling for SPEED or otherwise. */
6226 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6227 (NULL_RTX, x, false, mode));
6229 return true;
6231 case CONST_DOUBLE:
6232 if (speed)
6234 /* mov[df,sf]_aarch64. */
6235 if (aarch64_float_const_representable_p (x))
6236 /* FMOV (scalar immediate). */
6237 *cost += extra_cost->fp[mode == DFmode].fpconst;
6238 else if (!aarch64_float_const_zero_rtx_p (x))
6240 /* This will be a load from memory. */
6241 if (mode == DFmode)
6242 *cost += extra_cost->ldst.loadd;
6243 else
6244 *cost += extra_cost->ldst.loadf;
6246 else
6247 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6248 or MOV v0.s[0], wzr - neither of which is modeled by the
6249 cost tables. Just use the default cost. */
6254 return true;
6256 case MEM:
6257 if (speed)
6259 /* For loads we want the base cost of a load, plus an
6260 approximation for the additional cost of the addressing
6261 mode. */
6262 rtx address = XEXP (x, 0);
6263 if (VECTOR_MODE_P (mode))
6264 *cost += extra_cost->ldst.loadv;
6265 else if (GET_MODE_CLASS (mode) == MODE_INT)
6266 *cost += extra_cost->ldst.load;
6267 else if (mode == SFmode)
6268 *cost += extra_cost->ldst.loadf;
6269 else if (mode == DFmode)
6270 *cost += extra_cost->ldst.loadd;
6272 *cost +=
6273 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6274 0, speed));
6277 return true;
6279 case NEG:
6280 op0 = XEXP (x, 0);
6282 if (VECTOR_MODE_P (mode))
6284 if (speed)
6286 /* FNEG. */
6287 *cost += extra_cost->vect.alu;
6289 return false;
6292 if (GET_MODE_CLASS (mode) == MODE_INT)
6294 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6295 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6297 /* CSETM. */
6298 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6299 return true;
6302 /* Cost this as SUB wzr, X. */
6303 op0 = CONST0_RTX (mode);
6304 op1 = XEXP (x, 0);
6305 goto cost_minus;
6308 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6310 /* Support (neg(fma...)) as a single instruction only if
6311 sign of zeros is unimportant. This matches the decision
6312 making in aarch64.md. */
6313 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6315 /* FNMADD. */
6316 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6317 return true;
6319 if (GET_CODE (op0) == MULT)
6321 /* FNMUL. */
6322 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6323 return true;
6325 if (speed)
6326 /* FNEG. */
6327 *cost += extra_cost->fp[mode == DFmode].neg;
6328 return false;
6331 return false;
6333 case CLRSB:
6334 case CLZ:
6335 if (speed)
6337 if (VECTOR_MODE_P (mode))
6338 *cost += extra_cost->vect.alu;
6339 else
6340 *cost += extra_cost->alu.clz;
6343 return false;
6345 case COMPARE:
6346 op0 = XEXP (x, 0);
6347 op1 = XEXP (x, 1);
6349 if (op1 == const0_rtx
6350 && GET_CODE (op0) == AND)
6352 x = op0;
6353 mode = GET_MODE (op0);
6354 goto cost_logic;
6357 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6359 /* TODO: A write to the CC flags possibly costs extra, this
6360 needs encoding in the cost tables. */
6362 mode = GET_MODE (op0);
6363 /* ANDS. */
6364 if (GET_CODE (op0) == AND)
6366 x = op0;
6367 goto cost_logic;
6370 if (GET_CODE (op0) == PLUS)
6372 /* ADDS (and CMN alias). */
6373 x = op0;
6374 goto cost_plus;
6377 if (GET_CODE (op0) == MINUS)
6379 /* SUBS. */
6380 x = op0;
6381 goto cost_minus;
6384 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6385 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6386 && CONST_INT_P (XEXP (op0, 2)))
6388 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6389 Handle it here directly rather than going to cost_logic
6390 since we know the immediate generated for the TST is valid
6391 so we can avoid creating an intermediate rtx for it only
6392 for costing purposes. */
6393 if (speed)
6394 *cost += extra_cost->alu.logical;
6396 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6397 ZERO_EXTRACT, 0, speed);
6398 return true;
6401 if (GET_CODE (op1) == NEG)
6403 /* CMN. */
6404 if (speed)
6405 *cost += extra_cost->alu.arith;
6407 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6408 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6409 return true;
6412 /* CMP.
6414 Compare can freely swap the order of operands, and
6415 canonicalization puts the more complex operation first.
6416 But the integer MINUS logic expects the shift/extend
6417 operation in op1. */
6418 if (! (REG_P (op0)
6419 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6421 op0 = XEXP (x, 1);
6422 op1 = XEXP (x, 0);
6424 goto cost_minus;
6427 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6429 /* FCMP. */
6430 if (speed)
6431 *cost += extra_cost->fp[mode == DFmode].compare;
6433 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6435 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6436 /* FCMP supports constant 0.0 for no extra cost. */
6437 return true;
6439 return false;
6442 if (VECTOR_MODE_P (mode))
6444 /* Vector compare. */
6445 if (speed)
6446 *cost += extra_cost->vect.alu;
6448 if (aarch64_float_const_zero_rtx_p (op1))
6450 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6451 cost. */
6452 return true;
6454 return false;
6456 return false;
6458 case MINUS:
6460 op0 = XEXP (x, 0);
6461 op1 = XEXP (x, 1);
6463 cost_minus:
6464 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6466 /* Detect valid immediates. */
6467 if ((GET_MODE_CLASS (mode) == MODE_INT
6468 || (GET_MODE_CLASS (mode) == MODE_CC
6469 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6470 && CONST_INT_P (op1)
6471 && aarch64_uimm12_shift (INTVAL (op1)))
6473 if (speed)
6474 /* SUB(S) (immediate). */
6475 *cost += extra_cost->alu.arith;
6476 return true;
6479 /* Look for SUB (extended register). */
6480 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6482 if (speed)
6483 *cost += extra_cost->alu.extend_arith;
6485 op1 = aarch64_strip_extend (op1);
6486 *cost += rtx_cost (op1, VOIDmode,
6487 (enum rtx_code) GET_CODE (op1), 0, speed);
6488 return true;
6491 rtx new_op1 = aarch64_strip_extend (op1);
6493 /* Cost this as an FMA-alike operation. */
6494 if ((GET_CODE (new_op1) == MULT
6495 || aarch64_shift_p (GET_CODE (new_op1)))
6496 && code != COMPARE)
6498 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6499 (enum rtx_code) code,
6500 speed);
6501 return true;
6504 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6506 if (speed)
6508 if (VECTOR_MODE_P (mode))
6510 /* Vector SUB. */
6511 *cost += extra_cost->vect.alu;
6513 else if (GET_MODE_CLASS (mode) == MODE_INT)
6515 /* SUB(S). */
6516 *cost += extra_cost->alu.arith;
6518 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6520 /* FSUB. */
6521 *cost += extra_cost->fp[mode == DFmode].addsub;
6524 return true;
6527 case PLUS:
6529 rtx new_op0;
6531 op0 = XEXP (x, 0);
6532 op1 = XEXP (x, 1);
6534 cost_plus:
6535 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6536 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6538 /* CSINC. */
6539 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6540 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6541 return true;
6544 if (GET_MODE_CLASS (mode) == MODE_INT
6545 && CONST_INT_P (op1)
6546 && aarch64_uimm12_shift (INTVAL (op1)))
6548 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6550 if (speed)
6551 /* ADD (immediate). */
6552 *cost += extra_cost->alu.arith;
6553 return true;
6556 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6558 /* Look for ADD (extended register). */
6559 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6561 if (speed)
6562 *cost += extra_cost->alu.extend_arith;
6564 op0 = aarch64_strip_extend (op0);
6565 *cost += rtx_cost (op0, VOIDmode,
6566 (enum rtx_code) GET_CODE (op0), 0, speed);
6567 return true;
6570 /* Strip any extend, leave shifts behind as we will
6571 cost them through mult_cost. */
6572 new_op0 = aarch64_strip_extend (op0);
6574 if (GET_CODE (new_op0) == MULT
6575 || aarch64_shift_p (GET_CODE (new_op0)))
6577 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6578 speed);
6579 return true;
6582 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6584 if (speed)
6586 if (VECTOR_MODE_P (mode))
6588 /* Vector ADD. */
6589 *cost += extra_cost->vect.alu;
6591 else if (GET_MODE_CLASS (mode) == MODE_INT)
6593 /* ADD. */
6594 *cost += extra_cost->alu.arith;
6596 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6598 /* FADD. */
6599 *cost += extra_cost->fp[mode == DFmode].addsub;
6602 return true;
6605 case BSWAP:
6606 *cost = COSTS_N_INSNS (1);
6608 if (speed)
6610 if (VECTOR_MODE_P (mode))
6611 *cost += extra_cost->vect.alu;
6612 else
6613 *cost += extra_cost->alu.rev;
6615 return false;
6617 case IOR:
6618 if (aarch_rev16_p (x))
6620 *cost = COSTS_N_INSNS (1);
6622 if (speed)
6624 if (VECTOR_MODE_P (mode))
6625 *cost += extra_cost->vect.alu;
6626 else
6627 *cost += extra_cost->alu.rev;
6629 return true;
6632 if (aarch64_extr_rtx_p (x, &op0, &op1))
6634 *cost += rtx_cost (op0, mode, IOR, 0, speed);
6635 *cost += rtx_cost (op1, mode, IOR, 1, speed);
6636 if (speed)
6637 *cost += extra_cost->alu.shift;
6639 return true;
6641 /* Fall through. */
6642 case XOR:
6643 case AND:
6644 cost_logic:
6645 op0 = XEXP (x, 0);
6646 op1 = XEXP (x, 1);
6648 if (VECTOR_MODE_P (mode))
6650 if (speed)
6651 *cost += extra_cost->vect.alu;
6652 return true;
6655 if (code == AND
6656 && GET_CODE (op0) == MULT
6657 && CONST_INT_P (XEXP (op0, 1))
6658 && CONST_INT_P (op1)
6659 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6660 INTVAL (op1)) != 0)
6662 /* This is a UBFM/SBFM. */
6663 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6664 if (speed)
6665 *cost += extra_cost->alu.bfx;
6666 return true;
6669 if (GET_MODE_CLASS (mode) == MODE_INT)
6671 if (CONST_INT_P (op1))
6673 /* We have a mask + shift version of a UBFIZ
6674 i.e. the *andim_ashift<mode>_bfiz pattern. */
6675 if (GET_CODE (op0) == ASHIFT
6676 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
6677 XEXP (op0, 1)))
6679 *cost += rtx_cost (XEXP (op0, 0), mode,
6680 (enum rtx_code) code, 0, speed);
6681 if (speed)
6682 *cost += extra_cost->alu.bfx;
6684 return true;
6686 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
6688 /* We possibly get the immediate for free; this is not
6689 modelled. */
6690 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6691 if (speed)
6692 *cost += extra_cost->alu.logical;
6694 return true;
6697 else
6699 rtx new_op0 = op0;
6701 /* Handle ORN, EON, or BIC. */
6702 if (GET_CODE (op0) == NOT)
6703 op0 = XEXP (op0, 0);
6705 new_op0 = aarch64_strip_shift (op0);
6707 /* If we had a shift on op0 then this is a logical-shift-
6708 by-register/immediate operation. Otherwise, this is just
6709 a logical operation. */
6710 if (speed)
6712 if (new_op0 != op0)
6714 /* Shift by immediate. */
6715 if (CONST_INT_P (XEXP (op0, 1)))
6716 *cost += extra_cost->alu.log_shift;
6717 else
6718 *cost += extra_cost->alu.log_shift_reg;
6720 else
6721 *cost += extra_cost->alu.logical;
6724 /* In both cases we want to cost both operands. */
6725 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6726 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6728 return true;
6731 return false;
6733 case NOT:
6734 x = XEXP (x, 0);
6735 op0 = aarch64_strip_shift (x);
6737 if (VECTOR_MODE_P (mode))
6739 /* Vector NOT. */
6740 *cost += extra_cost->vect.alu;
6741 return false;
6744 /* MVN-shifted-reg. */
6745 if (op0 != x)
6747 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6749 if (speed)
6750 *cost += extra_cost->alu.log_shift;
6752 return true;
6754 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6755 Handle the second form here, taking care that 'a' in the above can
6756 be a shift. */
6757 else if (GET_CODE (op0) == XOR)
6759 rtx newop0 = XEXP (op0, 0);
6760 rtx newop1 = XEXP (op0, 1);
6761 rtx op0_stripped = aarch64_strip_shift (newop0);
6763 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6764 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6766 if (speed)
6768 if (op0_stripped != newop0)
6769 *cost += extra_cost->alu.log_shift;
6770 else
6771 *cost += extra_cost->alu.logical;
6774 return true;
6776 /* MVN. */
6777 if (speed)
6778 *cost += extra_cost->alu.logical;
6780 return false;
6782 case ZERO_EXTEND:
6784 op0 = XEXP (x, 0);
6785 /* If a value is written in SI mode, then zero extended to DI
6786 mode, the operation will in general be free as a write to
6787 a 'w' register implicitly zeroes the upper bits of an 'x'
6788 register. However, if this is
6790 (set (reg) (zero_extend (reg)))
6792 we must cost the explicit register move. */
6793 if (mode == DImode
6794 && GET_MODE (op0) == SImode
6795 && outer == SET)
6797 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6799 /* If OP_COST is non-zero, then the cost of the zero extend
6800 is effectively the cost of the inner operation. Otherwise
6801 we have a MOV instruction and we take the cost from the MOV
6802 itself. This is true independently of whether we are
6803 optimizing for space or time. */
6804 if (op_cost)
6805 *cost = op_cost;
6807 return true;
6809 else if (MEM_P (op0))
6811 /* All loads can zero extend to any size for free. */
6812 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6813 return true;
6816 op0 = aarch64_extend_bitfield_pattern_p (x);
6817 if (op0)
6819 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
6820 if (speed)
6821 *cost += extra_cost->alu.bfx;
6822 return true;
6825 if (speed)
6827 if (VECTOR_MODE_P (mode))
6829 /* UMOV. */
6830 *cost += extra_cost->vect.alu;
6832 else
6834 /* We generate an AND instead of UXTB/UXTH. */
6835 *cost += extra_cost->alu.logical;
6838 return false;
6840 case SIGN_EXTEND:
6841 if (MEM_P (XEXP (x, 0)))
6843 /* LDRSH. */
6844 if (speed)
6846 rtx address = XEXP (XEXP (x, 0), 0);
6847 *cost += extra_cost->ldst.load_sign_extend;
6849 *cost +=
6850 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6851 0, speed));
6853 return true;
6856 op0 = aarch64_extend_bitfield_pattern_p (x);
6857 if (op0)
6859 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
6860 if (speed)
6861 *cost += extra_cost->alu.bfx;
6862 return true;
6865 if (speed)
6867 if (VECTOR_MODE_P (mode))
6868 *cost += extra_cost->vect.alu;
6869 else
6870 *cost += extra_cost->alu.extend;
6872 return false;
6874 case ASHIFT:
6875 op0 = XEXP (x, 0);
6876 op1 = XEXP (x, 1);
6878 if (CONST_INT_P (op1))
6880 if (speed)
6882 if (VECTOR_MODE_P (mode))
6884 /* Vector shift (immediate). */
6885 *cost += extra_cost->vect.alu;
6887 else
6889 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6890 aliases. */
6891 *cost += extra_cost->alu.shift;
6895 /* We can incorporate zero/sign extend for free. */
6896 if (GET_CODE (op0) == ZERO_EXTEND
6897 || GET_CODE (op0) == SIGN_EXTEND)
6898 op0 = XEXP (op0, 0);
6900 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
6901 return true;
6903 else
6905 if (speed)
6907 if (VECTOR_MODE_P (mode))
6909 /* Vector shift (register). */
6910 *cost += extra_cost->vect.alu;
6912 else
6914 /* LSLV. */
6915 *cost += extra_cost->alu.shift_reg;
6918 return false; /* All arguments need to be in registers. */
6921 case ROTATE:
6922 case ROTATERT:
6923 case LSHIFTRT:
6924 case ASHIFTRT:
6925 op0 = XEXP (x, 0);
6926 op1 = XEXP (x, 1);
6928 if (CONST_INT_P (op1))
6930 /* ASR (immediate) and friends. */
6931 if (speed)
6933 if (VECTOR_MODE_P (mode))
6934 *cost += extra_cost->vect.alu;
6935 else
6936 *cost += extra_cost->alu.shift;
6939 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6940 return true;
6942 else
6945 /* ASR (register) and friends. */
6946 if (speed)
6948 if (VECTOR_MODE_P (mode))
6949 *cost += extra_cost->vect.alu;
6950 else
6951 *cost += extra_cost->alu.shift_reg;
6953 return false; /* All arguments need to be in registers. */
6956 case SYMBOL_REF:
6958 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
6959 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
6961 /* LDR. */
6962 if (speed)
6963 *cost += extra_cost->ldst.load;
6965 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6966 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6968 /* ADRP, followed by ADD. */
6969 *cost += COSTS_N_INSNS (1);
6970 if (speed)
6971 *cost += 2 * extra_cost->alu.arith;
6973 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6974 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6976 /* ADR. */
6977 if (speed)
6978 *cost += extra_cost->alu.arith;
6981 if (flag_pic)
6983 /* One extra load instruction, after accessing the GOT. */
6984 *cost += COSTS_N_INSNS (1);
6985 if (speed)
6986 *cost += extra_cost->ldst.load;
6988 return true;
6990 case HIGH:
6991 case LO_SUM:
6992 /* ADRP/ADD (immediate). */
6993 if (speed)
6994 *cost += extra_cost->alu.arith;
6995 return true;
6997 case ZERO_EXTRACT:
6998 case SIGN_EXTRACT:
6999 /* UBFX/SBFX. */
7000 if (speed)
7002 if (VECTOR_MODE_P (mode))
7003 *cost += extra_cost->vect.alu;
7004 else
7005 *cost += extra_cost->alu.bfx;
7008 /* We can trust that the immediates used will be correct (there
7009 are no by-register forms), so we need only cost op0. */
7010 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7011 return true;
7013 case MULT:
7014 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7015 /* aarch64_rtx_mult_cost always handles recursion to its
7016 operands. */
7017 return true;
7019 case MOD:
7020 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7021 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
7022 an unconditional negate. This case should only ever be reached through
7023 the set_smod_pow2_cheap check in expmed.c. */
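      /* One plausible sequence matching that count (a sketch for x % 8 in
	 SImode; the exact registers and condition GCC picks may differ):

	   negs  w1, w0
	   and   w0, w0, 7
	   and   w1, w1, 7
	   csneg w0, w0, w1, mi  */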
7024 if (CONST_INT_P (XEXP (x, 1))
7025 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7026 && (mode == SImode || mode == DImode))
7028 /* We expand to 4 instructions. Reset the baseline. */
7029 *cost = COSTS_N_INSNS (4);
7031 if (speed)
7032 *cost += 2 * extra_cost->alu.logical
7033 + 2 * extra_cost->alu.arith;
7035 return true;
7038 /* Fall-through. */
7039 case UMOD:
7040 if (speed)
7042 if (VECTOR_MODE_P (mode))
7043 *cost += extra_cost->vect.alu;
7044 else if (GET_MODE_CLASS (mode) == MODE_INT)
7045 *cost += (extra_cost->mult[mode == DImode].add
7046 + extra_cost->mult[mode == DImode].idiv);
7047 else if (mode == DFmode)
7048 *cost += (extra_cost->fp[1].mult
7049 + extra_cost->fp[1].div);
7050 else if (mode == SFmode)
7051 *cost += (extra_cost->fp[0].mult
7052 + extra_cost->fp[0].div);
7054 return false; /* All arguments need to be in registers. */
7056 case DIV:
7057 case UDIV:
7058 case SQRT:
7059 if (speed)
7061 if (VECTOR_MODE_P (mode))
7062 *cost += extra_cost->vect.alu;
7063 else if (GET_MODE_CLASS (mode) == MODE_INT)
7064 /* There is no integer SQRT, so only DIV and UDIV can get
7065 here. */
7066 *cost += extra_cost->mult[mode == DImode].idiv;
7067 else
7068 *cost += extra_cost->fp[mode == DFmode].div;
7070 return false; /* All arguments need to be in registers. */
7072 case IF_THEN_ELSE:
7073 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7074 XEXP (x, 2), cost, speed);
7076 case EQ:
7077 case NE:
7078 case GT:
7079 case GTU:
7080 case LT:
7081 case LTU:
7082 case GE:
7083 case GEU:
7084 case LE:
7085 case LEU:
7087 return false; /* All arguments must be in registers. */
7089 case FMA:
7090 op0 = XEXP (x, 0);
7091 op1 = XEXP (x, 1);
7092 op2 = XEXP (x, 2);
7094 if (speed)
7096 if (VECTOR_MODE_P (mode))
7097 *cost += extra_cost->vect.alu;
7098 else
7099 *cost += extra_cost->fp[mode == DFmode].fma;
7102 /* FMSUB, FNMADD, and FNMSUB are free. */
7103 if (GET_CODE (op0) == NEG)
7104 op0 = XEXP (op0, 0);
7106 if (GET_CODE (op2) == NEG)
7107 op2 = XEXP (op2, 0);
7109 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7110 and the by-element operand as operand 0. */
7111 if (GET_CODE (op1) == NEG)
7112 op1 = XEXP (op1, 0);
7114 /* Catch vector-by-element operations. The by-element operand can
7115 either be (vec_duplicate (vec_select (x))) or just
7116 (vec_select (x)), depending on whether we are multiplying by
7117 a vector or a scalar.
7119 Canonicalization is not very good in these cases: FMA4 will put the
7120 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7121 if (GET_CODE (op0) == VEC_DUPLICATE)
7122 op0 = XEXP (op0, 0);
7123 else if (GET_CODE (op1) == VEC_DUPLICATE)
7124 op1 = XEXP (op1, 0);
7126 if (GET_CODE (op0) == VEC_SELECT)
7127 op0 = XEXP (op0, 0);
7128 else if (GET_CODE (op1) == VEC_SELECT)
7129 op1 = XEXP (op1, 0);
7131 /* If the remaining parameters are not registers,
7132 get the cost to put them into registers. */
7133 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7134 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7135 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7136 return true;
7138 case FLOAT:
7139 case UNSIGNED_FLOAT:
7140 if (speed)
7141 *cost += extra_cost->fp[mode == DFmode].fromint;
7142 return false;
7144 case FLOAT_EXTEND:
7145 if (speed)
7147 if (VECTOR_MODE_P (mode))
7149 /* Vector widening conversion. */
7150 *cost += extra_cost->vect.alu;
7152 else
7153 *cost += extra_cost->fp[mode == DFmode].widen;
7155 return false;
7157 case FLOAT_TRUNCATE:
7158 if (speed)
7160 if (VECTOR_MODE_P (mode))
7162 /* Vector narrowing conversion. */
7163 *cost += extra_cost->vect.alu;
7165 else
7166 *cost += extra_cost->fp[mode == DFmode].narrow;
7168 return false;
7170 case FIX:
7171 case UNSIGNED_FIX:
7172 x = XEXP (x, 0);
7173 /* Strip the rounding part. They will all be implemented
7174 by the fcvt* family of instructions anyway. */
7175 if (GET_CODE (x) == UNSPEC)
7177 unsigned int uns_code = XINT (x, 1);
7179 if (uns_code == UNSPEC_FRINTA
7180 || uns_code == UNSPEC_FRINTM
7181 || uns_code == UNSPEC_FRINTN
7182 || uns_code == UNSPEC_FRINTP
7183 || uns_code == UNSPEC_FRINTZ)
7184 x = XVECEXP (x, 0, 0);
7187 if (speed)
7189 if (VECTOR_MODE_P (mode))
7190 *cost += extra_cost->vect.alu;
7191 else
7192 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7195 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7196 fixed-point fcvt. */
7197 if (GET_CODE (x) == MULT
7198 && ((VECTOR_MODE_P (mode)
7199 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7200 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7202 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7203 0, speed);
7204 return true;
7207 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7208 return true;
7210 case ABS:
7211 if (VECTOR_MODE_P (mode))
7213 /* ABS (vector). */
7214 if (speed)
7215 *cost += extra_cost->vect.alu;
7217 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7219 op0 = XEXP (x, 0);
7221 /* FABD, which is analogous to FADD. */
7222 if (GET_CODE (op0) == MINUS)
7224 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7225 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7226 if (speed)
7227 *cost += extra_cost->fp[mode == DFmode].addsub;
7229 return true;
7231 /* Simple FABS is analogous to FNEG. */
7232 if (speed)
7233 *cost += extra_cost->fp[mode == DFmode].neg;
7235 else
7237 /* Integer ABS will either be split into
7238 two arithmetic instructions, or will be an ABS
7239 (scalar), which we don't model. */
7240 *cost = COSTS_N_INSNS (2);
7241 if (speed)
7242 *cost += 2 * extra_cost->alu.arith;
7244 return false;
7246 case SMAX:
7247 case SMIN:
7248 if (speed)
7250 if (VECTOR_MODE_P (mode))
7251 *cost += extra_cost->vect.alu;
7252 else
7254 /* FMAXNM/FMINNM/FMAX/FMIN.
7255 TODO: This may not be accurate for all implementations, but
7256 we do not model this in the cost tables. */
7257 *cost += extra_cost->fp[mode == DFmode].addsub;
7260 return false;
7262 case UNSPEC:
7263 /* The floating point round to integer frint* instructions. */
7264 if (aarch64_frint_unspec_p (XINT (x, 1)))
7266 if (speed)
7267 *cost += extra_cost->fp[mode == DFmode].roundint;
7269 return false;
7272 if (XINT (x, 1) == UNSPEC_RBIT)
7274 if (speed)
7275 *cost += extra_cost->alu.rev;
7277 return false;
7279 break;
7281 case TRUNCATE:
7283 /* Decompose <su>muldi3_highpart. */
7284 if (/* (truncate:DI */
7285 mode == DImode
7286 /* (lshiftrt:TI */
7287 && GET_MODE (XEXP (x, 0)) == TImode
7288 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7289 /* (mult:TI */
7290 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7291 /* (ANY_EXTEND:TI (reg:DI))
7292 (ANY_EXTEND:TI (reg:DI))) */
7293 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7294 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7295 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7296 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7297 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7298 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7299 /* (const_int 64) */
7300 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7301 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7303 /* UMULH/SMULH. */
7304 if (speed)
7305 *cost += extra_cost->mult[mode == DImode].extend;
7306 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7307 mode, MULT, 0, speed);
7308 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7309 mode, MULT, 1, speed);
7310 return true;
7313 /* Fall through. */
7314 default:
7315 break;
7318 if (dump_file && (dump_flags & TDF_DETAILS))
7319 fprintf (dump_file,
7320 "\nFailed to cost RTX. Assuming default cost.\n");
7322 return true;
7325 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7326 calculated for X. This cost is stored in *COST. Returns true
7327 if the total cost of X was calculated. */
7328 static bool
7329 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7330 int param, int *cost, bool speed)
7332 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7334 if (dump_file && (dump_flags & TDF_DETAILS))
7336 print_rtl_single (dump_file, x);
7337 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7338 speed ? "Hot" : "Cold",
7339 *cost, result ? "final" : "partial");
7342 return result;
7345 static int
7346 aarch64_register_move_cost (machine_mode mode,
7347 reg_class_t from_i, reg_class_t to_i)
7349 enum reg_class from = (enum reg_class) from_i;
7350 enum reg_class to = (enum reg_class) to_i;
7351 const struct cpu_regmove_cost *regmove_cost
7352 = aarch64_tune_params.regmove_cost;
7354 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7355 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7356 to = GENERAL_REGS;
7358 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7359 from = GENERAL_REGS;
7361 /* The cost of moving between a GPR and the stack register is the same as GP2GP. */
7362 if ((from == GENERAL_REGS && to == STACK_REG)
7363 || (to == GENERAL_REGS && from == STACK_REG))
7364 return regmove_cost->GP2GP;
7366 /* To/From the stack register, we move via the gprs. */
7367 if (to == STACK_REG || from == STACK_REG)
7368 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7369 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7371 if (GET_MODE_SIZE (mode) == 16)
7373 /* 128-bit operations on general registers require 2 instructions. */
7374 if (from == GENERAL_REGS && to == GENERAL_REGS)
7375 return regmove_cost->GP2GP * 2;
7376 else if (from == GENERAL_REGS)
7377 return regmove_cost->GP2FP * 2;
7378 else if (to == GENERAL_REGS)
7379 return regmove_cost->FP2GP * 2;
7381 /* When AdvSIMD instructions are disabled it is not possible to move
7382 a 128-bit value directly between Q registers. This is handled in
7383 secondary reload. A general register is used as a scratch to move
7384 the upper DI value and the lower DI value is moved directly,
7385 hence the cost is the sum of three moves. */
7386 if (! TARGET_SIMD)
7387 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7389 return regmove_cost->FP2FP;
7392 if (from == GENERAL_REGS && to == GENERAL_REGS)
7393 return regmove_cost->GP2GP;
7394 else if (from == GENERAL_REGS)
7395 return regmove_cost->GP2FP;
7396 else if (to == GENERAL_REGS)
7397 return regmove_cost->FP2GP;
7399 return regmove_cost->FP2FP;
7402 static int
7403 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7404 reg_class_t rclass ATTRIBUTE_UNUSED,
7405 bool in ATTRIBUTE_UNUSED)
7407 return aarch64_tune_params.memmov_cost;
7410 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7411 to optimize 1.0/sqrt. */
7413 static bool
7414 use_rsqrt_p (machine_mode mode)
7416 return (!flag_trapping_math
7417 && flag_unsafe_math_optimizations
7418 && ((aarch64_tune_params.approx_modes->recip_sqrt
7419 & AARCH64_APPROX_MODE (mode))
7420 || flag_mrecip_low_precision_sqrt));
7423 /* Function to decide when to use the approximate reciprocal square root
7424 builtin. */
7426 static tree
7427 aarch64_builtin_reciprocal (tree fndecl)
7429 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7431 if (!use_rsqrt_p (mode))
7432 return NULL_TREE;
7433 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7436 typedef rtx (*rsqrte_type) (rtx, rtx);
7438 /* Select reciprocal square root initial estimate insn depending on machine
7439 mode. */
7441 static rsqrte_type
7442 get_rsqrte_type (machine_mode mode)
7444 switch (mode)
7446 case DFmode: return gen_aarch64_rsqrtedf;
7447 case SFmode: return gen_aarch64_rsqrtesf;
7448 case V2DFmode: return gen_aarch64_rsqrtev2df;
7449 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7450 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7451 default: gcc_unreachable ();
7455 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7457 /* Select reciprocal square root series step insn depending on machine mode. */
7459 static rsqrts_type
7460 get_rsqrts_type (machine_mode mode)
7462 switch (mode)
7464 case DFmode: return gen_aarch64_rsqrtsdf;
7465 case SFmode: return gen_aarch64_rsqrtssf;
7466 case V2DFmode: return gen_aarch64_rsqrtsv2df;
7467 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7468 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7469 default: gcc_unreachable ();
7473 /* Emit instruction sequence to compute either the approximate square root
7474 or its approximate reciprocal, depending on the flag RECP, and return
7475 whether the sequence was emitted or not. */
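/* A sketch of the math emitted below, relying on the architectural
   definition of FRSQRTS (a, b) as (3 - a * b) / 2:

     x0     = FRSQRTE (a)                        estimate of 1/sqrt (a)
     x(n+1) = xn * FRSQRTS (a, xn * xn)
            = xn * (3 - a * xn * xn) / 2         Newton-Raphson step

   For the square root itself the reciprocal estimate is finally multiplied
   by a, with a mask squashing the a == 0.0 case to 0.0.  */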
7477 bool
7478 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7480 machine_mode mode = GET_MODE (dst);
7482 if (GET_MODE_INNER (mode) == HFmode)
7483 return false;
7485 machine_mode mmsk = mode_for_vector
7486 (int_mode_for_mode (GET_MODE_INNER (mode)),
7487 GET_MODE_NUNITS (mode));
7488 bool use_approx_sqrt_p = (!recp
7489 && (flag_mlow_precision_sqrt
7490 || (aarch64_tune_params.approx_modes->sqrt
7491 & AARCH64_APPROX_MODE (mode))));
7492 bool use_approx_rsqrt_p = (recp
7493 && (flag_mrecip_low_precision_sqrt
7494 || (aarch64_tune_params.approx_modes->recip_sqrt
7495 & AARCH64_APPROX_MODE (mode))));
7497 if (!flag_finite_math_only
7498 || flag_trapping_math
7499 || !flag_unsafe_math_optimizations
7500 || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7501 || optimize_function_for_size_p (cfun))
7502 return false;
7504 rtx xmsk = gen_reg_rtx (mmsk);
7505 if (!recp)
7506 /* When calculating the approximate square root, compare the argument with
7507 0.0 and create a mask. */
7508 emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7509 CONST0_RTX (mode)))));
7511 /* Estimate the approximate reciprocal square root. */
7512 rtx xdst = gen_reg_rtx (mode);
7513 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7515 /* Iterate over the series twice for SF and thrice for DF. */
7516 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7518 /* Optionally iterate over the series once less for faster performance
7519 while sacrificing some accuracy. */
7520 if ((recp && flag_mrecip_low_precision_sqrt)
7521 || (!recp && flag_mlow_precision_sqrt))
7522 iterations--;
7524 /* Iterate over the series to calculate the approximate reciprocal square
7525 root. */
7526 rtx x1 = gen_reg_rtx (mode);
7527 while (iterations--)
7529 rtx x2 = gen_reg_rtx (mode);
7530 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7532 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7534 if (iterations > 0)
7535 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7538 if (!recp)
7540 /* Qualify the approximate reciprocal square root when the argument is
7541 0.0 by squashing the intermediate result to 0.0. */
7542 rtx xtmp = gen_reg_rtx (mmsk);
7543 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7544 gen_rtx_SUBREG (mmsk, xdst, 0)));
7545 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7547 /* Calculate the approximate square root. */
7548 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7551 /* Finalize the approximation. */
7552 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
7554 return true;
7557 typedef rtx (*recpe_type) (rtx, rtx);
7559 /* Select reciprocal initial estimate insn depending on machine mode. */
7561 static recpe_type
7562 get_recpe_type (machine_mode mode)
7564 switch (mode)
7566 case SFmode: return (gen_aarch64_frecpesf);
7567 case V2SFmode: return (gen_aarch64_frecpev2sf);
7568 case V4SFmode: return (gen_aarch64_frecpev4sf);
7569 case DFmode: return (gen_aarch64_frecpedf);
7570 case V2DFmode: return (gen_aarch64_frecpev2df);
7571 default: gcc_unreachable ();
7575 typedef rtx (*recps_type) (rtx, rtx, rtx);
7577 /* Select reciprocal series step insn depending on machine mode. */
7579 static recps_type
7580 get_recps_type (machine_mode mode)
7582 switch (mode)
7584 case SFmode: return (gen_aarch64_frecpssf);
7585 case V2SFmode: return (gen_aarch64_frecpsv2sf);
7586 case V4SFmode: return (gen_aarch64_frecpsv4sf);
7587 case DFmode: return (gen_aarch64_frecpsdf);
7588 case V2DFmode: return (gen_aarch64_frecpsv2df);
7589 default: gcc_unreachable ();
7593 /* Emit the instruction sequence to compute the approximation for the division
7594 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
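/* A sketch of the math emitted below, relying on the architectural
   definition of FRECPS (a, b) as 2 - a * b:

     x0     = FRECPE (den)                       estimate of 1/den
     x(n+1) = xn * FRECPS (den, xn)
            = xn * (2 - den * xn)                Newton-Raphson step

   The refined reciprocal is then multiplied by NUM unless NUM is 1.0.  */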
7596 bool
7597 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
7599 machine_mode mode = GET_MODE (quo);
7601 if (GET_MODE_INNER (mode) == HFmode)
7602 return false;
7604 bool use_approx_division_p = (flag_mlow_precision_div
7605 || (aarch64_tune_params.approx_modes->division
7606 & AARCH64_APPROX_MODE (mode)));
7608 if (!flag_finite_math_only
7609 || flag_trapping_math
7610 || !flag_unsafe_math_optimizations
7611 || optimize_function_for_size_p (cfun)
7612 || !use_approx_division_p)
7613 return false;
7615 /* Estimate the approximate reciprocal. */
7616 rtx xrcp = gen_reg_rtx (mode);
7617 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
7619 /* Iterate over the series twice for SF and thrice for DF. */
7620 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7622 /* Optionally iterate over the series once less for faster performance,
7623 while sacrificing some accuracy. */
7624 if (flag_mlow_precision_div)
7625 iterations--;
7627 /* Iterate over the series to calculate the approximate reciprocal. */
7628 rtx xtmp = gen_reg_rtx (mode);
7629 while (iterations--)
7631 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
7633 if (iterations > 0)
7634 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
7637 if (num != CONST1_RTX (mode))
7639 /* As the approximate reciprocal of DEN is already calculated, only
7640 calculate the approximate division when NUM is not 1.0. */
7641 rtx xnum = force_reg (mode, num);
7642 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
7645 /* Finalize the approximation. */
7646 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
7647 return true;
7650 /* Return the number of instructions that can be issued per cycle. */
7651 static int
7652 aarch64_sched_issue_rate (void)
7654 return aarch64_tune_params.issue_rate;
7657 static int
7658 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7660 int issue_rate = aarch64_sched_issue_rate ();
7662 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7666 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7667 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
7668 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
7670 static int
7671 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7672 int ready_index)
7674 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7678 /* Vectorizer cost model target hooks. */
7680 /* Implement targetm.vectorize.builtin_vectorization_cost. */
7681 static int
7682 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7683 tree vectype,
7684 int misalign ATTRIBUTE_UNUSED)
7686 unsigned elements;
7688 switch (type_of_cost)
7690 case scalar_stmt:
7691 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7693 case scalar_load:
7694 return aarch64_tune_params.vec_costs->scalar_load_cost;
7696 case scalar_store:
7697 return aarch64_tune_params.vec_costs->scalar_store_cost;
7699 case vector_stmt:
7700 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7702 case vector_load:
7703 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7705 case vector_store:
7706 return aarch64_tune_params.vec_costs->vec_store_cost;
7708 case vec_to_scalar:
7709 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7711 case scalar_to_vec:
7712 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7714 case unaligned_load:
7715 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7717 case unaligned_store:
7718 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7720 case cond_branch_taken:
7721 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7723 case cond_branch_not_taken:
7724 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7726 case vec_perm:
7727 return aarch64_tune_params.vec_costs->vec_permute_cost;
7729 case vec_promote_demote:
7730 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7732 case vec_construct:
7733 elements = TYPE_VECTOR_SUBPARTS (vectype);
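      /* E.g. a V4SI construction is assumed to cost 4 / 2 + 1 == 3
	 statements' worth of element inserts (a rough heuristic).  */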
7734 return elements / 2 + 1;
7736 default:
7737 gcc_unreachable ();
7741 /* Implement targetm.vectorize.add_stmt_cost. */
7742 static unsigned
7743 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7744 struct _stmt_vec_info *stmt_info, int misalign,
7745 enum vect_cost_model_location where)
7747 unsigned *cost = (unsigned *) data;
7748 unsigned retval = 0;
7750 if (flag_vect_cost_model)
7752 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7753 int stmt_cost =
7754 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7756 /* Statements in an inner loop relative to the loop being
7757 vectorized are weighted more heavily. The value here is
7758 arbitrary and could potentially be improved with analysis. */
7759 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7760 count *= 50; /* FIXME */
7762 retval = (unsigned) (count * stmt_cost);
7763 cost[where] += retval;
7766 return retval;
7769 static void initialize_aarch64_code_model (struct gcc_options *);
7771 /* Parse the TO_PARSE string and put the architecture struct that it
7772 selects into RES and the architectural features into ISA_FLAGS.
7773 Return an aarch64_parse_opt_result describing the parse result.
7774 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
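/* For example (a sketch; "armv8-a" and "crc" are assumed to be valid
   entries in all_architectures and the extension table): parsing
   "armv8-a+crc" splits at the first '+', matches "armv8-a" against
   all_architectures, and hands "+crc" to aarch64_parse_extension.  */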
7776 static enum aarch64_parse_opt_result
7777 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7778 unsigned long *isa_flags)
7780 char *ext;
7781 const struct processor *arch;
7782 char *str = (char *) alloca (strlen (to_parse) + 1);
7783 size_t len;
7785 strcpy (str, to_parse);
7787 ext = strchr (str, '+');
7789 if (ext != NULL)
7790 len = ext - str;
7791 else
7792 len = strlen (str);
7794 if (len == 0)
7795 return AARCH64_PARSE_MISSING_ARG;
7798 /* Loop through the list of supported ARCHes to find a match. */
7799 for (arch = all_architectures; arch->name != NULL; arch++)
7801 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7803 unsigned long isa_temp = arch->flags;
7805 if (ext != NULL)
7807 /* TO_PARSE string contains at least one extension. */
7808 enum aarch64_parse_opt_result ext_res
7809 = aarch64_parse_extension (ext, &isa_temp);
7811 if (ext_res != AARCH64_PARSE_OK)
7812 return ext_res;
7814 /* Extension parsing was successful. Confirm the result
7815 arch and ISA flags. */
7816 *res = arch;
7817 *isa_flags = isa_temp;
7818 return AARCH64_PARSE_OK;
7822 /* ARCH name not found in list. */
7823 return AARCH64_PARSE_INVALID_ARG;
7826 /* Parse the TO_PARSE string and put the result tuning in RES and the
7827 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
7828 describing the parse result. If there is an error parsing, RES and
7829 ISA_FLAGS are left unchanged. */
7831 static enum aarch64_parse_opt_result
7832 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7833 unsigned long *isa_flags)
7835 char *ext;
7836 const struct processor *cpu;
7837 char *str = (char *) alloca (strlen (to_parse) + 1);
7838 size_t len;
7840 strcpy (str, to_parse);
7842 ext = strchr (str, '+');
7844 if (ext != NULL)
7845 len = ext - str;
7846 else
7847 len = strlen (str);
7849 if (len == 0)
7850 return AARCH64_PARSE_MISSING_ARG;
7853 /* Loop through the list of supported CPUs to find a match. */
7854 for (cpu = all_cores; cpu->name != NULL; cpu++)
7856 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7858 unsigned long isa_temp = cpu->flags;
7861 if (ext != NULL)
7863 /* TO_PARSE string contains at least one extension. */
7864 enum aarch64_parse_opt_result ext_res
7865 = aarch64_parse_extension (ext, &isa_temp);
7867 if (ext_res != AARCH64_PARSE_OK)
7868 return ext_res;
7870 /* Extension parsing was successful. Confirm the result
7871 cpu and ISA flags. */
7872 *res = cpu;
7873 *isa_flags = isa_temp;
7874 return AARCH64_PARSE_OK;
7878 /* CPU name not found in list. */
7879 return AARCH64_PARSE_INVALID_ARG;
7882 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7883 Return an aarch64_parse_opt_result describing the parse result.
7884 If the parsing fails, RES is left unchanged. */
7886 static enum aarch64_parse_opt_result
7887 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7889 const struct processor *cpu;
7890 char *str = (char *) alloca (strlen (to_parse) + 1);
7892 strcpy (str, to_parse);
7894 /* Loop through the list of supported CPUs to find a match. */
7895 for (cpu = all_cores; cpu->name != NULL; cpu++)
7897 if (strcmp (cpu->name, str) == 0)
7899 *res = cpu;
7900 return AARCH64_PARSE_OK;
7904 /* CPU name not found in list. */
7905 return AARCH64_PARSE_INVALID_ARG;
7908 /* Parse TOKEN, which has length LENGTH, to see if it is an option
7909 described in FLAG. If it is, return the index bit for that fusion type.
7910 If not, error (printing OPTION_NAME) and return zero. */
7912 static unsigned int
7913 aarch64_parse_one_option_token (const char *token,
7914 size_t length,
7915 const struct aarch64_flag_desc *flag,
7916 const char *option_name)
7918 for (; flag->name != NULL; flag++)
7920 if (length == strlen (flag->name)
7921 && !strncmp (flag->name, token, length))
7922 return flag->flag;
7925 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7926 return 0;
7929 /* Parse OPTION which is a comma-separated list of flags to enable.
7930 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7931 default state we inherit from the CPU tuning structures. OPTION_NAME
7932 gives the top-level option we are parsing in the -moverride string,
7933 for use in error messages. */
7935 static unsigned int
7936 aarch64_parse_boolean_options (const char *option,
7937 const struct aarch64_flag_desc *flags,
7938 unsigned int initial_state,
7939 const char *option_name)
7941 const char separator = '.';
7942 const char* specs = option;
7943 const char* ntoken = option;
7944 unsigned int found_flags = initial_state;
7946 while ((ntoken = strchr (specs, separator)))
7948 size_t token_length = ntoken - specs;
7949 unsigned token_ops = aarch64_parse_one_option_token (specs,
7950 token_length,
7951 flags,
7952 option_name);
7953 /* If we find "none" (or, for simplicity's sake, an error) anywhere
7954 in the token stream, reset the supported operations. So:
7956 adrp+add.cmp+branch.none.adrp+add
7958 would have the result of turning on only adrp+add fusion. */
7959 if (!token_ops)
7960 found_flags = 0;
7962 found_flags |= token_ops;
7963 specs = ++ntoken;
7966 /* The option string ended with a trailing separator; report an error. */
7967 if (!(*specs))
7969 error ("%s string ill-formed\n", option_name);
7970 return 0;
7973 /* We still have one more token to parse. */
7974 size_t token_length = strlen (specs);
7975 unsigned token_ops = aarch64_parse_one_option_token (specs,
7976 token_length,
7977 flags,
7978 option_name);
7979 if (!token_ops)
7980 found_flags = 0;
7982 found_flags |= token_ops;
7983 return found_flags;
7986 /* Support for overriding instruction fusion. */
7988 static void
7989 aarch64_parse_fuse_string (const char *fuse_string,
7990 struct tune_params *tune)
7992 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
7993 aarch64_fusible_pairs,
7994 tune->fusible_ops,
7995 "fuse=");
7998 /* Support for overriding other tuning flags. */
8000 static void
8001 aarch64_parse_tune_string (const char *tune_string,
8002 struct tune_params *tune)
8004 tune->extra_tuning_flags
8005 = aarch64_parse_boolean_options (tune_string,
8006 aarch64_tuning_flags,
8007 tune->extra_tuning_flags,
8008 "tune=");
8011 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8012 we understand. If it is, extract the option string and hand it off to
8013 the appropriate function. */
8015 void
8016 aarch64_parse_one_override_token (const char* token,
8017 size_t length,
8018 struct tune_params *tune)
8020 const struct aarch64_tuning_override_function *fn
8021 = aarch64_tuning_override_functions;
8023 const char *option_part = strchr (token, '=');
8024 if (!option_part)
8026 error ("tuning string missing in option (%s)", token);
8027 return;
8030 /* Get the length of the option name. */
8031 length = option_part - token;
8032 /* Skip the '=' to get to the option string. */
8033 option_part++;
8035 for (; fn->name != NULL; fn++)
8037 if (!strncmp (fn->name, token, length))
8039 fn->parse_override (option_part, tune);
8040 return;
8044 error ("unknown tuning option (%s)", token);
8045 return;
8048 /* Validate and clamp the TLS size according to the code model in OPTS. */
8050 static void
8051 initialize_aarch64_tls_size (struct gcc_options *opts)
8053 if (aarch64_tls_size == 0)
8054 aarch64_tls_size = 24;
8056 switch (opts->x_aarch64_cmodel_var)
8058 case AARCH64_CMODEL_TINY:
8059 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
8060 needs two instructions to address, so we clamp the size to 24. */
8061 if (aarch64_tls_size > 24)
8062 aarch64_tls_size = 24;
8063 break;
8064 case AARCH64_CMODEL_SMALL:
8065 /* The maximum TLS size allowed under small is 4G. */
8066 if (aarch64_tls_size > 32)
8067 aarch64_tls_size = 32;
8068 break;
8069 case AARCH64_CMODEL_LARGE:
8070 /* The maximum TLS size allowed under large is 16E.
8071 FIXME: 16E requires a 64-bit offset, but we only support a 48-bit offset for now. */
8072 if (aarch64_tls_size > 48)
8073 aarch64_tls_size = 48;
8074 break;
8075 default:
8076 gcc_unreachable ();
8079 return;
8082 /* Parse STRING looking for options in the format:
8083 string :: option:string
8084 option :: name=substring
8085 name :: {a-z}
8086 substring :: defined by option. */
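/* For instance, an -moverride value such as

     fuse=adrp+add.cmp+branch:tune=...

   (reusing the fusion names shown earlier in this file) is split at ':'
   into a "fuse=..." token and a "tune=..." token, each of which is passed
   to aarch64_parse_one_override_token below.  */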
8088 static void
8089 aarch64_parse_override_string (const char* input_string,
8090 struct tune_params* tune)
8092 const char separator = ':';
8093 size_t string_length = strlen (input_string) + 1;
8094 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8095 char *string = string_root;
8096 strncpy (string, input_string, string_length);
8097 string[string_length - 1] = '\0';
8099 char* ntoken = string;
8101 while ((ntoken = strchr (string, separator)))
8103 size_t token_length = ntoken - string;
8104 /* NUL-terminate this substring so it can be handled as a string in its own right. */
8105 *ntoken = '\0';
8106 aarch64_parse_one_override_token (string, token_length, tune);
8107 string = ++ntoken;
8110 /* One last option to parse. */
8111 aarch64_parse_one_override_token (string, strlen (string), tune);
8112 free (string_root);
8116 static void
8117 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8119 /* The logic here is that if we are disabling all frame pointer generation
8120 then we do not need to disable leaf frame pointer generation as a
8121 separate operation. But if we are *only* disabling leaf frame pointer
8122 generation then we set flag_omit_frame_pointer to true, but in
8123 aarch64_frame_pointer_required we return false only for leaf functions.
8125 PR 70044: We have to be careful about being called multiple times for the
8126 same function. Once we have decided to set flag_omit_frame_pointer just
8127 so that we can omit leaf frame pointers, we must then not interpret a
8128 second call as meaning that all frame pointer generation should be
8129 omitted. We do this by setting flag_omit_frame_pointer to a special,
8130 non-zero value. */
8131 if (opts->x_flag_omit_frame_pointer == 2)
8132 opts->x_flag_omit_frame_pointer = 0;
8134 if (opts->x_flag_omit_frame_pointer)
8135 opts->x_flag_omit_leaf_frame_pointer = false;
8136 else if (opts->x_flag_omit_leaf_frame_pointer)
8137 opts->x_flag_omit_frame_pointer = 2;
8139 /* If not optimizing for size, set the default
8140 alignment to what the target wants. */
8141 if (!opts->x_optimize_size)
8143 if (opts->x_align_loops <= 0)
8144 opts->x_align_loops = aarch64_tune_params.loop_align;
8145 if (opts->x_align_jumps <= 0)
8146 opts->x_align_jumps = aarch64_tune_params.jump_align;
8147 if (opts->x_align_functions <= 0)
8148 opts->x_align_functions = aarch64_tune_params.function_align;
8151 /* We default to no pc-relative literal loads. */
8153 aarch64_pcrelative_literal_loads = false;
8155 /* If -mpc-relative-literal-loads is set on the command line, this
8156 implies that the user asked for PC relative literal loads. */
8157 if (opts->x_pcrelative_literal_loads == 1)
8158 aarch64_pcrelative_literal_loads = true;
8160 /* This is PR70113. When building the Linux kernel with
8161 CONFIG_ARM64_ERRATUM_843419, support for relocations
8162 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8163 removed from the kernel to avoid loading objects with possibly
8164 offending sequences. Without -mpc-relative-literal-loads we would
8165 generate such relocations, preventing the kernel build from
8166 succeeding. */
8167 if (opts->x_pcrelative_literal_loads == 2
8168 && TARGET_FIX_ERR_A53_843419)
8169 aarch64_pcrelative_literal_loads = true;
8171 /* In the tiny memory model it makes no sense to disallow PC relative
8172 literal pool loads. */
8173 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8174 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8175 aarch64_pcrelative_literal_loads = true;
8177 /* When enabling the lower precision Newton series for the square root, also
8178 enable it for the reciprocal square root, since the latter is an
8179 intermediary step for the former. */
8180 if (flag_mlow_precision_sqrt)
8181 flag_mrecip_low_precision_sqrt = true;
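/* A compact, standalone illustration of the PR 70044 sentinel trick
   described at the top of the function above: the value 2 marks
   "frame pointer omission enabled only so that leaf frame pointers
   can be omitted", which keeps repeated calls idempotent.  The
   variable names below are illustrative, not the real option flags.  */

#include <stdbool.h>
#include <stdio.h>

static int omit_fp;               /* 0, 1, or the private sentinel 2.  */
static bool omit_leaf_fp = true;  /* e.g. -momit-leaf-frame-pointer.  */

static void
after_change (void)
{
  if (omit_fp == 2)     /* Undo our own earlier decision first.  */
    omit_fp = 0;

  if (omit_fp)          /* All frame pointers omitted: leaf flag moot.  */
    omit_leaf_fp = false;
  else if (omit_leaf_fp)
    omit_fp = 2;        /* Remember that only the leaf case asked.  */
}

int
main (void)
{
  after_change ();
  after_change ();      /* A second call must not flip the result.  */
  printf ("omit_fp=%d omit_leaf_fp=%d\n", omit_fp, (int) omit_leaf_fp);
  return 0;
}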
8184 /* 'Unpack' the internal tuning structs and update the options
8185 in OPTS. The caller must have set up selected_tune and selected_arch
8186 as all the other target-specific codegen decisions are
8187 derived from them. */
8189 void
8190 aarch64_override_options_internal (struct gcc_options *opts)
8192 aarch64_tune_flags = selected_tune->flags;
8193 aarch64_tune = selected_tune->sched_core;
8194 /* Make a copy of the tuning parameters attached to the core, which
8195 we may later overwrite. */
8196 aarch64_tune_params = *(selected_tune->tune);
8197 aarch64_architecture_version = selected_arch->architecture_version;
8199 if (opts->x_aarch64_override_tune_string)
8200 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8201 &aarch64_tune_params);
8203 /* This target defaults to strict volatile bitfields. */
8204 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8205 opts->x_flag_strict_volatile_bitfields = 1;
8207 initialize_aarch64_code_model (opts);
8208 initialize_aarch64_tls_size (opts);
8210 int queue_depth = 0;
8211 switch (aarch64_tune_params.autoprefetcher_model)
8213 case tune_params::AUTOPREFETCHER_OFF:
8214 queue_depth = -1;
8215 break;
8216 case tune_params::AUTOPREFETCHER_WEAK:
8217 queue_depth = 0;
8218 break;
8219 case tune_params::AUTOPREFETCHER_STRONG:
8220 queue_depth = max_insn_queue_index + 1;
8221 break;
8222 default:
8223 gcc_unreachable ();
8226 /* We don't mind passing in global_options_set here as we don't use
8227 the *options_set structs anyway. */
8228 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8229 queue_depth,
8230 opts->x_param_values,
8231 global_options_set.x_param_values);
8233 /* Set the L1 cache line size. */
8234 if (selected_cpu->tune->cache_line_size != 0)
8235 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8236 selected_cpu->tune->cache_line_size,
8237 opts->x_param_values,
8238 global_options_set.x_param_values);
8240 aarch64_override_options_after_change_1 (opts);
8243 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8244 specified in STR and throw errors if appropriate. Put the results,
8245 if they are valid, in RES and ISA_FLAGS. Return whether the option is
8246 valid. */
8248 static bool
8249 aarch64_validate_mcpu (const char *str, const struct processor **res,
8250 unsigned long *isa_flags)
8252 enum aarch64_parse_opt_result parse_res
8253 = aarch64_parse_cpu (str, res, isa_flags);
8255 if (parse_res == AARCH64_PARSE_OK)
8256 return true;
8258 switch (parse_res)
8260 case AARCH64_PARSE_MISSING_ARG:
8261 error ("missing cpu name in -mcpu=%qs", str);
8262 break;
8263 case AARCH64_PARSE_INVALID_ARG:
8264 error ("unknown value %qs for -mcpu", str);
8265 break;
8266 case AARCH64_PARSE_INVALID_FEATURE:
8267 error ("invalid feature modifier in -mcpu=%qs", str);
8268 break;
8269 default:
8270 gcc_unreachable ();
8273 return false;
8276 /* Validate a command-line -march option. Parse the arch and extensions
8277 (if any) specified in STR and throw errors if appropriate. Put the
8278 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8279 option is valid. */
8281 static bool
8282 aarch64_validate_march (const char *str, const struct processor **res,
8283 unsigned long *isa_flags)
8285 enum aarch64_parse_opt_result parse_res
8286 = aarch64_parse_arch (str, res, isa_flags);
8288 if (parse_res == AARCH64_PARSE_OK)
8289 return true;
8291 switch (parse_res)
8293 case AARCH64_PARSE_MISSING_ARG:
8294 error ("missing arch name in -march=%qs", str);
8295 break;
8296 case AARCH64_PARSE_INVALID_ARG:
8297 error ("unknown value %qs for -march", str);
8298 break;
8299 case AARCH64_PARSE_INVALID_FEATURE:
8300 error ("invalid feature modifier in -march=%qs", str);
8301 break;
8302 default:
8303 gcc_unreachable ();
8306 return false;
8309 /* Validate a command-line -mtune option. Parse the cpu
8310 specified in STR and throw errors if appropriate. Put the
8311 result, if it is valid, in RES. Return whether the option is
8312 valid. */
8314 static bool
8315 aarch64_validate_mtune (const char *str, const struct processor **res)
8317 enum aarch64_parse_opt_result parse_res
8318 = aarch64_parse_tune (str, res);
8320 if (parse_res == AARCH64_PARSE_OK)
8321 return true;
8323 switch (parse_res)
8325 case AARCH64_PARSE_MISSING_ARG:
8326 error ("missing cpu name in -mtune=%qs", str);
8327 break;
8328 case AARCH64_PARSE_INVALID_ARG:
8329 error ("unknown value %qs for -mtune", str);
8330 break;
8331 default:
8332 gcc_unreachable ();
8334 return false;
8337 /* Return the CPU corresponding to the enum CPU.
8338 If it doesn't specify a cpu, return the default. */
8340 static const struct processor *
8341 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8343 if (cpu != aarch64_none)
8344 return &all_cores[cpu];
8346 /* The & 0x3f is to extract the bottom 6 bits that encode the
8347 default cpu as selected by the --with-cpu GCC configure option
8348 in config.gcc.
8349 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8350 flags mechanism should be reworked to make it more sane. */
8351 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
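/* A sketch of the packing scheme that the "& 0x3f" above (and the
   ">> 6" in aarch64_override_options) assumes: the configure-time
   default keeps a CPU identifier in the low 6 bits and the default
   ISA flag bits above them.  The concrete values here are invented
   for illustration only.  */

#include <stdio.h>

#define CPU_IDENT_BITS 6
#define CPU_IDENT_MASK ((1ul << CPU_IDENT_BITS) - 1)   /* 0x3f */

static unsigned long
pack_cpu_default (unsigned long ident, unsigned long isa_flags)
{
  return (isa_flags << CPU_IDENT_BITS) | (ident & CPU_IDENT_MASK);
}

int
main (void)
{
  unsigned long packed = pack_cpu_default (5, 0x9);
  printf ("ident=%lu isa=0x%lx\n",
          packed & CPU_IDENT_MASK, packed >> CPU_IDENT_BITS);
  return 0;
}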
8354 /* Return the architecture corresponding to the enum ARCH.
8355 If it doesn't specify a valid architecture, return the default. */
8357 static const struct processor *
8358 aarch64_get_arch (enum aarch64_arch arch)
8360 if (arch != aarch64_no_arch)
8361 return &all_architectures[arch];
8363 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8365 return &all_architectures[cpu->arch];
8368 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8369 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8370 tuning structs. In particular it must set selected_tune and
8371 aarch64_isa_flags that define the available ISA features and tuning
8372 decisions. It must also set selected_arch as this will be used to
8373 output the .arch asm tags for each function. */
8375 static void
8376 aarch64_override_options (void)
8378 unsigned long cpu_isa = 0;
8379 unsigned long arch_isa = 0;
8380 aarch64_isa_flags = 0;
8382 bool valid_cpu = true;
8383 bool valid_tune = true;
8384 bool valid_arch = true;
8386 selected_cpu = NULL;
8387 selected_arch = NULL;
8388 selected_tune = NULL;
8390 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8391 If either of -march or -mtune is given, they override their
8392 respective component of -mcpu. */
8393 if (aarch64_cpu_string)
8394 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8395 &cpu_isa);
8397 if (aarch64_arch_string)
8398 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8399 &arch_isa);
8401 if (aarch64_tune_string)
8402 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8404 /* If the user did not specify a processor, choose the default
8405 one for them. This will be the CPU set during configuration using
8406 --with-cpu, otherwise it is "generic". */
8407 if (!selected_cpu)
8409 if (selected_arch)
8411 selected_cpu = &all_cores[selected_arch->ident];
8412 aarch64_isa_flags = arch_isa;
8413 explicit_arch = selected_arch->arch;
8415 else
8417 /* Get default configure-time CPU. */
8418 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8419 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8422 if (selected_tune)
8423 explicit_tune_core = selected_tune->ident;
8425 /* If both -mcpu and -march are specified, check that they are architecturally
8426 compatible; warn if they're not, and prefer the -march ISA flags. */
8427 else if (selected_arch)
8429 if (selected_arch->arch != selected_cpu->arch)
8431 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8432 all_architectures[selected_cpu->arch].name,
8433 selected_arch->name);
8435 aarch64_isa_flags = arch_isa;
8436 explicit_arch = selected_arch->arch;
8437 explicit_tune_core = selected_tune ? selected_tune->ident
8438 : selected_cpu->ident;
8440 else
8442 /* -mcpu but no -march. */
8443 aarch64_isa_flags = cpu_isa;
8444 explicit_tune_core = selected_tune ? selected_tune->ident
8445 : selected_cpu->ident;
8446 gcc_assert (selected_cpu);
8447 selected_arch = &all_architectures[selected_cpu->arch];
8448 explicit_arch = selected_arch->arch;
8451 /* Set the arch as well, as we will need it when outputting
8452 the .arch directive in assembly. */
8453 if (!selected_arch)
8455 gcc_assert (selected_cpu);
8456 selected_arch = &all_architectures[selected_cpu->arch];
8459 if (!selected_tune)
8460 selected_tune = selected_cpu;
8462 #ifndef HAVE_AS_MABI_OPTION
8463 /* The compiler may have been configured with 2.23.* binutils, which does
8464 not have support for ILP32. */
8465 if (TARGET_ILP32)
8466 error ("Assembler does not support -mabi=ilp32");
8467 #endif
8469 /* Make sure we properly set up the explicit options. */
8470 if ((aarch64_cpu_string && valid_cpu)
8471 || (aarch64_tune_string && valid_tune))
8472 gcc_assert (explicit_tune_core != aarch64_none);
8474 if ((aarch64_cpu_string && valid_cpu)
8475 || (aarch64_arch_string && valid_arch))
8476 gcc_assert (explicit_arch != aarch64_no_arch);
8478 aarch64_override_options_internal (&global_options);
8480 /* Save these options as the default ones in case we push and pop them later
8481 while processing functions with potential target attributes. */
8482 target_option_default_node = target_option_current_node
8483 = build_target_option_node (&global_options);
8485 aarch64_register_fma_steering ();
8489 /* Implement targetm.override_options_after_change. */
8491 static void
8492 aarch64_override_options_after_change (void)
8494 aarch64_override_options_after_change_1 (&global_options);
8497 static struct machine_function *
8498 aarch64_init_machine_status (void)
8500 struct machine_function *machine;
8501 machine = ggc_cleared_alloc<machine_function> ();
8502 return machine;
8505 void
8506 aarch64_init_expanders (void)
8508 init_machine_status = aarch64_init_machine_status;
8511 /* Select the code model, taking the PIC options into account. */
8512 static void
8513 initialize_aarch64_code_model (struct gcc_options *opts)
8515 if (opts->x_flag_pic)
8517 switch (opts->x_aarch64_cmodel_var)
8519 case AARCH64_CMODEL_TINY:
8520 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8521 break;
8522 case AARCH64_CMODEL_SMALL:
8523 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8524 aarch64_cmodel = (flag_pic == 2
8525 ? AARCH64_CMODEL_SMALL_PIC
8526 : AARCH64_CMODEL_SMALL_SPIC);
8527 #else
8528 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8529 #endif
8530 break;
8531 case AARCH64_CMODEL_LARGE:
8532 sorry ("code model %qs with -f%s", "large",
8533 opts->x_flag_pic > 1 ? "PIC" : "pic");
8534 break;
8535 default:
8536 gcc_unreachable ();
8539 else
8540 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8543 /* Implement TARGET_OPTION_SAVE. */
8545 static void
8546 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8548 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8551 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8552 using the information saved in PTR. */
8554 static void
8555 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8557 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8558 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8559 opts->x_explicit_arch = ptr->x_explicit_arch;
8560 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8561 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8563 aarch64_override_options_internal (opts);
8566 /* Implement TARGET_OPTION_PRINT. */
8568 static void
8569 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8571 const struct processor *cpu
8572 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8573 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8574 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8575 std::string extension
8576 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
8578 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8579 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8580 arch->name, extension.c_str ());
8583 static GTY(()) tree aarch64_previous_fndecl;
8585 void
8586 aarch64_reset_previous_fndecl (void)
8588 aarch64_previous_fndecl = NULL;
8591 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
8592 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
8593 make sure optab availability predicates are recomputed when necessary. */
8595 void
8596 aarch64_save_restore_target_globals (tree new_tree)
8598 if (TREE_TARGET_GLOBALS (new_tree))
8599 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8600 else if (new_tree == target_option_default_node)
8601 restore_target_globals (&default_target_globals);
8602 else
8603 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
8606 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
8607 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8608 of the function, if such exists. This function may be called multiple
8609 times on a single function so use aarch64_previous_fndecl to avoid
8610 setting up identical state. */
8612 static void
8613 aarch64_set_current_function (tree fndecl)
8615 if (!fndecl || fndecl == aarch64_previous_fndecl)
8616 return;
8618 tree old_tree = (aarch64_previous_fndecl
8619 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8620 : NULL_TREE);
8622 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
8624 /* If current function has no attributes but the previous one did,
8625 use the default node. */
8626 if (!new_tree && old_tree)
8627 new_tree = target_option_default_node;
8629 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
8630 the default have been handled by aarch64_save_restore_target_globals from
8631 aarch64_pragma_target_parse. */
8632 if (old_tree == new_tree)
8633 return;
8635 aarch64_previous_fndecl = fndecl;
8637 /* First set the target options. */
8638 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
8640 aarch64_save_restore_target_globals (new_tree);
8643 /* Enum describing the various ways we can handle attributes.
8644 In many cases we can reuse the generic option handling machinery. */
8646 enum aarch64_attr_opt_type
8648 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
8649 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
8650 aarch64_attr_enum, /* Attribute sets an enum variable. */
8651 aarch64_attr_custom /* Attribute requires a custom handling function. */
8654 /* All the information needed to handle a target attribute.
8655 NAME is the name of the attribute.
8656 ATTR_TYPE specifies the type of behavior of the attribute as described
8657 in the definition of enum aarch64_attr_opt_type.
8658 ALLOW_NEG is true if the attribute supports a "no-" form.
8659 HANDLER is the function that takes the attribute string and whether
8660 it is a pragma or attribute and handles the option. It is needed only
8661 when the ATTR_TYPE is aarch64_attr_custom.
8662 OPT_NUM is the enum specifying the option that the attribute modifies.
8663 This is needed for attributes that mirror the behavior of a command-line
8664 option, that is, its ATTR_TYPE is aarch64_attr_mask, aarch64_attr_bool or
8665 aarch64_attr_enum. */
8667 struct aarch64_attribute_info
8669 const char *name;
8670 enum aarch64_attr_opt_type attr_type;
8671 bool allow_neg;
8672 bool (*handler) (const char *, const char *);
8673 enum opt_code opt_num;
8676 /* Handle the ARCH_STR argument to the arch= target attribute.
8677 PRAGMA_OR_ATTR is used in potential error messages. */
8679 static bool
8680 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8682 const struct processor *tmp_arch = NULL;
8683 enum aarch64_parse_opt_result parse_res
8684 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8686 if (parse_res == AARCH64_PARSE_OK)
8688 gcc_assert (tmp_arch);
8689 selected_arch = tmp_arch;
8690 explicit_arch = selected_arch->arch;
8691 return true;
8694 switch (parse_res)
8696 case AARCH64_PARSE_MISSING_ARG:
8697 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8698 break;
8699 case AARCH64_PARSE_INVALID_ARG:
8700 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8701 break;
8702 case AARCH64_PARSE_INVALID_FEATURE:
8703 error ("invalid feature modifier %qs for 'arch' target %s",
8704 str, pragma_or_attr);
8705 break;
8706 default:
8707 gcc_unreachable ();
8710 return false;
8713 /* Handle the argument CPU_STR to the cpu= target attribute.
8714 PRAGMA_OR_ATTR is used in potential error messages. */
8716 static bool
8717 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8719 const struct processor *tmp_cpu = NULL;
8720 enum aarch64_parse_opt_result parse_res
8721 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8723 if (parse_res == AARCH64_PARSE_OK)
8725 gcc_assert (tmp_cpu);
8726 selected_tune = tmp_cpu;
8727 explicit_tune_core = selected_tune->ident;
8729 selected_arch = &all_architectures[tmp_cpu->arch];
8730 explicit_arch = selected_arch->arch;
8731 return true;
8734 switch (parse_res)
8736 case AARCH64_PARSE_MISSING_ARG:
8737 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8738 break;
8739 case AARCH64_PARSE_INVALID_ARG:
8740 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8741 break;
8742 case AARCH64_PARSE_INVALID_FEATURE:
8743 error ("invalid feature modifier %qs for 'cpu' target %s",
8744 str, pragma_or_attr);
8745 break;
8746 default:
8747 gcc_unreachable ();
8750 return false;
8753 /* Handle the argument STR to the tune= target attribute.
8754 PRAGMA_OR_ATTR is used in potential error messages. */
8756 static bool
8757 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8759 const struct processor *tmp_tune = NULL;
8760 enum aarch64_parse_opt_result parse_res
8761 = aarch64_parse_tune (str, &tmp_tune);
8763 if (parse_res == AARCH64_PARSE_OK)
8765 gcc_assert (tmp_tune);
8766 selected_tune = tmp_tune;
8767 explicit_tune_core = selected_tune->ident;
8768 return true;
8771 switch (parse_res)
8773 case AARCH64_PARSE_INVALID_ARG:
8774 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8775 break;
8776 default:
8777 gcc_unreachable ();
8780 return false;
8783 /* Parse an architecture extensions target attribute string specified in STR.
8784 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8785 if successful. Update aarch64_isa_flags to reflect the ISA features
8786 modified.
8787 PRAGMA_OR_ATTR is used in potential error messages. */
8789 static bool
8790 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8792 enum aarch64_parse_opt_result parse_res;
8793 unsigned long isa_flags = aarch64_isa_flags;
8795 /* We allow "+nothing" at the beginning to clear out all architectural
8796 features if the user wants to handpick specific features. */
8797 if (strncmp ("+nothing", str, 8) == 0)
8799 isa_flags = 0;
8800 str += 8;
8803 parse_res = aarch64_parse_extension (str, &isa_flags);
8805 if (parse_res == AARCH64_PARSE_OK)
8807 aarch64_isa_flags = isa_flags;
8808 return true;
8811 switch (parse_res)
8813 case AARCH64_PARSE_MISSING_ARG:
8814 error ("missing feature modifier in target %s %qs",
8815 pragma_or_attr, str);
8816 break;
8818 case AARCH64_PARSE_INVALID_FEATURE:
8819 error ("invalid feature modifier in target %s %qs",
8820 pragma_or_attr, str);
8821 break;
8823 default:
8824 gcc_unreachable ();
8827 return false;
8830 /* The target attributes that we support. On top of these we also support just
8831 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8832 handled explicitly in aarch64_process_one_target_attr. */
8834 static const struct aarch64_attribute_info aarch64_attributes[] =
8836 { "general-regs-only", aarch64_attr_mask, false, NULL,
8837 OPT_mgeneral_regs_only },
8838 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8839 OPT_mfix_cortex_a53_835769 },
8840 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
8841 OPT_mfix_cortex_a53_843419 },
8842 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8843 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8844 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8845 OPT_momit_leaf_frame_pointer },
8846 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8847 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8848 OPT_march_ },
8849 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8850 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8851 OPT_mtune_ },
8852 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
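/* Illustrative user-level uses of the attributes in the table above;
   the particular arch, cpu and extension names are examples only.  */

/* aarch64_attr_custom: the string is handed to aarch64_handle_attr_arch.  */
void use_arch (void) __attribute__ ((target ("arch=armv8-a+crc")));

/* aarch64_attr_bool with ALLOW_NEG: the "no-" form is accepted.  */
void use_neg (void) __attribute__ ((target ("no-omit-leaf-frame-pointer")));

/* aarch64_attr_mask: sets a bit in target_flags.  */
void use_mask (void) __attribute__ ((target ("strict-align")));

/* aarch64_attr_enum: the argument is mapped through the option enum.  */
void use_enum (void) __attribute__ ((target ("cmodel=small")));

/* A bare ISA string, handled before the table is consulted.  */
void use_isa (void) __attribute__ ((target ("+crc")));

/* Several attributes separated by commas, as parsed further below.  */
void use_many (void) __attribute__ ((target ("cpu=cortex-a57,strict-align")));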
8855 /* Parse ARG_STR which contains the definition of one target attribute.
8856 Show appropriate errors if any or return true if the attribute is valid.
8857 PRAGMA_OR_ATTR holds the string to use in error messages about whether
8858 we're processing a target attribute or pragma. */
8860 static bool
8861 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8863 bool invert = false;
8865 size_t len = strlen (arg_str);
8867 if (len == 0)
8869 error ("malformed target %s", pragma_or_attr);
8870 return false;
8873 char *str_to_check = (char *) alloca (len + 1);
8874 strcpy (str_to_check, arg_str);
8876 /* Skip leading whitespace. */
8877 while (*str_to_check == ' ' || *str_to_check == '\t')
8878 str_to_check++;
8880 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8881 It is easier to detect and handle it explicitly here rather than going
8882 through the machinery for the rest of the target attributes in this
8883 function. */
8884 if (*str_to_check == '+')
8885 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8887 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
8889 invert = true;
8890 str_to_check += 3;
8892 char *arg = strchr (str_to_check, '=');
8894 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
8895 and point ARG to "foo". */
8896 if (arg)
8898 *arg = '\0';
8899 arg++;
8901 const struct aarch64_attribute_info *p_attr;
8902 bool found = false;
8903 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
8905 /* If the names don't match up, or the user has given an argument
8906 to an attribute that doesn't accept one, or didn't give an argument
8907 to an attribute that expects one, fail to match. */
8908 if (strcmp (str_to_check, p_attr->name) != 0)
8909 continue;
8911 found = true;
8912 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
8913 || p_attr->attr_type == aarch64_attr_enum;
8915 if (attr_need_arg_p ^ (arg != NULL))
8917 error ("target %s %qs does not accept an argument",
8918 pragma_or_attr, str_to_check);
8919 return false;
8922 /* If the name matches but the attribute does not allow "no-" versions
8923 then we can't match. */
8924 if (invert && !p_attr->allow_neg)
8926 error ("target %s %qs does not allow a negated form",
8927 pragma_or_attr, str_to_check);
8928 return false;
8931 switch (p_attr->attr_type)
8933 /* Has a custom handler registered.
8934 For example, cpu=, arch=, tune=. */
8935 case aarch64_attr_custom:
8936 gcc_assert (p_attr->handler);
8937 if (!p_attr->handler (arg, pragma_or_attr))
8938 return false;
8939 break;
8941 /* Either set or unset a boolean option. */
8942 case aarch64_attr_bool:
8944 struct cl_decoded_option decoded;
8946 generate_option (p_attr->opt_num, NULL, !invert,
8947 CL_TARGET, &decoded);
8948 aarch64_handle_option (&global_options, &global_options_set,
8949 &decoded, input_location);
8950 break;
8952 /* Set or unset a bit in the target_flags. aarch64_handle_option
8953 should know what mask to apply given the option number. */
8954 case aarch64_attr_mask:
8956 struct cl_decoded_option decoded;
8957 /* We only need to specify the option number.
8958 aarch64_handle_option will know which mask to apply. */
8959 decoded.opt_index = p_attr->opt_num;
8960 decoded.value = !invert;
8961 aarch64_handle_option (&global_options, &global_options_set,
8962 &decoded, input_location);
8963 break;
8965 /* Use the option setting machinery to set an option to an enum. */
8966 case aarch64_attr_enum:
8968 gcc_assert (arg);
8969 bool valid;
8970 int value;
8971 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
8972 &value, CL_TARGET);
8973 if (valid)
8975 set_option (&global_options, NULL, p_attr->opt_num, value,
8976 NULL, DK_UNSPECIFIED, input_location,
8977 global_dc);
8979 else
8981 error ("target %s %s=%s is not valid",
8982 pragma_or_attr, str_to_check, arg);
8984 break;
8986 default:
8987 gcc_unreachable ();
8991 /* If we reached here we either have found an attribute and validated
8992 it or didn't match any. If we matched an attribute but its arguments
8993 were malformed we will have returned false already. */
8994 return found;
8997 /* Count how many times the character C appears in
8998 NULL-terminated string STR. */
9000 static unsigned int
9001 num_occurences_in_str (char c, char *str)
9003 unsigned int res = 0;
9004 while (*str != '\0')
9006 if (*str == c)
9007 res++;
9009 str++;
9012 return res;
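/* Why counting the separator matters: strtok silently collapses
   consecutive delimiters, so "attr1,,attr2" yields only two tokens
   and the empty entry would go unnoticed.  Comparing the number of
   tokens against the number of commas + 1, as
   aarch64_process_target_attr does below, exposes it.  A standalone
   sketch of that check:  */

#include <stdio.h>
#include <string.h>

int
main (void)
{
  char str[] = "attr1,,attr2";
  unsigned int commas = 0, tokens = 0;

  /* Count the commas before strtok mutates the string.  */
  for (const char *p = str; *p != '\0'; p++)
    if (*p == ',')
      commas++;

  for (char *tok = strtok (str, ","); tok; tok = strtok (NULL, ","))
    tokens++;

  if (tokens != commas + 1)
    printf ("malformed list: empty entry detected\n");
  return 0;
}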
9015 /* Parse the tree in ARGS that contains the target attribute information
9016 and update the global target options space. PRAGMA_OR_ATTR is a string
9017 to be used in error messages, specifying whether this is processing
9018 a target attribute or a target pragma. */
9020 bool
9021 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9023 if (TREE_CODE (args) == TREE_LIST)
9027 tree head = TREE_VALUE (args);
9028 if (head)
9030 if (!aarch64_process_target_attr (head, pragma_or_attr))
9031 return false;
9033 args = TREE_CHAIN (args);
9034 } while (args);
9036 return true;
9038 /* We expect to find a string to parse. */
9039 gcc_assert (TREE_CODE (args) == STRING_CST);
9041 size_t len = strlen (TREE_STRING_POINTER (args));
9042 char *str_to_check = (char *) alloca (len + 1);
9043 strcpy (str_to_check, TREE_STRING_POINTER (args));
9045 if (len == 0)
9047 error ("malformed target %s value", pragma_or_attr);
9048 return false;
9051 /* Used to catch empty entries between commas, i.e.
9052 attribute ((target ("attr1,,attr2"))). */
9053 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9055 /* Handle multiple target attributes separated by ','. */
9056 char *token = strtok (str_to_check, ",");
9058 unsigned int num_attrs = 0;
9059 while (token)
9061 num_attrs++;
9062 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9064 error ("target %s %qs is invalid", pragma_or_attr, token);
9065 return false;
9068 token = strtok (NULL, ",");
9071 if (num_attrs != num_commas + 1)
9073 error ("malformed target %s list %qs",
9074 pragma_or_attr, TREE_STRING_POINTER (args));
9075 return false;
9078 return true;
9081 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9082 process attribute ((target ("..."))). */
9084 static bool
9085 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9087 struct cl_target_option cur_target;
9088 bool ret;
9089 tree old_optimize;
9090 tree new_target, new_optimize;
9091 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9093 /* If what we're processing is the current pragma string then the
9094 target option node is already stored in target_option_current_node
9095 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9096 having to re-parse the string. This is especially useful to keep
9097 arm_neon.h compile times down since that header contains a lot
9098 of intrinsics enclosed in pragmas. */
9099 if (!existing_target && args == current_target_pragma)
9101 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9102 return true;
9104 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9106 old_optimize = build_optimization_node (&global_options);
9107 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9109 /* If the function changed the optimization levels as well as setting
9110 target options, start with the optimizations specified. */
9111 if (func_optimize && func_optimize != old_optimize)
9112 cl_optimization_restore (&global_options,
9113 TREE_OPTIMIZATION (func_optimize));
9115 /* Save the current target options to restore at the end. */
9116 cl_target_option_save (&cur_target, &global_options);
9118 /* If fndecl already has some target attributes applied to it, unpack
9119 them so that we add this attribute on top of them, rather than
9120 overwriting them. */
9121 if (existing_target)
9123 struct cl_target_option *existing_options
9124 = TREE_TARGET_OPTION (existing_target);
9126 if (existing_options)
9127 cl_target_option_restore (&global_options, existing_options);
9129 else
9130 cl_target_option_restore (&global_options,
9131 TREE_TARGET_OPTION (target_option_current_node));
9134 ret = aarch64_process_target_attr (args, "attribute");
9136 /* Set up any additional state. */
9137 if (ret)
9139 aarch64_override_options_internal (&global_options);
9140 /* Initialize SIMD builtins if we haven't already.
9141 Set current_target_pragma to NULL for the duration so that
9142 the builtin initialization code doesn't try to tag the functions
9143 being built with the attributes specified by any current pragma, thus
9144 going into an infinite recursion. */
9145 if (TARGET_SIMD)
9147 tree saved_current_target_pragma = current_target_pragma;
9148 current_target_pragma = NULL;
9149 aarch64_init_simd_builtins ();
9150 current_target_pragma = saved_current_target_pragma;
9152 new_target = build_target_option_node (&global_options);
9154 else
9155 new_target = NULL;
9157 new_optimize = build_optimization_node (&global_options);
9159 if (fndecl && ret)
9161 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9163 if (old_optimize != new_optimize)
9164 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9167 cl_target_option_restore (&global_options, &cur_target);
9169 if (old_optimize != new_optimize)
9170 cl_optimization_restore (&global_options,
9171 TREE_OPTIMIZATION (old_optimize));
9172 return ret;
9175 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9176 tri-bool options (yes, no, don't care) and the default value is
9177 DEF, determine whether to reject inlining. */
9179 static bool
9180 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9181 int dont_care, int def)
9183 /* If the callee doesn't care, always allow inlining. */
9184 if (callee == dont_care)
9185 return true;
9187 /* If the caller doesn't care, always allow inlining. */
9188 if (caller == dont_care)
9189 return true;
9191 /* Otherwise, allow inlining if either the callee and caller values
9192 agree, or if the callee is using the default value. */
9193 return (callee == caller || callee == def);
9196 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9197 to inline CALLEE into CALLER based on target-specific info.
9198 Make sure that the caller and callee have compatible architectural
9199 features. Then go through the other possible target attributes
9200 and see if they can block inlining. Try not to reject always_inline
9201 callees unless they are incompatible architecturally. */
9203 static bool
9204 aarch64_can_inline_p (tree caller, tree callee)
9206 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9207 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9209 /* If callee has no option attributes, then it is ok to inline. */
9210 if (!callee_tree)
9211 return true;
9213 struct cl_target_option *caller_opts
9214 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9215 : target_option_default_node);
9217 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9220 /* Callee's ISA flags should be a subset of the caller's. */
9221 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9222 != callee_opts->x_aarch64_isa_flags)
9223 return false;
9225 /* Allow non-strict-aligned functions to be inlined into
9226 strict-aligned ones. */
9227 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9228 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9229 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9230 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9231 return false;
9233 bool always_inline = lookup_attribute ("always_inline",
9234 DECL_ATTRIBUTES (callee));
9236 /* If the architectural features match up and the callee is always_inline
9237 then the other attributes don't matter. */
9238 if (always_inline)
9239 return true;
9241 if (caller_opts->x_aarch64_cmodel_var
9242 != callee_opts->x_aarch64_cmodel_var)
9243 return false;
9245 if (caller_opts->x_aarch64_tls_dialect
9246 != callee_opts->x_aarch64_tls_dialect)
9247 return false;
9249 /* Honour explicit requests to work around errata. */
9250 if (!aarch64_tribools_ok_for_inlining_p (
9251 caller_opts->x_aarch64_fix_a53_err835769,
9252 callee_opts->x_aarch64_fix_a53_err835769,
9253 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9254 return false;
9256 if (!aarch64_tribools_ok_for_inlining_p (
9257 caller_opts->x_aarch64_fix_a53_err843419,
9258 callee_opts->x_aarch64_fix_a53_err843419,
9259 2, TARGET_FIX_ERR_A53_843419))
9260 return false;
9262 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9263 caller and callee and they don't match up, reject inlining. */
9264 if (!aarch64_tribools_ok_for_inlining_p (
9265 caller_opts->x_flag_omit_leaf_frame_pointer,
9266 callee_opts->x_flag_omit_leaf_frame_pointer,
9267 2, 1))
9268 return false;
9270 /* If the callee has specific tuning overrides, respect them. */
9271 if (callee_opts->x_aarch64_override_tune_string != NULL
9272 && caller_opts->x_aarch64_override_tune_string == NULL)
9273 return false;
9275 /* If the user specified tuning override strings for the
9276 caller and callee and they don't match up, reject inlining.
9277 We just do a string compare here, we don't analyze the meaning
9278 of the string, as it would be too costly for little gain. */
9279 if (callee_opts->x_aarch64_override_tune_string
9280 && caller_opts->x_aarch64_override_tune_string
9281 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9282 caller_opts->x_aarch64_override_tune_string) != 0))
9283 return false;
9285 return true;
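/* The ISA-subset rule above, seen from user code (the "+crc" extension
   is just an example): a callee that enables ISA bits the caller lacks
   is never inlined, not even when it is marked always_inline, because
   the architectural check runs before the always_inline shortcut.  */

__attribute__ ((target ("+crc")))
static inline int
crc_helper (int x)
{
  return x + 1;   /* Stand-in for code that needs the CRC extension.  */
}

int
plain_caller (int x)
{
  /* Here (caller_isa & callee_isa) != callee_isa, so
     aarch64_can_inline_p returns false and the call stays out of line
     (unless the caller is itself compiled with CRC enabled).  */
  return crc_helper (x);
}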
9288 /* Return true if SYMBOL_REF X binds locally. */
9290 static bool
9291 aarch64_symbol_binds_local_p (const_rtx x)
9293 return (SYMBOL_REF_DECL (x)
9294 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9295 : SYMBOL_REF_LOCAL_P (x));
9298 /* Return true if SYMBOL_REF X is thread local. */
9299 static bool
9300 aarch64_tls_symbol_p (rtx x)
9302 if (! TARGET_HAVE_TLS)
9303 return false;
9305 if (GET_CODE (x) != SYMBOL_REF)
9306 return false;
9308 return SYMBOL_REF_TLS_MODEL (x) != 0;
9311 /* Classify a TLS symbol into one of the TLS kinds. */
9312 enum aarch64_symbol_type
9313 aarch64_classify_tls_symbol (rtx x)
9315 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9317 switch (tls_kind)
9319 case TLS_MODEL_GLOBAL_DYNAMIC:
9320 case TLS_MODEL_LOCAL_DYNAMIC:
9321 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9323 case TLS_MODEL_INITIAL_EXEC:
9324 switch (aarch64_cmodel)
9326 case AARCH64_CMODEL_TINY:
9327 case AARCH64_CMODEL_TINY_PIC:
9328 return SYMBOL_TINY_TLSIE;
9329 default:
9330 return SYMBOL_SMALL_TLSIE;
9333 case TLS_MODEL_LOCAL_EXEC:
9334 if (aarch64_tls_size == 12)
9335 return SYMBOL_TLSLE12;
9336 else if (aarch64_tls_size == 24)
9337 return SYMBOL_TLSLE24;
9338 else if (aarch64_tls_size == 32)
9339 return SYMBOL_TLSLE32;
9340 else if (aarch64_tls_size == 48)
9341 return SYMBOL_TLSLE48;
9342 else
9343 gcc_unreachable ();
9345 case TLS_MODEL_EMULATED:
9346 case TLS_MODEL_NONE:
9347 return SYMBOL_FORCE_TO_MEM;
9349 default:
9350 gcc_unreachable ();
9354 /* Return the method that should be used to access SYMBOL_REF or
9355 LABEL_REF X. */
9357 enum aarch64_symbol_type
9358 aarch64_classify_symbol (rtx x, rtx offset)
9360 if (GET_CODE (x) == LABEL_REF)
9362 switch (aarch64_cmodel)
9364 case AARCH64_CMODEL_LARGE:
9365 return SYMBOL_FORCE_TO_MEM;
9367 case AARCH64_CMODEL_TINY_PIC:
9368 case AARCH64_CMODEL_TINY:
9369 return SYMBOL_TINY_ABSOLUTE;
9371 case AARCH64_CMODEL_SMALL_SPIC:
9372 case AARCH64_CMODEL_SMALL_PIC:
9373 case AARCH64_CMODEL_SMALL:
9374 return SYMBOL_SMALL_ABSOLUTE;
9376 default:
9377 gcc_unreachable ();
9381 if (GET_CODE (x) == SYMBOL_REF)
9383 if (aarch64_tls_symbol_p (x))
9384 return aarch64_classify_tls_symbol (x);
9386 switch (aarch64_cmodel)
9388 case AARCH64_CMODEL_TINY:
9389 /* When we retrieve a symbol + offset address, we have to make sure
9390 the offset does not cause overflow of the final address. But
9391 we have no way of knowing the address of the symbol at compile
9392 time, so we can't accurately say if the distance between the PC
9393 and symbol + offset is outside the addressable range of +/-1M in
9394 the TINY code model. So we rely on images not being greater than
9395 1M, cap the offset at 1M, and require anything beyond that to be
9396 loaded using an alternative mechanism. Furthermore, if the symbol
9397 is a weak reference to something that isn't known to resolve to a
9398 symbol in this module, then force it to memory. */
9399 if ((SYMBOL_REF_WEAK (x)
9400 && !aarch64_symbol_binds_local_p (x))
9401 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9402 return SYMBOL_FORCE_TO_MEM;
9403 return SYMBOL_TINY_ABSOLUTE;
9405 case AARCH64_CMODEL_SMALL:
9406 /* Same reasoning as the tiny code model, but the offset cap here is
9407 4G. */
9408 if ((SYMBOL_REF_WEAK (x)
9409 && !aarch64_symbol_binds_local_p (x))
9410 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9411 HOST_WIDE_INT_C (4294967264)))
9412 return SYMBOL_FORCE_TO_MEM;
9413 return SYMBOL_SMALL_ABSOLUTE;
9415 case AARCH64_CMODEL_TINY_PIC:
9416 if (!aarch64_symbol_binds_local_p (x))
9417 return SYMBOL_TINY_GOT;
9418 return SYMBOL_TINY_ABSOLUTE;
9420 case AARCH64_CMODEL_SMALL_SPIC:
9421 case AARCH64_CMODEL_SMALL_PIC:
9422 if (!aarch64_symbol_binds_local_p (x))
9423 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9424 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9425 return SYMBOL_SMALL_ABSOLUTE;
9427 case AARCH64_CMODEL_LARGE:
9428 /* This is alright even in PIC code as the constant
9429 pool reference is always PC relative and within
9430 the same translation unit. */
9431 if (CONSTANT_POOL_ADDRESS_P (x))
9432 return SYMBOL_SMALL_ABSOLUTE;
9433 else
9434 return SYMBOL_FORCE_TO_MEM;
9436 default:
9437 gcc_unreachable ();
9441 /* By default push everything into the constant pool. */
9442 return SYMBOL_FORCE_TO_MEM;
9445 bool
9446 aarch64_constant_address_p (rtx x)
9448 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9451 bool
9452 aarch64_legitimate_pic_operand_p (rtx x)
9454 if (GET_CODE (x) == SYMBOL_REF
9455 || (GET_CODE (x) == CONST
9456 && GET_CODE (XEXP (x, 0)) == PLUS
9457 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9458 return false;
9460 return true;
9463 /* Return true if X holds a valid quarter-precision floating-point
9464 immediate or the floating-point constant +0.0. */
9465 static bool
9466 aarch64_valid_floating_const (machine_mode mode, rtx x)
9468 if (!CONST_DOUBLE_P (x))
9469 return false;
9471 if (aarch64_float_const_zero_rtx_p (x))
9472 return true;
9474 /* Beyond +0.0, which was accepted above, we only handle SFmode and DFmode. */
9475 if (!(mode == SFmode || mode == DFmode))
9476 return false;
9478 return aarch64_float_const_representable_p (x);
9481 static bool
9482 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9484 /* Do not allow vector struct mode constants. We could support
9485 0 and -1 easily, but they need support in aarch64-simd.md. */
9486 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9487 return false;
9489 /* This could probably go away because
9490 we now decompose CONST_INTs according to expand_mov_immediate. */
9491 if ((GET_CODE (x) == CONST_VECTOR
9492 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9493 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9494 return !targetm.cannot_force_const_mem (mode, x);
9496 if (GET_CODE (x) == HIGH
9497 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9498 return true;
9500 return aarch64_constant_address_p (x);
9503 rtx
9504 aarch64_load_tp (rtx target)
9506 if (!target
9507 || GET_MODE (target) != Pmode
9508 || !register_operand (target, Pmode))
9509 target = gen_reg_rtx (Pmode);
9511 /* Can return in any reg. */
9512 emit_insn (gen_aarch64_load_tp_hard (target));
9513 return target;
9516 /* On AAPCS systems, this is the "struct __va_list". */
9517 static GTY(()) tree va_list_type;
9519 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9520 Return the type to use as __builtin_va_list.
9522 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9524 struct __va_list
9526 void *__stack;
9527 void *__gr_top;
9528 void *__vr_top;
9529 int __gr_offs;
9530 int __vr_offs;
9531 }; */
9533 static tree
9534 aarch64_build_builtin_va_list (void)
9536 tree va_list_name;
9537 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9539 /* Create the type. */
9540 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9541 /* Give it the required name. */
9542 va_list_name = build_decl (BUILTINS_LOCATION,
9543 TYPE_DECL,
9544 get_identifier ("__va_list"),
9545 va_list_type);
9546 DECL_ARTIFICIAL (va_list_name) = 1;
9547 TYPE_NAME (va_list_type) = va_list_name;
9548 TYPE_STUB_DECL (va_list_type) = va_list_name;
9550 /* Create the fields. */
9551 f_stack = build_decl (BUILTINS_LOCATION,
9552 FIELD_DECL, get_identifier ("__stack"),
9553 ptr_type_node);
9554 f_grtop = build_decl (BUILTINS_LOCATION,
9555 FIELD_DECL, get_identifier ("__gr_top"),
9556 ptr_type_node);
9557 f_vrtop = build_decl (BUILTINS_LOCATION,
9558 FIELD_DECL, get_identifier ("__vr_top"),
9559 ptr_type_node);
9560 f_groff = build_decl (BUILTINS_LOCATION,
9561 FIELD_DECL, get_identifier ("__gr_offs"),
9562 integer_type_node);
9563 f_vroff = build_decl (BUILTINS_LOCATION,
9564 FIELD_DECL, get_identifier ("__vr_offs"),
9565 integer_type_node);
9567 /* Tell the tree-stdarg pass about our internal offset fields.
9568 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
9569 purposes, to identify whether the code is updating the va_list
9570 internal offset fields in an irregular way. */
9571 va_list_gpr_counter_field = f_groff;
9572 va_list_fpr_counter_field = f_vroff;
9574 DECL_ARTIFICIAL (f_stack) = 1;
9575 DECL_ARTIFICIAL (f_grtop) = 1;
9576 DECL_ARTIFICIAL (f_vrtop) = 1;
9577 DECL_ARTIFICIAL (f_groff) = 1;
9578 DECL_ARTIFICIAL (f_vroff) = 1;
9580 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9581 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9582 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9583 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9584 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9586 TYPE_FIELDS (va_list_type) = f_stack;
9587 DECL_CHAIN (f_stack) = f_grtop;
9588 DECL_CHAIN (f_grtop) = f_vrtop;
9589 DECL_CHAIN (f_vrtop) = f_groff;
9590 DECL_CHAIN (f_groff) = f_vroff;
9592 /* Compute its layout. */
9593 layout_type (va_list_type);
9595 return va_list_type;
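/* How the structure built above is consumed, from the point of view of
   ordinary user code: va_start fills in the five fields, and va_arg
   walks general-register arguments through __gr_top/__gr_offs,
   floating-point and SIMD arguments through __vr_top/__vr_offs, and
   falls back to __stack once a save area is exhausted (see
   aarch64_gimplify_va_arg_expr below).  The portable source is just
   standard <stdarg.h>:  */

#include <stdarg.h>
#include <stdio.h>

static double
sum_doubles (int count, ...)
{
  va_list ap;              /* A struct __va_list on AArch64.  */
  double total = 0.0;

  va_start (ap, count);    /* Initializes all five fields.  */
  for (int i = 0; i < count; i++)
    total += va_arg (ap, double);   /* Read via __vr_top/__vr_offs.  */
  va_end (ap);
  return total;
}

int
main (void)
{
  printf ("%f\n", sum_doubles (3, 1.0, 2.0, 3.0));
  return 0;
}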
9598 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
9599 static void
9600 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9602 const CUMULATIVE_ARGS *cum;
9603 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9604 tree stack, grtop, vrtop, groff, vroff;
9605 tree t;
9606 int gr_save_area_size = cfun->va_list_gpr_size;
9607 int vr_save_area_size = cfun->va_list_fpr_size;
9608 int vr_offset;
9610 cum = &crtl->args.info;
9611 if (cfun->va_list_gpr_size)
9612 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
9613 cfun->va_list_gpr_size);
9614 if (cfun->va_list_fpr_size)
9615 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
9616 * UNITS_PER_VREG, cfun->va_list_fpr_size);
9618 if (!TARGET_FLOAT)
9620 gcc_assert (cum->aapcs_nvrn == 0);
9621 vr_save_area_size = 0;
9624 f_stack = TYPE_FIELDS (va_list_type_node);
9625 f_grtop = DECL_CHAIN (f_stack);
9626 f_vrtop = DECL_CHAIN (f_grtop);
9627 f_groff = DECL_CHAIN (f_vrtop);
9628 f_vroff = DECL_CHAIN (f_groff);
9630 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9631 NULL_TREE);
9632 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9633 NULL_TREE);
9634 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9635 NULL_TREE);
9636 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9637 NULL_TREE);
9638 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9639 NULL_TREE);
9641 /* Emit code to initialize STACK, which points to the next varargs stack
9642 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
9643 by named arguments. STACK is 8-byte aligned. */
9644 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9645 if (cum->aapcs_stack_size > 0)
9646 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9647 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9648 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9650 /* Emit code to initialize GRTOP, the top of the GR save area.
9651 virtual_incoming_args_rtx should have been 16-byte aligned. */
9652 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9653 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9654 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9656 /* Emit code to initialize VRTOP, the top of the VR save area.
9657 This address is gr_save_area_bytes below GRTOP, rounded
9658 down to the next 16-byte boundary. */
9659 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9660 vr_offset = ROUND_UP (gr_save_area_size,
9661 STACK_BOUNDARY / BITS_PER_UNIT);
9663 if (vr_offset)
9664 t = fold_build_pointer_plus_hwi (t, -vr_offset);
9665 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9666 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9668 /* Emit code to initialize GROFF, the offset from GRTOP of the
9669 next GPR argument. */
9670 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9671 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9672 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9674 /* Likewise emit code to initialize VROFF, the offset from VRTOP
9675 of the next VR argument. */
9676 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9677 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9678 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9681 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
9683 static tree
9684 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9685 gimple_seq *post_p ATTRIBUTE_UNUSED)
9687 tree addr;
9688 bool indirect_p;
9689 bool is_ha; /* is HFA or HVA. */
9690 bool dw_align; /* double-word align. */
9691 machine_mode ag_mode = VOIDmode;
9692 int nregs;
9693 machine_mode mode;
9695 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9696 tree stack, f_top, f_off, off, arg, roundup, on_stack;
9697 HOST_WIDE_INT size, rsize, adjust, align;
9698 tree t, u, cond1, cond2;
9700 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9701 if (indirect_p)
9702 type = build_pointer_type (type);
9704 mode = TYPE_MODE (type);
9706 f_stack = TYPE_FIELDS (va_list_type_node);
9707 f_grtop = DECL_CHAIN (f_stack);
9708 f_vrtop = DECL_CHAIN (f_grtop);
9709 f_groff = DECL_CHAIN (f_vrtop);
9710 f_vroff = DECL_CHAIN (f_groff);
9712 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9713 f_stack, NULL_TREE);
9714 size = int_size_in_bytes (type);
9715 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9717 dw_align = false;
9718 adjust = 0;
9719 if (aarch64_vfp_is_call_or_return_candidate (mode,
9720 type,
9721 &ag_mode,
9722 &nregs,
9723 &is_ha))
9725 /* TYPE passed in fp/simd registers. */
9726 if (!TARGET_FLOAT)
9727 aarch64_err_no_fpadvsimd (mode, "varargs");
9729 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9730 unshare_expr (valist), f_vrtop, NULL_TREE);
9731 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9732 unshare_expr (valist), f_vroff, NULL_TREE);
9734 rsize = nregs * UNITS_PER_VREG;
9736 if (is_ha)
9738 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9739 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9741 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9742 && size < UNITS_PER_VREG)
9744 adjust = UNITS_PER_VREG - size;
9747 else
9749 /* TYPE passed in general registers. */
9750 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9751 unshare_expr (valist), f_grtop, NULL_TREE);
9752 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9753 unshare_expr (valist), f_groff, NULL_TREE);
9754 rsize = ROUND_UP (size, UNITS_PER_WORD);
9755 nregs = rsize / UNITS_PER_WORD;
9757 if (align > 8)
9758 dw_align = true;
9760 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9761 && size < UNITS_PER_WORD)
9763 adjust = UNITS_PER_WORD - size;
9767 /* Get a local temporary for the field value. */
9768 off = get_initialized_tmp_var (f_off, pre_p, NULL);
9770 /* Emit code to branch if off >= 0. */
9771 t = build2 (GE_EXPR, boolean_type_node, off,
9772 build_int_cst (TREE_TYPE (off), 0));
9773 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9775 if (dw_align)
9777 /* Emit: offs = (offs + 15) & -16. */
9778 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9779 build_int_cst (TREE_TYPE (off), 15));
9780 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9781 build_int_cst (TREE_TYPE (off), -16));
9782 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9784 else
9785 roundup = NULL;
9787 /* Update ap.__[g|v]r_offs */
9788 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9789 build_int_cst (TREE_TYPE (off), rsize));
9790 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9792 /* String up. */
9793 if (roundup)
9794 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9796 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9797 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9798 build_int_cst (TREE_TYPE (f_off), 0));
9799 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9801 /* String up: make sure the assignment happens before the use. */
9802 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9803 COND_EXPR_ELSE (cond1) = t;
9805 /* Prepare the trees handling the argument that is passed on the stack;
9806 the top-level node will be stored in ON_STACK. */
9807 arg = get_initialized_tmp_var (stack, pre_p, NULL);
9808 if (align > 8)
9810 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9811 t = fold_convert (intDI_type_node, arg);
9812 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9813 build_int_cst (TREE_TYPE (t), 15));
9814 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9815 build_int_cst (TREE_TYPE (t), -16));
9816 t = fold_convert (TREE_TYPE (arg), t);
9817 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9819 else
9820 roundup = NULL;
9821 /* Advance ap.__stack */
9822 t = fold_convert (intDI_type_node, arg);
9823 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9824 build_int_cst (TREE_TYPE (t), size + 7));
9825 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9826 build_int_cst (TREE_TYPE (t), -8));
9827 t = fold_convert (TREE_TYPE (arg), t);
9828 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9829 /* String up roundup and advance. */
9830 if (roundup)
9831 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9832 /* String up with arg */
9833 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9834 /* Big-endianness related address adjustment. */
9835 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9836 && size < UNITS_PER_WORD)
9838 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9839 size_int (UNITS_PER_WORD - size));
9840 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9843 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9844 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9846 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9847 t = off;
9848 if (adjust)
9849 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9850 build_int_cst (TREE_TYPE (off), adjust));
9852 t = fold_convert (sizetype, t);
9853 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9855 if (is_ha)
9857 /* type ha; // treat as "struct {ftype field[n];}"
9858 ... [computing offs]
9859 for (i = 0; i < nregs; ++i, offs += 16)
9860 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9861 return ha; */
9862 int i;
9863 tree tmp_ha, field_t, field_ptr_t;
9865 /* Declare a local variable. */
9866 tmp_ha = create_tmp_var_raw (type, "ha");
9867 gimple_add_tmp_var (tmp_ha);
9869 /* Establish the base type. */
9870 switch (ag_mode)
9872 case SFmode:
9873 field_t = float_type_node;
9874 field_ptr_t = float_ptr_type_node;
9875 break;
9876 case DFmode:
9877 field_t = double_type_node;
9878 field_ptr_t = double_ptr_type_node;
9879 break;
9880 case TFmode:
9881 field_t = long_double_type_node;
9882 field_ptr_t = long_double_ptr_type_node;
9883 break;
9884 /* Half precision and quad precision are not fully supported yet.
9885 Enable the following code once that support is complete. We still
9886 need to find the correct type node for __fp16 *. */
9887 #if 0
9888 case HFmode:
9889 field_t = float_type_node;
9890 field_ptr_t = float_ptr_type_node;
9891 break;
9892 #endif
9893 case V2SImode:
9894 case V4SImode:
9896 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
9897 field_t = build_vector_type_for_mode (innertype, ag_mode);
9898 field_ptr_t = build_pointer_type (field_t);
9900 break;
9901 default:
9902 gcc_assert (0);
9905 /* *(field_ptr_t)&ha = *(field_ptr_t)vr_saved_area */
9906 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
9907 addr = t;
9908 t = fold_convert (field_ptr_t, addr);
9909 t = build2 (MODIFY_EXPR, field_t,
9910 build1 (INDIRECT_REF, field_t, tmp_ha),
9911 build1 (INDIRECT_REF, field_t, t));
9913 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
9914 for (i = 1; i < nregs; ++i)
9916 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
9917 u = fold_convert (field_ptr_t, addr);
9918 u = build2 (MODIFY_EXPR, field_t,
9919 build2 (MEM_REF, field_t, tmp_ha,
9920 build_int_cst (field_ptr_t,
9921 (i *
9922 int_size_in_bytes (field_t)))),
9923 build1 (INDIRECT_REF, field_t, u));
9924 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
9927 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
9928 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
9931 COND_EXPR_ELSE (cond2) = t;
9932 addr = fold_convert (build_pointer_type (type), cond1);
9933 addr = build_va_arg_indirect_ref (addr);
9935 if (indirect_p)
9936 addr = build_va_arg_indirect_ref (addr);
9938 return addr;
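/* The rounding idiom used twice above, (x + (align - 1)) & -align,
   rounds X up to the next multiple of ALIGN when ALIGN is a power of
   two: +15 & -16 for the 16-byte case and +7 & -8 for the stack-slot
   case.  A standalone check of the idiom:  */

#include <assert.h>
#include <stdint.h>

static intptr_t
round_up_pow2 (intptr_t x, intptr_t align)  /* ALIGN: power of two.  */
{
  return (x + (align - 1)) & -align;
}

int
main (void)
{
  assert (round_up_pow2 (0, 16) == 0);
  assert (round_up_pow2 (1, 16) == 16);    /* (1 + 15) & -16  */
  assert (round_up_pow2 (16, 16) == 16);
  assert (round_up_pow2 (17, 8) == 24);    /* (17 + 7) & -8   */
  return 0;
}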
9941 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
9943 static void
9944 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9945 tree type, int *pretend_size ATTRIBUTE_UNUSED,
9946 int no_rtl)
9948 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9949 CUMULATIVE_ARGS local_cum;
9950 int gr_saved = cfun->va_list_gpr_size;
9951 int vr_saved = cfun->va_list_fpr_size;
9953 /* The caller has advanced CUM up to, but not beyond, the last named
9954 argument. Advance a local copy of CUM past the last "real" named
9955 argument, to find out how many registers are left over. */
9956 local_cum = *cum;
9957 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
9959 /* Find out how many registers we need to save.
9960 Honor the tree-stdarg analysis results. */
9961 if (cfun->va_list_gpr_size)
9962 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
9963 cfun->va_list_gpr_size / UNITS_PER_WORD);
9964 if (cfun->va_list_fpr_size)
9965 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
9966 cfun->va_list_fpr_size / UNITS_PER_VREG);
9968 if (!TARGET_FLOAT)
9970 gcc_assert (local_cum.aapcs_nvrn == 0);
9971 vr_saved = 0;
9974 if (!no_rtl)
9976 if (gr_saved > 0)
9978 rtx ptr, mem;
9980 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
9981 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
9982 - gr_saved * UNITS_PER_WORD);
9983 mem = gen_frame_mem (BLKmode, ptr);
9984 set_mem_alias_set (mem, get_varargs_alias_set ());
9986 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
9987 mem, gr_saved);
9989 if (vr_saved > 0)
9991 /* We can't use move_block_from_reg, because it will use
9992 the wrong mode, storing D regs only. */
9993 machine_mode mode = TImode;
9994 int off, i, vr_start;
9996 /* Set OFF to the offset from virtual_incoming_args_rtx of
9997 the first vector register. The VR save area lies below
9998 the GR one, and is aligned to 16 bytes. */
9999 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10000 STACK_BOUNDARY / BITS_PER_UNIT);
10001 off -= vr_saved * UNITS_PER_VREG;
10003 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10004 for (i = 0; i < vr_saved; ++i)
10006 rtx ptr, mem;
10008 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10009 mem = gen_frame_mem (mode, ptr);
10010 set_mem_alias_set (mem, get_varargs_alias_set ());
10011 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10012 off += UNITS_PER_VREG;
10017 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10018 any complication of having crtl->args.pretend_args_size changed. */
10019 cfun->machine->frame.saved_varargs_size
10020 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10021 STACK_BOUNDARY / BITS_PER_UNIT)
10022 + vr_saved * UNITS_PER_VREG);
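/* For example, with gr_saved == 3 and vr_saved == 2 this comes to
   ROUND_UP (3 * 8, 16) + 2 * 16 = 32 + 32 = 64 bytes of save area.  */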
10025 static void
10026 aarch64_conditional_register_usage (void)
10028 int i;
10029 if (!TARGET_FLOAT)
10031 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10033 fixed_regs[i] = 1;
10034 call_used_regs[i] = 1;
10039 /* Walk down the type tree of TYPE counting consecutive base elements.
10040 If *MODEP is VOIDmode, then set it to the first valid floating point
10041 type. If a non-floating point type is found, or if a floating point
10042 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10043 otherwise return the count in the sub-tree. */
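/* For example, a homogeneous aggregate such as

     struct hfa { double x; double y; double z; };

   yields a count of 3 with *MODEP set to DFmode, whereas a struct mixing
   float and double fields makes the walk return -1.  */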
10044 static int
10045 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10047 machine_mode mode;
10048 HOST_WIDE_INT size;
10050 switch (TREE_CODE (type))
10052 case REAL_TYPE:
10053 mode = TYPE_MODE (type);
10054 if (mode != DFmode && mode != SFmode && mode != TFmode)
10055 return -1;
10057 if (*modep == VOIDmode)
10058 *modep = mode;
10060 if (*modep == mode)
10061 return 1;
10063 break;
10065 case COMPLEX_TYPE:
10066 mode = TYPE_MODE (TREE_TYPE (type));
10067 if (mode != DFmode && mode != SFmode && mode != TFmode)
10068 return -1;
10070 if (*modep == VOIDmode)
10071 *modep = mode;
10073 if (*modep == mode)
10074 return 2;
10076 break;
10078 case VECTOR_TYPE:
10079 /* Use V2SImode and V4SImode as representatives of all 64-bit
10080 and 128-bit vector types. */
10081 size = int_size_in_bytes (type);
10082 switch (size)
10084 case 8:
10085 mode = V2SImode;
10086 break;
10087 case 16:
10088 mode = V4SImode;
10089 break;
10090 default:
10091 return -1;
10094 if (*modep == VOIDmode)
10095 *modep = mode;
10097 /* Vector modes are considered to be opaque: two vectors are
10098 equivalent for the purposes of being homogeneous aggregates
10099 if they are the same size. */
10100 if (*modep == mode)
10101 return 1;
10103 break;
10105 case ARRAY_TYPE:
10107 int count;
10108 tree index = TYPE_DOMAIN (type);
10110 /* Can't handle incomplete types nor sizes that are not
10111 fixed. */
10112 if (!COMPLETE_TYPE_P (type)
10113 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10114 return -1;
10116 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10117 if (count == -1
10118 || !index
10119 || !TYPE_MAX_VALUE (index)
10120 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10121 || !TYPE_MIN_VALUE (index)
10122 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10123 || count < 0)
10124 return -1;
10126 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10127 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10129 /* There must be no padding. */
10130 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10131 return -1;
10133 return count;
10136 case RECORD_TYPE:
10138 int count = 0;
10139 int sub_count;
10140 tree field;
10142 /* Can't handle incomplete types nor sizes that are not
10143 fixed. */
10144 if (!COMPLETE_TYPE_P (type)
10145 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10146 return -1;
10148 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10150 if (TREE_CODE (field) != FIELD_DECL)
10151 continue;
10153 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10154 if (sub_count < 0)
10155 return -1;
10156 count += sub_count;
10159 /* There must be no padding. */
10160 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10161 return -1;
10163 return count;
10166 case UNION_TYPE:
10167 case QUAL_UNION_TYPE:
10169 /* These aren't very interesting except in a degenerate case. */
10170 int count = 0;
10171 int sub_count;
10172 tree field;
10174 /* Can't handle incomplete types nor sizes that are not
10175 fixed. */
10176 if (!COMPLETE_TYPE_P (type)
10177 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10178 return -1;
10180 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10182 if (TREE_CODE (field) != FIELD_DECL)
10183 continue;
10185 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10186 if (sub_count < 0)
10187 return -1;
10188 count = count > sub_count ? count : sub_count;
10191 /* There must be no padding. */
10192 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10193 return -1;
10195 return count;
10198 default:
10199 break;
10202 return -1;
10205 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10206 type as described in AAPCS64 \S 4.1.2.
10208 See the comment above aarch64_composite_type_p for the notes on MODE. */
10210 static bool
10211 aarch64_short_vector_p (const_tree type,
10212 machine_mode mode)
10214 HOST_WIDE_INT size = -1;
10216 if (type && TREE_CODE (type) == VECTOR_TYPE)
10217 size = int_size_in_bytes (type);
10218 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10219 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10220 size = GET_MODE_SIZE (mode);
10222 return (size == 8 || size == 16);
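/* For instance, the arm_neon.h types int32x2_t (8 bytes) and float32x4_t
   (16 bytes) are short vectors in this sense, while a 32-byte GNU vector
   type is not.  */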
10225 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10226 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10227 array types. The C99 floating-point complex types are also considered
10228 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10229 types, which are GCC extensions and out of the scope of AAPCS64, are
10230 treated as composite types here as well.
10232 Note that MODE itself is not sufficient in determining whether a type
10233 is such a composite type or not. This is because
10234 stor-layout.c:compute_record_mode may have already changed the MODE
10235 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10236 structure with only one field may have its MODE set to the mode of the
10237 field. Also an integer mode whose size matches the size of the
10238 RECORD_TYPE type may be used to substitute the original mode
10239 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10240 solely relied on. */
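/* Concretely, for a declaration such as struct wrap { float f; },
   stor-layout may give the struct SFmode, yet it must still be treated
   as a composite type rather than as an SFmode scalar.  */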
10242 static bool
10243 aarch64_composite_type_p (const_tree type,
10244 machine_mode mode)
10246 if (aarch64_short_vector_p (type, mode))
10247 return false;
10249 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10250 return true;
10252 if (mode == BLKmode
10253 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10254 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10255 return true;
10257 return false;
10260 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10261 shall be passed or returned in simd/fp register(s) (providing these
10262 parameter passing registers are available).
10264 Upon successful return, *COUNT returns the number of needed registers,
10265 *BASE_MODE returns the mode of the individual register and when IS_HA
10266 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10267 floating-point aggregate or a homogeneous short-vector aggregate. */
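/* For example, an HFA of three floats gives *COUNT == 3, *BASE_MODE ==
   SFmode and *IS_HA == true; a _Complex double argument gives
   *COUNT == 2, *BASE_MODE == DFmode and *IS_HA == true.  */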
10269 static bool
10270 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10271 const_tree type,
10272 machine_mode *base_mode,
10273 int *count,
10274 bool *is_ha)
10276 machine_mode new_mode = VOIDmode;
10277 bool composite_p = aarch64_composite_type_p (type, mode);
10279 if (is_ha != NULL) *is_ha = false;
10281 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10282 || aarch64_short_vector_p (type, mode))
10284 *count = 1;
10285 new_mode = mode;
10287 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10289 if (is_ha != NULL) *is_ha = true;
10290 *count = 2;
10291 new_mode = GET_MODE_INNER (mode);
10293 else if (type && composite_p)
10295 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10297 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10299 if (is_ha != NULL) *is_ha = true;
10300 *count = ag_count;
10302 else
10303 return false;
10305 else
10306 return false;
10308 *base_mode = new_mode;
10309 return true;
10312 /* Implement TARGET_STRUCT_VALUE_RTX. */
10314 static rtx
10315 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10316 int incoming ATTRIBUTE_UNUSED)
10318 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10321 /* Implements target hook vector_mode_supported_p. */
10322 static bool
10323 aarch64_vector_mode_supported_p (machine_mode mode)
10325 if (TARGET_SIMD
10326 && (mode == V4SImode || mode == V8HImode
10327 || mode == V16QImode || mode == V2DImode
10328 || mode == V2SImode || mode == V4HImode
10329 || mode == V8QImode || mode == V2SFmode
10330 || mode == V4SFmode || mode == V2DFmode
10331 || mode == V4HFmode || mode == V8HFmode
10332 || mode == V1DFmode))
10333 return true;
10335 return false;
10338 /* Return the appropriate SIMD container mode
10339 for MODE within a vector of WIDTH bits. */
10340 static machine_mode
10341 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10343 gcc_assert (width == 64 || width == 128);
10344 if (TARGET_SIMD)
10346 if (width == 128)
10347 switch (mode)
10349 case DFmode:
10350 return V2DFmode;
10351 case SFmode:
10352 return V4SFmode;
10353 case SImode:
10354 return V4SImode;
10355 case HImode:
10356 return V8HImode;
10357 case QImode:
10358 return V16QImode;
10359 case DImode:
10360 return V2DImode;
10361 default:
10362 break;
10364 else
10365 switch (mode)
10367 case SFmode:
10368 return V2SFmode;
10369 case SImode:
10370 return V2SImode;
10371 case HImode:
10372 return V4HImode;
10373 case QImode:
10374 return V8QImode;
10375 default:
10376 break;
10379 return word_mode;
10382 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10383 static machine_mode
10384 aarch64_preferred_simd_mode (machine_mode mode)
10386 return aarch64_simd_container_mode (mode, 128);
10389 /* Return the bitmask of possible vector sizes for the vectorizer
10390 to iterate over. */
10391 static unsigned int
10392 aarch64_autovectorize_vector_sizes (void)
10394 return (16 | 8);
10397 /* Implement TARGET_MANGLE_TYPE. */
10399 static const char *
10400 aarch64_mangle_type (const_tree type)
10402 /* The AArch64 ABI documents say that "__va_list" has to be
10403 mangled as if it is in the "std" namespace. */
10404 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10405 return "St9__va_list";
10407 /* Half-precision float. */
10408 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10409 return "Dh";
10411 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10412 builtin types. */
10413 if (TYPE_NAME (type) != NULL)
10414 return aarch64_mangle_builtin_type (type);
10416 /* Use the default mangling. */
10417 return NULL;
10421 /* Return true if the rtx_insn contains a MEM RTX somewhere
10422 in it. */
10424 static bool
10425 has_memory_op (rtx_insn *mem_insn)
10427 subrtx_iterator::array_type array;
10428 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10429 if (MEM_P (*iter))
10430 return true;
10432 return false;
10435 /* Find the first rtx_insn before insn that will generate an assembly
10436 instruction. */
10438 static rtx_insn *
10439 aarch64_prev_real_insn (rtx_insn *insn)
10441 if (!insn)
10442 return NULL;
10446 insn = prev_real_insn (insn);
10448 while (insn && recog_memoized (insn) < 0);
10450 return insn;
10453 static bool
10454 is_madd_op (enum attr_type t1)
10456 unsigned int i;
10457 /* A number of these may be AArch32 only. */
10458 enum attr_type mlatypes[] = {
10459 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10460 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10461 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10464 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10466 if (t1 == mlatypes[i])
10467 return true;
10470 return false;
10473 /* Check if there is a register dependency between a load and the insn
10474 for which we hold recog_data. */
10476 static bool
10477 dep_between_memop_and_curr (rtx memop)
10479 rtx load_reg;
10480 int opno;
10482 gcc_assert (GET_CODE (memop) == SET);
10484 if (!REG_P (SET_DEST (memop)))
10485 return false;
10487 load_reg = SET_DEST (memop);
10488 for (opno = 1; opno < recog_data.n_operands; opno++)
10490 rtx operand = recog_data.operand[opno];
10491 if (REG_P (operand)
10492 && reg_overlap_mentioned_p (load_reg, operand))
10493 return true;
10496 return false;
10500 /* When working around the Cortex-A53 erratum 835769,
10501 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10502 instruction and has a preceding memory instruction such that a NOP
10503 should be inserted between them. */
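/* A typical resulting sequence, with registers chosen purely for
   illustration, is:

       ldr  x1, [x2]
       nop              // between mem op and mult-accumulate
       madd x0, x3, x4, x5  */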
10505 bool
10506 aarch64_madd_needs_nop (rtx_insn* insn)
10508 enum attr_type attr_type;
10509 rtx_insn *prev;
10510 rtx body;
10512 if (!TARGET_FIX_ERR_A53_835769)
10513 return false;
10515 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10516 return false;
10518 attr_type = get_attr_type (insn);
10519 if (!is_madd_op (attr_type))
10520 return false;
10522 prev = aarch64_prev_real_insn (insn);
10523 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10524 Restore recog state to INSN to avoid state corruption. */
10525 extract_constrain_insn_cached (insn);
10527 if (!prev || !has_memory_op (prev))
10528 return false;
10530 body = single_set (prev);
10532 /* If the previous insn is a memory op and there is no dependency between
10533 it and the DImode madd, emit a NOP between them. If body is NULL then we
10534 have a complex memory operation, probably a load/store pair.
10535 Be conservative for now and emit a NOP. */
10536 if (GET_MODE (recog_data.operand[0]) == DImode
10537 && (!body || !dep_between_memop_and_curr (body)))
10538 return true;
10540 return false;
10545 /* Implement FINAL_PRESCAN_INSN. */
10547 void
10548 aarch64_final_prescan_insn (rtx_insn *insn)
10550 if (aarch64_madd_needs_nop (insn))
10551 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
10555 /* Return the element size letter ('b', 'h', 's' or 'd') for SIZE bits. */
10556 static char
10557 sizetochar (int size)
10559 switch (size)
10561 case 64: return 'd';
10562 case 32: return 's';
10563 case 16: return 'h';
10564 case 8 : return 'b';
10565 default: gcc_unreachable ();
10569 /* Return true iff x is a uniform vector of floating-point
10570 constants, and the constant can be represented in
10571 quarter-precision form. Note that, as aarch64_float_const_representable_p
10572 rejects both +0.0 and -0.0, we will also reject them here. */
10573 static bool
10574 aarch64_vect_float_const_representable_p (rtx x)
10576 rtx elt;
10577 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10578 && const_vec_duplicate_p (x, &elt)
10579 && aarch64_float_const_representable_p (elt));
10582 /* Return true if OP is a valid AdvSIMD immediate for MODE; fill in *INFO if it is nonnull. */
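/* For illustration: the V4SImode constant with every element equal to
   0x00ff0000 is accepted, and INFO then records a 32-bit element width,
   value 0xff, shift 16 and mvn == false, corresponding to
   "movi v0.4s, 0xff, lsl 16" (v0 used here only as an example register).  */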
10583 bool
10584 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10585 struct simd_immediate_info *info)
10587 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10588 matches = 1; \
10589 for (i = 0; i < idx; i += (STRIDE)) \
10590 if (!(TEST)) \
10591 matches = 0; \
10592 if (matches) \
10594 immtype = (CLASS); \
10595 elsize = (ELSIZE); \
10596 eshift = (SHIFT); \
10597 emvn = (NEG); \
10598 break; \
10601 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10602 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10603 unsigned char bytes[16];
10604 int immtype = -1, matches;
10605 unsigned int invmask = inverse ? 0xff : 0;
10606 int eshift, emvn;
10608 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10610 if (! (aarch64_simd_imm_zero_p (op, mode)
10611 || aarch64_vect_float_const_representable_p (op)))
10612 return false;
10614 if (info)
10616 info->value = CONST_VECTOR_ELT (op, 0);
10617 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10618 info->mvn = false;
10619 info->shift = 0;
10622 return true;
10625 /* Splat vector constant out into a byte vector. */
10626 for (i = 0; i < n_elts; i++)
10628 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
10629 it must be laid out in the vector register in reverse order. */
10630 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10631 unsigned HOST_WIDE_INT elpart;
10633 gcc_assert (CONST_INT_P (el));
10634 elpart = INTVAL (el);
10636 for (unsigned int byte = 0; byte < innersize; byte++)
10638 bytes[idx++] = (elpart & 0xff) ^ invmask;
10639 elpart >>= BITS_PER_UNIT;
10644 /* Sanity check. */
10645 gcc_assert (idx == GET_MODE_SIZE (mode));
10649 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10650 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10652 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10653 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10655 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10656 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10658 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10659 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10661 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10663 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10665 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10666 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10668 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10669 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10671 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10672 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10674 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10675 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10677 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10679 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10681 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10682 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10684 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10685 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10687 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10688 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10690 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10691 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10693 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10695 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10696 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10698 while (0);
10700 if (immtype == -1)
10701 return false;
10703 if (info)
10705 info->element_width = elsize;
10706 info->mvn = emvn != 0;
10707 info->shift = eshift;
10709 unsigned HOST_WIDE_INT imm = 0;
10711 if (immtype >= 12 && immtype <= 15)
10712 info->msl = true;
10714 /* Un-invert bytes of recognized vector, if necessary. */
10715 if (invmask != 0)
10716 for (i = 0; i < idx; i++)
10717 bytes[i] ^= invmask;
10719 if (immtype == 17)
10721 /* FIXME: Broken on 32-bit H_W_I hosts. */
10722 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10724 for (i = 0; i < 8; i++)
10725 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10726 << (i * BITS_PER_UNIT);
10729 info->value = GEN_INT (imm);
10731 else
10733 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10734 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10736 /* Construct 'abcdefgh' because the assembler cannot handle
10737 generic constants. */
10738 if (info->mvn)
10739 imm = ~imm;
10740 imm = (imm >> info->shift) & 0xff;
10741 info->value = GEN_INT (imm);
10745 return true;
10746 #undef CHECK
10749 /* Check if immediate shift constants are within range. */
10750 bool
10751 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10753 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10754 if (left)
10755 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10756 else
10757 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
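/* For example, for V4SImode the valid immediate left-shift counts are
   0..31 and the valid right-shift counts are 1..32.  */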
10760 /* Return true if X is a uniform vector where all elements
10761 are either the floating-point constant 0.0 or the
10762 integer constant 0. */
10763 bool
10764 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10766 return x == CONST0_RTX (mode);
10770 /* Return the bitmask CONST_INT to select the bits required by a zero extract
10771 operation of width WIDTH at bit position POS. */
10774 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
10776 gcc_assert (CONST_INT_P (width));
10777 gcc_assert (CONST_INT_P (pos));
10779 unsigned HOST_WIDE_INT mask
10780 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
10781 return GEN_INT (mask << UINTVAL (pos));
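/* E.g. WIDTH == 8 and POS == 16 give ((1 << 8) - 1) << 16 == 0xff0000.  */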
10784 bool
10785 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10787 HOST_WIDE_INT imm = INTVAL (x);
10788 int i;
10790 for (i = 0; i < 8; i++)
10792 unsigned int byte = imm & 0xff;
10793 if (byte != 0xff && byte != 0)
10794 return false;
10795 imm >>= 8;
10798 return true;
10801 bool
10802 aarch64_mov_operand_p (rtx x, machine_mode mode)
10804 if (GET_CODE (x) == HIGH
10805 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10806 return true;
10808 if (CONST_INT_P (x))
10809 return true;
10811 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10812 return true;
10814 return aarch64_classify_symbolic_expression (x)
10815 == SYMBOL_TINY_ABSOLUTE;
10818 /* Return a const_int vector of VAL. */
10820 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10822 int nunits = GET_MODE_NUNITS (mode);
10823 rtvec v = rtvec_alloc (nunits);
10824 int i;
10826 for (i=0; i < nunits; i++)
10827 RTVEC_ELT (v, i) = GEN_INT (val);
10829 return gen_rtx_CONST_VECTOR (mode, v);
10832 /* Check OP is a legal scalar immediate for the MOVI instruction. */
10834 bool
10835 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10837 machine_mode vmode;
10839 gcc_assert (!VECTOR_MODE_P (mode));
10840 vmode = aarch64_preferred_simd_mode (mode);
10841 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10842 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10845 /* Construct and return a PARALLEL RTX vector with elements numbering the
10846 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10847 the vector - from the perspective of the architecture. This does not
10848 line up with GCC's perspective on lane numbers, so we end up with
10849 different masks depending on our target endian-ness. The diagram
10850 below may help. We must draw the distinction when building masks
10851 which select one half of the vector. An instruction selecting
10852 architectural low-lanes for a big-endian target, must be described using
10853 a mask selecting GCC high-lanes.
10855 Big-Endian Little-Endian
10857 GCC 0 1 2 3 3 2 1 0
10858 | x | x | x | x | | x | x | x | x |
10859 Architecture 3 2 1 0 3 2 1 0
10861 Low Mask: { 2, 3 } { 0, 1 }
10862 High Mask: { 0, 1 } { 2, 3 }
10866 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10868 int nunits = GET_MODE_NUNITS (mode);
10869 rtvec v = rtvec_alloc (nunits / 2);
10870 int high_base = nunits / 2;
10871 int low_base = 0;
10872 int base;
10873 rtx t1;
10874 int i;
10876 if (BYTES_BIG_ENDIAN)
10877 base = high ? low_base : high_base;
10878 else
10879 base = high ? high_base : low_base;
10881 for (i = 0; i < nunits / 2; i++)
10882 RTVEC_ELT (v, i) = GEN_INT (base + i);
10884 t1 = gen_rtx_PARALLEL (mode, v);
10885 return t1;
10888 /* Check OP for validity as a PARALLEL RTX vector with elements
10889 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
10890 from the perspective of the architecture. See the diagram above
10891 aarch64_simd_vect_par_cnst_half for more details. */
10893 bool
10894 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
10895 bool high)
10897 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
10898 HOST_WIDE_INT count_op = XVECLEN (op, 0);
10899 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
10900 int i = 0;
10902 if (!VECTOR_MODE_P (mode))
10903 return false;
10905 if (count_op != count_ideal)
10906 return false;
10908 for (i = 0; i < count_ideal; i++)
10910 rtx elt_op = XVECEXP (op, 0, i);
10911 rtx elt_ideal = XVECEXP (ideal, 0, i);
10913 if (!CONST_INT_P (elt_op)
10914 || INTVAL (elt_ideal) != INTVAL (elt_op))
10915 return false;
10917 return true;
10920 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
10921 HIGH (exclusive). */
10922 void
10923 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
10924 const_tree exp)
10926 HOST_WIDE_INT lane;
10927 gcc_assert (CONST_INT_P (operand));
10928 lane = INTVAL (operand);
10930 if (lane < low || lane >= high)
10932 if (exp)
10933 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
10934 else
10935 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
10939 /* Return TRUE if OP is a valid vector addressing mode. */
10940 bool
10941 aarch64_simd_mem_operand_p (rtx op)
10943 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
10944 || REG_P (XEXP (op, 0)));
10947 /* Emit a register copy from operand to operand, taking care not to
10948 early-clobber source registers in the process.
10950 COUNT is the number of components into which the copy needs to be
10951 decomposed. */
10952 void
10953 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
10954 unsigned int count)
10956 unsigned int i;
10957 int rdest = REGNO (operands[0]);
10958 int rsrc = REGNO (operands[1]);
10960 if (!reg_overlap_mentioned_p (operands[0], operands[1])
10961 || rdest < rsrc)
10962 for (i = 0; i < count; i++)
10963 emit_move_insn (gen_rtx_REG (mode, rdest + i),
10964 gen_rtx_REG (mode, rsrc + i));
10965 else
10966 for (i = 0; i < count; i++)
10967 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
10968 gen_rtx_REG (mode, rsrc + count - i - 1));
10971 /* Compute and return the length (in bytes) of aarch64_simd_reglist<mode>,
10972 where <mode> is one of the VSTRUCT modes: OI, CI, or XI. */
10974 aarch64_simd_attr_length_rglist (enum machine_mode mode)
10976 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
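/* E.g. OImode spans two vector registers, giving 2 * 4 == 8; CImode gives
   12 and XImode 16, i.e. one 4-byte instruction per register.  */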
10979 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
10980 alignment of a vector to 128 bits. */
10981 static HOST_WIDE_INT
10982 aarch64_simd_vector_alignment (const_tree type)
10984 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
10985 return MIN (align, 128);
10988 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
10989 static bool
10990 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
10992 if (is_packed)
10993 return false;
10995 /* We guarantee alignment for vectors up to 128-bits. */
10996 if (tree_int_cst_compare (TYPE_SIZE (type),
10997 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
10998 return false;
11000 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11001 return true;
11004 /* If VALS is a vector constant that can be loaded into a register
11005 using DUP, generate instructions to do so and return an RTX to
11006 assign to the register. Otherwise return NULL_RTX. */
11007 static rtx
11008 aarch64_simd_dup_constant (rtx vals)
11010 machine_mode mode = GET_MODE (vals);
11011 machine_mode inner_mode = GET_MODE_INNER (mode);
11012 rtx x;
11014 if (!const_vec_duplicate_p (vals, &x))
11015 return NULL_RTX;
11017 /* We can load this constant by using DUP and a constant in a
11018 single ARM register. This will be cheaper than a vector
11019 load. */
11020 x = copy_to_mode_reg (inner_mode, x);
11021 return gen_rtx_VEC_DUPLICATE (mode, x);
11025 /* Generate code to load VALS, which is a PARALLEL containing only
11026 constants (for vec_init) or CONST_VECTOR, efficiently into a
11027 register. Returns an RTX to copy into the register, or NULL_RTX
11028 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11029 static rtx
11030 aarch64_simd_make_constant (rtx vals)
11032 machine_mode mode = GET_MODE (vals);
11033 rtx const_dup;
11034 rtx const_vec = NULL_RTX;
11035 int n_elts = GET_MODE_NUNITS (mode);
11036 int n_const = 0;
11037 int i;
11039 if (GET_CODE (vals) == CONST_VECTOR)
11040 const_vec = vals;
11041 else if (GET_CODE (vals) == PARALLEL)
11043 /* A CONST_VECTOR must contain only CONST_INTs and
11044 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11045 Only store valid constants in a CONST_VECTOR. */
11046 for (i = 0; i < n_elts; ++i)
11048 rtx x = XVECEXP (vals, 0, i);
11049 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11050 n_const++;
11052 if (n_const == n_elts)
11053 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11055 else
11056 gcc_unreachable ();
11058 if (const_vec != NULL_RTX
11059 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11060 /* Load using MOVI/MVNI. */
11061 return const_vec;
11062 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11063 /* Loaded using DUP. */
11064 return const_dup;
11065 else if (const_vec != NULL_RTX)
11066 /* Load from constant pool. We can not take advantage of single-cycle
11067 LD1 because we need a PC-relative addressing mode. */
11068 return const_vec;
11069 else
11070 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11071 We can not construct an initializer. */
11072 return NULL_RTX;
11075 /* Expand a vector initialisation sequence, such that TARGET is
11076 initialised to contain VALS. */
11078 void
11079 aarch64_expand_vector_init (rtx target, rtx vals)
11081 machine_mode mode = GET_MODE (target);
11082 machine_mode inner_mode = GET_MODE_INNER (mode);
11083 /* The number of vector elements. */
11084 int n_elts = GET_MODE_NUNITS (mode);
11085 /* The number of vector elements which are not constant. */
11086 int n_var = 0;
11087 rtx any_const = NULL_RTX;
11088 /* The first element of vals. */
11089 rtx v0 = XVECEXP (vals, 0, 0);
11090 bool all_same = true;
11092 /* Count the number of variable elements to initialise. */
11093 for (int i = 0; i < n_elts; ++i)
11095 rtx x = XVECEXP (vals, 0, i);
11096 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11097 ++n_var;
11098 else
11099 any_const = x;
11101 all_same &= rtx_equal_p (x, v0);
11104 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11105 how best to handle this. */
11106 if (n_var == 0)
11108 rtx constant = aarch64_simd_make_constant (vals);
11109 if (constant != NULL_RTX)
11111 emit_move_insn (target, constant);
11112 return;
11116 /* Splat a single non-constant element if we can. */
11117 if (all_same)
11119 rtx x = copy_to_mode_reg (inner_mode, v0);
11120 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11121 return;
11124 /* Initialise a vector which is part-variable. We want to first try
11125 to build those lanes which are constant in the most efficient way we
11126 can. */
11127 if (n_var != n_elts)
11129 rtx copy = copy_rtx (vals);
11131 /* Load constant part of vector. We really don't care what goes into the
11132 parts we will overwrite, but we're more likely to be able to load the
11133 constant efficiently if it has fewer, larger, repeating parts
11134 (see aarch64_simd_valid_immediate). */
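/* For instance, for VALS == { X, 1, 2, 3 } with X variable, lane 0 is
   temporarily filled from lane 0 ^ 2, i.e. with the constant 2, so the
   vector { 2, 1, 2, 3 } is materialised first and lane 0 is then
   overwritten with X by the vec_set loop below.  */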
11135 for (int i = 0; i < n_elts; i++)
11137 rtx x = XVECEXP (vals, 0, i);
11138 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11139 continue;
11140 rtx subst = any_const;
11141 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11143 /* Look in the copied vector, as more elements are const. */
11144 rtx test = XVECEXP (copy, 0, i ^ bit);
11145 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11147 subst = test;
11148 break;
11151 XVECEXP (copy, 0, i) = subst;
11153 aarch64_expand_vector_init (target, copy);
11156 /* Insert the variable lanes directly. */
11158 enum insn_code icode = optab_handler (vec_set_optab, mode);
11159 gcc_assert (icode != CODE_FOR_nothing);
11161 for (int i = 0; i < n_elts; i++)
11163 rtx x = XVECEXP (vals, 0, i);
11164 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11165 continue;
11166 x = copy_to_mode_reg (inner_mode, x);
11167 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11171 static unsigned HOST_WIDE_INT
11172 aarch64_shift_truncation_mask (machine_mode mode)
11174 return
11175 (!SHIFT_COUNT_TRUNCATED
11176 || aarch64_vector_mode_supported_p (mode)
11177 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11180 /* Select a format to encode pointers in exception handling data. */
11182 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11184 int type;
11185 switch (aarch64_cmodel)
11187 case AARCH64_CMODEL_TINY:
11188 case AARCH64_CMODEL_TINY_PIC:
11189 case AARCH64_CMODEL_SMALL:
11190 case AARCH64_CMODEL_SMALL_PIC:
11191 case AARCH64_CMODEL_SMALL_SPIC:
11192 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11193 for everything. */
11194 type = DW_EH_PE_sdata4;
11195 break;
11196 default:
11197 /* No assumptions here. 8-byte relocs required. */
11198 type = DW_EH_PE_sdata8;
11199 break;
11201 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11204 /* The last .arch and .tune assembly strings that we printed. */
11205 static std::string aarch64_last_printed_arch_string;
11206 static std::string aarch64_last_printed_tune_string;
11208 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11209 by the function fndecl. */
11211 void
11212 aarch64_declare_function_name (FILE *stream, const char* name,
11213 tree fndecl)
11215 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11217 struct cl_target_option *targ_options;
11218 if (target_parts)
11219 targ_options = TREE_TARGET_OPTION (target_parts);
11220 else
11221 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11222 gcc_assert (targ_options);
11224 const struct processor *this_arch
11225 = aarch64_get_arch (targ_options->x_explicit_arch);
11227 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11228 std::string extension
11229 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11230 this_arch->flags);
11231 /* Only update the assembler .arch string if it is distinct from the last
11232 such string we printed. */
11233 std::string to_print = this_arch->name + extension;
11234 if (to_print != aarch64_last_printed_arch_string)
11236 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11237 aarch64_last_printed_arch_string = to_print;
11240 /* Print the cpu name we're tuning for in the comments; it might be
11241 useful to readers of the generated asm. Do it only when it changes
11242 from function to function and verbose assembly is requested. */
11243 const struct processor *this_tune
11244 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11246 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11248 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11249 this_tune->name);
11250 aarch64_last_printed_tune_string = this_tune->name;
11253 /* Don't forget the type directive for ELF. */
11254 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11255 ASM_OUTPUT_LABEL (stream, name);
11258 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11260 static void
11261 aarch64_start_file (void)
11263 struct cl_target_option *default_options
11264 = TREE_TARGET_OPTION (target_option_default_node);
11266 const struct processor *default_arch
11267 = aarch64_get_arch (default_options->x_explicit_arch);
11268 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11269 std::string extension
11270 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11271 default_arch->flags);
11273 aarch64_last_printed_arch_string = default_arch->name + extension;
11274 aarch64_last_printed_tune_string = "";
11275 asm_fprintf (asm_out_file, "\t.arch %s\n",
11276 aarch64_last_printed_arch_string.c_str ());
11278 default_file_start ();
11281 /* Emit load exclusive. */
11283 static void
11284 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11285 rtx mem, rtx model_rtx)
11287 rtx (*gen) (rtx, rtx, rtx);
11289 switch (mode)
11291 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11292 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11293 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11294 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11295 default:
11296 gcc_unreachable ();
11299 emit_insn (gen (rval, mem, model_rtx));
11302 /* Emit store exclusive. */
11304 static void
11305 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11306 rtx rval, rtx mem, rtx model_rtx)
11308 rtx (*gen) (rtx, rtx, rtx, rtx);
11310 switch (mode)
11312 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11313 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11314 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11315 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11316 default:
11317 gcc_unreachable ();
11320 emit_insn (gen (bval, rval, mem, model_rtx));
11323 /* Mark the previous jump instruction as unlikely. */
11325 static void
11326 aarch64_emit_unlikely_jump (rtx insn)
11328 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11330 insn = emit_jump_insn (insn);
11331 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
11334 /* Expand a compare and swap pattern. */
11336 void
11337 aarch64_expand_compare_and_swap (rtx operands[])
11339 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11340 machine_mode mode, cmp_mode;
11341 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11342 int idx;
11343 gen_cas_fn gen;
11344 const gen_cas_fn split_cas[] =
11346 gen_aarch64_compare_and_swapqi,
11347 gen_aarch64_compare_and_swaphi,
11348 gen_aarch64_compare_and_swapsi,
11349 gen_aarch64_compare_and_swapdi
11351 const gen_cas_fn atomic_cas[] =
11353 gen_aarch64_compare_and_swapqi_lse,
11354 gen_aarch64_compare_and_swaphi_lse,
11355 gen_aarch64_compare_and_swapsi_lse,
11356 gen_aarch64_compare_and_swapdi_lse
11359 bval = operands[0];
11360 rval = operands[1];
11361 mem = operands[2];
11362 oldval = operands[3];
11363 newval = operands[4];
11364 is_weak = operands[5];
11365 mod_s = operands[6];
11366 mod_f = operands[7];
11367 mode = GET_MODE (mem);
11368 cmp_mode = mode;
11370 /* Normally the succ memory model must be stronger than fail, but in the
11371 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11372 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11374 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11375 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11376 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11378 switch (mode)
11380 case QImode:
11381 case HImode:
11382 /* For short modes, we're going to perform the comparison in SImode,
11383 so do the zero-extension now. */
11384 cmp_mode = SImode;
11385 rval = gen_reg_rtx (SImode);
11386 oldval = convert_modes (SImode, mode, oldval, true);
11387 /* Fall through. */
11389 case SImode:
11390 case DImode:
11391 /* Force the value into a register if needed. */
11392 if (!aarch64_plus_operand (oldval, mode))
11393 oldval = force_reg (cmp_mode, oldval);
11394 break;
11396 default:
11397 gcc_unreachable ();
11400 switch (mode)
11402 case QImode: idx = 0; break;
11403 case HImode: idx = 1; break;
11404 case SImode: idx = 2; break;
11405 case DImode: idx = 3; break;
11406 default:
11407 gcc_unreachable ();
11409 if (TARGET_LSE)
11410 gen = atomic_cas[idx];
11411 else
11412 gen = split_cas[idx];
11414 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11416 if (mode == QImode || mode == HImode)
11417 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11419 x = gen_rtx_REG (CCmode, CC_REGNUM);
11420 x = gen_rtx_EQ (SImode, x, const0_rtx);
11421 emit_insn (gen_rtx_SET (bval, x));
11424 /* Test whether the target supports using an atomic load-operate instruction.
11425 CODE is the operation and AFTER is TRUE if the data in memory after the
11426 operation should be returned and FALSE if the data before the operation
11427 should be returned. Returns FALSE if the operation isn't supported by the
11428 architecture. */
11430 bool
11431 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11433 if (!TARGET_LSE)
11434 return false;
11436 switch (code)
11438 case SET:
11439 case AND:
11440 case IOR:
11441 case XOR:
11442 case MINUS:
11443 case PLUS:
11444 return true;
11445 default:
11446 return false;
11450 /* Emit a barrier that is appropriate for memory model MODEL at the end of a
11451 sequence implementing an atomic operation. */
11453 static void
11454 aarch64_emit_post_barrier (enum memmodel model)
11456 const enum memmodel base_model = memmodel_base (model);
11458 if (is_mm_sync (model)
11459 && (base_model == MEMMODEL_ACQUIRE
11460 || base_model == MEMMODEL_ACQ_REL
11461 || base_model == MEMMODEL_SEQ_CST))
11463 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11467 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11468 for the data in memory. EXPECTED is the value expected to be in memory.
11469 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11470 is the memory ordering to use. */
11472 void
11473 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11474 rtx expected, rtx desired,
11475 rtx model)
11477 rtx (*gen) (rtx, rtx, rtx, rtx);
11478 machine_mode mode;
11480 mode = GET_MODE (mem);
11482 switch (mode)
11484 case QImode: gen = gen_aarch64_atomic_casqi; break;
11485 case HImode: gen = gen_aarch64_atomic_cashi; break;
11486 case SImode: gen = gen_aarch64_atomic_cassi; break;
11487 case DImode: gen = gen_aarch64_atomic_casdi; break;
11488 default:
11489 gcc_unreachable ();
11492 /* Move the expected value into the CAS destination register. */
11493 emit_insn (gen_rtx_SET (rval, expected));
11495 /* Emit the CAS. */
11496 emit_insn (gen (rval, mem, desired, model));
11498 /* Compare the expected value with the value loaded by the CAS, to establish
11499 whether the swap was made. */
11500 aarch64_gen_compare_reg (EQ, rval, expected);
11503 /* Split a compare and swap pattern. */
11505 void
11506 aarch64_split_compare_and_swap (rtx operands[])
11508 rtx rval, mem, oldval, newval, scratch;
11509 machine_mode mode;
11510 bool is_weak;
11511 rtx_code_label *label1, *label2;
11512 rtx x, cond;
11513 enum memmodel model;
11514 rtx model_rtx;
11516 rval = operands[0];
11517 mem = operands[1];
11518 oldval = operands[2];
11519 newval = operands[3];
11520 is_weak = (operands[4] != const0_rtx);
11521 model_rtx = operands[5];
11522 scratch = operands[7];
11523 mode = GET_MODE (mem);
11524 model = memmodel_from_int (INTVAL (model_rtx));
11526 label1 = NULL;
11527 if (!is_weak)
11529 label1 = gen_label_rtx ();
11530 emit_label (label1);
11532 label2 = gen_label_rtx ();
11534 /* The initial load can be relaxed for a __sync operation since a final
11535 barrier will be emitted to stop code hoisting. */
11536 if (is_mm_sync (model))
11537 aarch64_emit_load_exclusive (mode, rval, mem,
11538 GEN_INT (MEMMODEL_RELAXED));
11539 else
11540 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11542 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11543 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11544 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11545 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11546 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11548 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11550 if (!is_weak)
11552 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11553 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11554 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11555 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11557 else
11559 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11560 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11561 emit_insn (gen_rtx_SET (cond, x));
11564 emit_label (label2);
11566 /* Emit any final barrier needed for a __sync operation. */
11567 if (is_mm_sync (model))
11568 aarch64_emit_post_barrier (model);
11571 /* Emit a BIC instruction. */
11573 static void
11574 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11576 rtx shift_rtx = GEN_INT (shift);
11577 rtx (*gen) (rtx, rtx, rtx, rtx);
11579 switch (mode)
11581 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11582 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11583 default:
11584 gcc_unreachable ();
11587 emit_insn (gen (dst, s2, shift_rtx, s1));
11590 /* Emit an atomic swap. */
11592 static void
11593 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11594 rtx mem, rtx model)
11596 rtx (*gen) (rtx, rtx, rtx, rtx);
11598 switch (mode)
11600 case QImode: gen = gen_aarch64_atomic_swpqi; break;
11601 case HImode: gen = gen_aarch64_atomic_swphi; break;
11602 case SImode: gen = gen_aarch64_atomic_swpsi; break;
11603 case DImode: gen = gen_aarch64_atomic_swpdi; break;
11604 default:
11605 gcc_unreachable ();
11608 emit_insn (gen (dst, mem, value, model));
11611 /* Operations supported by aarch64_emit_atomic_load_op. */
11613 enum aarch64_atomic_load_op_code
11615 AARCH64_LDOP_PLUS, /* A + B */
11616 AARCH64_LDOP_XOR, /* A ^ B */
11617 AARCH64_LDOP_OR, /* A | B */
11618 AARCH64_LDOP_BIC /* A & ~B */
11621 /* Emit an atomic load-operate. */
11623 static void
11624 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11625 machine_mode mode, rtx dst, rtx src,
11626 rtx mem, rtx model)
11628 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11629 const aarch64_atomic_load_op_fn plus[] =
11631 gen_aarch64_atomic_loadaddqi,
11632 gen_aarch64_atomic_loadaddhi,
11633 gen_aarch64_atomic_loadaddsi,
11634 gen_aarch64_atomic_loadadddi
11636 const aarch64_atomic_load_op_fn eor[] =
11638 gen_aarch64_atomic_loadeorqi,
11639 gen_aarch64_atomic_loadeorhi,
11640 gen_aarch64_atomic_loadeorsi,
11641 gen_aarch64_atomic_loadeordi
11643 const aarch64_atomic_load_op_fn ior[] =
11645 gen_aarch64_atomic_loadsetqi,
11646 gen_aarch64_atomic_loadsethi,
11647 gen_aarch64_atomic_loadsetsi,
11648 gen_aarch64_atomic_loadsetdi
11650 const aarch64_atomic_load_op_fn bic[] =
11652 gen_aarch64_atomic_loadclrqi,
11653 gen_aarch64_atomic_loadclrhi,
11654 gen_aarch64_atomic_loadclrsi,
11655 gen_aarch64_atomic_loadclrdi
11657 aarch64_atomic_load_op_fn gen;
11658 int idx = 0;
11660 switch (mode)
11662 case QImode: idx = 0; break;
11663 case HImode: idx = 1; break;
11664 case SImode: idx = 2; break;
11665 case DImode: idx = 3; break;
11666 default:
11667 gcc_unreachable ();
11670 switch (code)
11672 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11673 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11674 case AARCH64_LDOP_OR: gen = ior[idx]; break;
11675 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11676 default:
11677 gcc_unreachable ();
11680 emit_insn (gen (dst, mem, src, model));
11683 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
11684 location to store the data read from memory. OUT_RESULT is the location to
11685 store the result of the operation. MEM is the memory location to read and
11686 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
11687 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
11688 be NULL. */
11690 void
11691 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11692 rtx mem, rtx value, rtx model_rtx)
11694 machine_mode mode = GET_MODE (mem);
11695 machine_mode wmode = (mode == DImode ? DImode : SImode);
11696 const bool short_mode = (mode < SImode);
11697 aarch64_atomic_load_op_code ldop_code;
11698 rtx src;
11699 rtx x;
11701 if (out_data)
11702 out_data = gen_lowpart (mode, out_data);
11704 if (out_result)
11705 out_result = gen_lowpart (mode, out_result);
11707 /* Make sure the value is in a register, putting it into a destination
11708 register if it needs to be manipulated. */
11709 if (!register_operand (value, mode)
11710 || code == AND || code == MINUS)
11712 src = out_result ? out_result : out_data;
11713 emit_move_insn (src, gen_lowpart (mode, value));
11715 else
11716 src = value;
11717 gcc_assert (register_operand (src, mode));
11719 /* Preprocess the data for the operation as necessary. If the operation is
11720 a SET then emit a swap instruction and finish. */
11721 switch (code)
11723 case SET:
11724 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11725 return;
11727 case MINUS:
11728 /* Negate the value and treat it as a PLUS. */
11730 rtx neg_src;
11732 /* Resize the value if necessary. */
11733 if (short_mode)
11734 src = gen_lowpart (wmode, src);
11736 neg_src = gen_rtx_NEG (wmode, src);
11737 emit_insn (gen_rtx_SET (src, neg_src));
11739 if (short_mode)
11740 src = gen_lowpart (mode, src);
11742 /* Fall-through. */
11743 case PLUS:
11744 ldop_code = AARCH64_LDOP_PLUS;
11745 break;
11747 case IOR:
11748 ldop_code = AARCH64_LDOP_OR;
11749 break;
11751 case XOR:
11752 ldop_code = AARCH64_LDOP_XOR;
11753 break;
11755 case AND:
11757 rtx not_src;
11759 /* Resize the value if necessary. */
11760 if (short_mode)
11761 src = gen_lowpart (wmode, src);
11763 not_src = gen_rtx_NOT (wmode, src);
11764 emit_insn (gen_rtx_SET (src, not_src));
11766 if (short_mode)
11767 src = gen_lowpart (mode, src);
11769 ldop_code = AARCH64_LDOP_BIC;
11770 break;
11772 default:
11773 /* The operation can't be done with atomic instructions. */
11774 gcc_unreachable ();
11777 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11779 /* If necessary, calculate the data in memory after the update by redoing the
11780 operation from values in registers. */
11781 if (!out_result)
11782 return;
11784 if (short_mode)
11786 src = gen_lowpart (wmode, src);
11787 out_data = gen_lowpart (wmode, out_data);
11788 out_result = gen_lowpart (wmode, out_result);
11791 x = NULL_RTX;
11793 switch (code)
11795 case MINUS:
11796 case PLUS:
11797 x = gen_rtx_PLUS (wmode, out_data, src);
11798 break;
11799 case IOR:
11800 x = gen_rtx_IOR (wmode, out_data, src);
11801 break;
11802 case XOR:
11803 x = gen_rtx_XOR (wmode, out_data, src);
11804 break;
11805 case AND:
11806 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11807 return;
11808 default:
11809 gcc_unreachable ();
11812 emit_set_insn (out_result, x);
11814 return;
11817 /* Split an atomic operation. */
11819 void
11820 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11821 rtx value, rtx model_rtx, rtx cond)
11823 machine_mode mode = GET_MODE (mem);
11824 machine_mode wmode = (mode == DImode ? DImode : SImode);
11825 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11826 const bool is_sync = is_mm_sync (model);
11827 rtx_code_label *label;
11828 rtx x;
11830 /* Split the atomic operation into a sequence. */
11831 label = gen_label_rtx ();
11832 emit_label (label);
11834 if (new_out)
11835 new_out = gen_lowpart (wmode, new_out);
11836 if (old_out)
11837 old_out = gen_lowpart (wmode, old_out);
11838 else
11839 old_out = new_out;
11840 value = simplify_gen_subreg (wmode, value, mode, 0);
11842 /* The initial load can be relaxed for a __sync operation since a final
11843 barrier will be emitted to stop code hoisting. */
11844 if (is_sync)
11845 aarch64_emit_load_exclusive (mode, old_out, mem,
11846 GEN_INT (MEMMODEL_RELAXED));
11847 else
11848 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
11850 switch (code)
11852 case SET:
11853 new_out = value;
11854 break;
11856 case NOT:
11857 x = gen_rtx_AND (wmode, old_out, value);
11858 emit_insn (gen_rtx_SET (new_out, x));
11859 x = gen_rtx_NOT (wmode, new_out);
11860 emit_insn (gen_rtx_SET (new_out, x));
11861 break;
11863 case MINUS:
11864 if (CONST_INT_P (value))
11866 value = GEN_INT (-INTVAL (value));
11867 code = PLUS;
11869 /* Fall through. */
11871 default:
11872 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
11873 emit_insn (gen_rtx_SET (new_out, x));
11874 break;
11877 aarch64_emit_store_exclusive (mode, cond, mem,
11878 gen_lowpart (mode, new_out), model_rtx);
11880 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11881 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11882 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
11883 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11885 /* Emit any final barrier needed for a __sync operation. */
11886 if (is_sync)
11887 aarch64_emit_post_barrier (model);
11890 static void
11891 aarch64_init_libfuncs (void)
11893 /* Half-precision float operations. The compiler handles all operations
11894 with NULL libfuncs by converting to SFmode. */
11896 /* Conversions. */
11897 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
11898 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
11900 /* Arithmetic. */
11901 set_optab_libfunc (add_optab, HFmode, NULL);
11902 set_optab_libfunc (sdiv_optab, HFmode, NULL);
11903 set_optab_libfunc (smul_optab, HFmode, NULL);
11904 set_optab_libfunc (neg_optab, HFmode, NULL);
11905 set_optab_libfunc (sub_optab, HFmode, NULL);
11907 /* Comparisons. */
11908 set_optab_libfunc (eq_optab, HFmode, NULL);
11909 set_optab_libfunc (ne_optab, HFmode, NULL);
11910 set_optab_libfunc (lt_optab, HFmode, NULL);
11911 set_optab_libfunc (le_optab, HFmode, NULL);
11912 set_optab_libfunc (ge_optab, HFmode, NULL);
11913 set_optab_libfunc (gt_optab, HFmode, NULL);
11914 set_optab_libfunc (unord_optab, HFmode, NULL);
11917 /* Target hook for c_mode_for_suffix. */
11918 static machine_mode
11919 aarch64_c_mode_for_suffix (char suffix)
11921 if (suffix == 'q')
11922 return TFmode;
11924 return VOIDmode;
11927 /* We can only represent floating point constants which will fit in
11928 "quarter-precision" values. These values are characterised by
11929 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
11932 (-1)^s * (n/16) * 2^r
11934 Where:
11935 's' is the sign bit.
11936 'n' is an integer in the range 16 <= n <= 31.
11937 'r' is an integer in the range -3 <= r <= 4. */
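/* For example, s = 0, n = 16, r = -1 encodes (16/16) * 2^-1 = 0.5; the
   representable magnitudes therefore run from 16/16 * 2^-3 = 0.125 up to
   31/16 * 2^4 = 31.0.  */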
11939 /* Return true iff X can be represented as a quarter-precision
11940 floating point immediate operand. Note, we cannot represent 0.0. */
11941 bool
11942 aarch64_float_const_representable_p (rtx x)
11944 /* This represents our current view of how many bits
11945 make up the mantissa. */
11946 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
11947 int exponent;
11948 unsigned HOST_WIDE_INT mantissa, mask;
11949 REAL_VALUE_TYPE r, m;
11950 bool fail;
11952 if (!CONST_DOUBLE_P (x))
11953 return false;
11955 /* We don't support HFmode constants yet. */
11956 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
11957 return false;
11959 r = *CONST_DOUBLE_REAL_VALUE (x);
11961 /* We cannot represent infinities, NaNs or +/-zero. We won't
11962 know if we have +zero until we analyse the mantissa, but we
11963 can reject the other invalid values. */
11964 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
11965 || REAL_VALUE_MINUS_ZERO (r))
11966 return false;
11968 /* Extract exponent. */
11969 r = real_value_abs (&r);
11970 exponent = REAL_EXP (&r);
11972 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
11973 highest (sign) bit, with a fixed binary point at bit point_pos.
11974 m1 holds the low part of the mantissa, m2 the high part.
11975 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
11976 bits for the mantissa, this can fail (low bits will be lost). */
11977 real_ldexp (&m, &r, point_pos - exponent);
11978 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
11980 /* If the low part of the mantissa has bits set we cannot represent
11981 the value. */
11982 if (w.elt (0) != 0)
11983 return false;
11984 /* We have rejected the lower HOST_WIDE_INT, so update our
11985 understanding of how many bits lie in the mantissa and
11986 look only at the high HOST_WIDE_INT. */
11987 mantissa = w.elt (1);
11988 point_pos -= HOST_BITS_PER_WIDE_INT;
11990 /* We can only represent values with a mantissa of the form 1.xxxx. */
11991 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
11992 if ((mantissa & mask) != 0)
11993 return false;
11995 /* Having filtered unrepresentable values, we may now remove all
11996 but the highest 5 bits. */
11997 mantissa >>= point_pos - 5;
11999 /* We cannot represent the value 0.0, so reject it. This is handled
12000 elsewhere. */
12001 if (mantissa == 0)
12002 return false;
12004 /* Then, as bit 4 is always set, we can mask it off, leaving
12005 the mantissa in the range [0, 15]. */
12006 mantissa &= ~(1 << 4);
12007 gcc_assert (mantissa <= 15);
12009 /* GCC internally does not use IEEE754-like encoding (where normalized
12010 significands are in the range [1, 2)).  GCC uses [0.5, 1) (see real.c).
12011 Our mantissa values are shifted 4 places to the left relative to
12012 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12013 by 5 places to correct for GCC's representation. */
12014 exponent = 5 - exponent;
12016 return (exponent >= 0 && exponent <= 7);
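/* Illustrative sanity check of the mapping above: a value (n/16) * 2^r is
   1.xxxx * 2^r in IEEE terms, and since GCC normalizes significands to
   [0.5, 1), REAL_EXP returns r + 1.  The final test 5 - REAL_EXP in [0, 7]
   is therefore equivalent to r in [-3, 4], matching the format described
   before aarch64_float_const_representable_p.  E.g. for 1.0, REAL_EXP is 1
   and 5 - 1 = 4.  */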
12019 char*
12020 aarch64_output_simd_mov_immediate (rtx const_vector,
12021 machine_mode mode,
12022 unsigned width)
12024 bool is_valid;
12025 static char templ[40];
12026 const char *mnemonic;
12027 const char *shift_op;
12028 unsigned int lane_count = 0;
12029 char element_char;
12031 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12033 /* This will return true to show that CONST_VECTOR is legal for use as
12034 an AdvSIMD MOVI (or, implicitly, MVNI) instruction immediate.  It will
12035 also update INFO to show how the immediate should be generated. */
12036 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12037 gcc_assert (is_valid);
12039 element_char = sizetochar (info.element_width);
12040 lane_count = width / info.element_width;
12042 mode = GET_MODE_INNER (mode);
12043 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12045 gcc_assert (info.shift == 0 && ! info.mvn);
12046 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12047 move immediate path. */
12048 if (aarch64_float_const_zero_rtx_p (info.value))
12049 info.value = GEN_INT (0);
12050 else
12052 const unsigned int buf_size = 20;
12053 char float_buf[buf_size] = {'\0'};
12054 real_to_decimal_for_mode (float_buf,
12055 CONST_DOUBLE_REAL_VALUE (info.value),
12056 buf_size, buf_size, 1, mode);
12058 if (lane_count == 1)
12059 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12060 else
12061 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12062 lane_count, element_char, float_buf);
12063 return templ;
12067 mnemonic = info.mvn ? "mvni" : "movi";
12068 shift_op = info.msl ? "msl" : "lsl";
12070 gcc_assert (CONST_INT_P (info.value));
12071 if (lane_count == 1)
12072 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12073 mnemonic, UINTVAL (info.value));
12074 else if (info.shift)
12075 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12076 ", %s %d", mnemonic, lane_count, element_char,
12077 UINTVAL (info.value), shift_op, info.shift);
12078 else
12079 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12080 mnemonic, lane_count, element_char, UINTVAL (info.value));
12081 return templ;
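/* Illustrative examples of the templates built above (assumed, not taken
   from the sources): a V4SImode vector of 1s produces "movi\t%0.4s, 0x1",
   while a V2DFmode vector of 1.0 takes the floating-point path and
   produces an "fmov\t%0.2d, <decimal constant>" template.  */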
12084 char*
12085 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12086 machine_mode mode)
12088 machine_mode vmode;
12090 gcc_assert (!VECTOR_MODE_P (mode));
12091 vmode = aarch64_simd_container_mode (mode, 64);
12092 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12093 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12096 /* Split operands into moves from op[1] + op[2] into op[0]. */
12098 void
12099 aarch64_split_combinev16qi (rtx operands[3])
12101 unsigned int dest = REGNO (operands[0]);
12102 unsigned int src1 = REGNO (operands[1]);
12103 unsigned int src2 = REGNO (operands[2]);
12104 machine_mode halfmode = GET_MODE (operands[1]);
12105 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12106 rtx destlo, desthi;
12108 gcc_assert (halfmode == V16QImode);
12110 if (src1 == dest && src2 == dest + halfregs)
12112 /* No-op move. Can't split to nothing; emit something. */
12113 emit_note (NOTE_INSN_DELETED);
12114 return;
12117 /* Preserve register attributes for variable tracking. */
12118 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12119 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12120 GET_MODE_SIZE (halfmode));
12122 /* Special case of reversed high/low parts. */
12123 if (reg_overlap_mentioned_p (operands[2], destlo)
12124 && reg_overlap_mentioned_p (operands[1], desthi))
12126 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12127 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12128 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
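          /* The three XORs above swap the two source registers without a
             temporary, using the identity a ^= b; b ^= a; a ^= b
             (e.g. a = 5, b = 3 gives a = 6, b = 5, then a = 3).  */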
12130 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12132 /* Try to avoid unnecessary moves if part of the result
12133 is in the right place already. */
12134 if (src1 != dest)
12135 emit_move_insn (destlo, operands[1]);
12136 if (src2 != dest + halfregs)
12137 emit_move_insn (desthi, operands[2]);
12139 else
12141 if (src2 != dest + halfregs)
12142 emit_move_insn (desthi, operands[2]);
12143 if (src1 != dest)
12144 emit_move_insn (destlo, operands[1]);
12148 /* vec_perm support. */
12150 #define MAX_VECT_LEN 16
12152 struct expand_vec_perm_d
12154 rtx target, op0, op1;
12155 unsigned char perm[MAX_VECT_LEN];
12156 machine_mode vmode;
12157 unsigned char nelt;
12158 bool one_vector_p;
12159 bool testing_p;
12162 /* Generate a variable permutation. */
12164 static void
12165 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12167 machine_mode vmode = GET_MODE (target);
12168 bool one_vector_p = rtx_equal_p (op0, op1);
12170 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12171 gcc_checking_assert (GET_MODE (op0) == vmode);
12172 gcc_checking_assert (GET_MODE (op1) == vmode);
12173 gcc_checking_assert (GET_MODE (sel) == vmode);
12174 gcc_checking_assert (TARGET_SIMD);
12176 if (one_vector_p)
12178 if (vmode == V8QImode)
12180 /* Expand the argument to a V16QI mode by duplicating it. */
12181 rtx pair = gen_reg_rtx (V16QImode);
12182 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12183 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12185 else
12187 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12190 else
12192 rtx pair;
12194 if (vmode == V8QImode)
12196 pair = gen_reg_rtx (V16QImode);
12197 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12198 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12200 else
12202 pair = gen_reg_rtx (OImode);
12203 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12204 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12209 void
12210 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12212 machine_mode vmode = GET_MODE (target);
12213 unsigned int nelt = GET_MODE_NUNITS (vmode);
12214 bool one_vector_p = rtx_equal_p (op0, op1);
12215 rtx mask;
12217 /* The TBL instruction does not use a modulo index, so we must take care
12218 of that ourselves. */
12219 mask = aarch64_simd_gen_const_vector_dup (vmode,
12220 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12221 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12223 /* For big-endian, we also need to reverse the index within the vector
12224 (but not which vector). */
12225 if (BYTES_BIG_ENDIAN)
12227 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12228 if (!one_vector_p)
12229 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12230 sel = expand_simple_binop (vmode, XOR, sel, mask,
12231 NULL, 0, OPTAB_LIB_WIDEN);
12233 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
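/* Worked example (illustrative): for a two-vector V16QImode permute nelt
   is 16, so each selector byte is ANDed with 31; an out-of-range index
   such as 37 becomes 5 and selects element 5 of the first vector, which
   matches the modulo behaviour the middle end expects.  */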
12236 /* Recognize patterns suitable for the TRN instructions. */
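/* E.g. (illustrative) for V4SImode, TRN1 corresponds to the selector
   { 0, 4, 2, 6 } and TRN2 to { 1, 5, 3, 7 }.  */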
12237 static bool
12238 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12240 unsigned int i, odd, mask, nelt = d->nelt;
12241 rtx out, in0, in1, x;
12242 rtx (*gen) (rtx, rtx, rtx);
12243 machine_mode vmode = d->vmode;
12245 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12246 return false;
12248 /* Note that these are little-endian tests.
12249 We correct for big-endian later. */
12250 if (d->perm[0] == 0)
12251 odd = 0;
12252 else if (d->perm[0] == 1)
12253 odd = 1;
12254 else
12255 return false;
12256 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12258 for (i = 0; i < nelt; i += 2)
12260 if (d->perm[i] != i + odd)
12261 return false;
12262 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12263 return false;
12266 /* Success! */
12267 if (d->testing_p)
12268 return true;
12270 in0 = d->op0;
12271 in1 = d->op1;
12272 if (BYTES_BIG_ENDIAN)
12274 x = in0, in0 = in1, in1 = x;
12275 odd = !odd;
12277 out = d->target;
12279 if (odd)
12281 switch (vmode)
12283 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12284 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12285 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12286 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12287 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12288 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12289 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12290 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12291 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12292 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12293 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12294 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12295 default:
12296 return false;
12299 else
12301 switch (vmode)
12303 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12304 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12305 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12306 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12307 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12308 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12309 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12310 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12311 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12312 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12313 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12314 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12315 default:
12316 return false;
12320 emit_insn (gen (out, in0, in1));
12321 return true;
12324 /* Recognize patterns suitable for the UZP instructions. */
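/* E.g. (illustrative) for V4SImode, UZP1 corresponds to the selector
   { 0, 2, 4, 6 } and UZP2 to { 1, 3, 5, 7 }.  */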
12325 static bool
12326 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12328 unsigned int i, odd, mask, nelt = d->nelt;
12329 rtx out, in0, in1, x;
12330 rtx (*gen) (rtx, rtx, rtx);
12331 machine_mode vmode = d->vmode;
12333 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12334 return false;
12336 /* Note that these are little-endian tests.
12337 We correct for big-endian later. */
12338 if (d->perm[0] == 0)
12339 odd = 0;
12340 else if (d->perm[0] == 1)
12341 odd = 1;
12342 else
12343 return false;
12344 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12346 for (i = 0; i < nelt; i++)
12348 unsigned elt = (i * 2 + odd) & mask;
12349 if (d->perm[i] != elt)
12350 return false;
12353 /* Success! */
12354 if (d->testing_p)
12355 return true;
12357 in0 = d->op0;
12358 in1 = d->op1;
12359 if (BYTES_BIG_ENDIAN)
12361 x = in0, in0 = in1, in1 = x;
12362 odd = !odd;
12364 out = d->target;
12366 if (odd)
12368 switch (vmode)
12370 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12371 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12372 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12373 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12374 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12375 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12376 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12377 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12378 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12379 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12380 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12381 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12382 default:
12383 return false;
12386 else
12388 switch (vmode)
12390 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12391 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12392 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12393 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12394 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12395 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12396 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12397 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12398 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12399 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12400 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12401 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12402 default:
12403 return false;
12407 emit_insn (gen (out, in0, in1));
12408 return true;
12411 /* Recognize patterns suitable for the ZIP instructions. */
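/* E.g. (illustrative) for V4SImode, ZIP1 corresponds to the selector
   { 0, 4, 1, 5 } and ZIP2 to { 2, 6, 3, 7 }.  */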
12412 static bool
12413 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12415 unsigned int i, high, mask, nelt = d->nelt;
12416 rtx out, in0, in1, x;
12417 rtx (*gen) (rtx, rtx, rtx);
12418 machine_mode vmode = d->vmode;
12420 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12421 return false;
12423 /* Note that these are little-endian tests.
12424 We correct for big-endian later. */
12425 high = nelt / 2;
12426 if (d->perm[0] == high)
12427 /* Do Nothing. */
12429 else if (d->perm[0] == 0)
12430 high = 0;
12431 else
12432 return false;
12433 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12435 for (i = 0; i < nelt / 2; i++)
12437 unsigned elt = (i + high) & mask;
12438 if (d->perm[i * 2] != elt)
12439 return false;
12440 elt = (elt + nelt) & mask;
12441 if (d->perm[i * 2 + 1] != elt)
12442 return false;
12445 /* Success! */
12446 if (d->testing_p)
12447 return true;
12449 in0 = d->op0;
12450 in1 = d->op1;
12451 if (BYTES_BIG_ENDIAN)
12453 x = in0, in0 = in1, in1 = x;
12454 high = !high;
12456 out = d->target;
12458 if (high)
12460 switch (vmode)
12462 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12463 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12464 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12465 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12466 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12467 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12468 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12469 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12470 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12471 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12472 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12473 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12474 default:
12475 return false;
12478 else
12480 switch (vmode)
12482 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12483 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12484 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12485 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12486 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12487 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12488 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12489 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
12490 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
12491 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12492 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12493 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12494 default:
12495 return false;
12499 emit_insn (gen (out, in0, in1));
12500 return true;
12503 /* Recognize patterns for the EXT insn. */
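/* E.g. (illustrative) for V4SImode with a single input vector, the
   selector { 1, 2, 3, 0 } is matched and expands to an EXT starting at
   element 1; with two input vectors, { 1, 2, 3, 4 } is matched in the
   same way.  */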
12505 static bool
12506 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12508 unsigned int i, nelt = d->nelt;
12509 rtx (*gen) (rtx, rtx, rtx, rtx);
12510 rtx offset;
12512 unsigned int location = d->perm[0]; /* Always < nelt. */
12514 /* Check if the extracted indices are increasing by one. */
12515 for (i = 1; i < nelt; i++)
12517 unsigned int required = location + i;
12518 if (d->one_vector_p)
12520 /* We'll pass the same vector in twice, so allow indices to wrap. */
12521 required &= (nelt - 1);
12523 if (d->perm[i] != required)
12524 return false;
12527 switch (d->vmode)
12529 case V16QImode: gen = gen_aarch64_extv16qi; break;
12530 case V8QImode: gen = gen_aarch64_extv8qi; break;
12531 case V4HImode: gen = gen_aarch64_extv4hi; break;
12532 case V8HImode: gen = gen_aarch64_extv8hi; break;
12533 case V2SImode: gen = gen_aarch64_extv2si; break;
12534 case V4SImode: gen = gen_aarch64_extv4si; break;
12535 case V4HFmode: gen = gen_aarch64_extv4hf; break;
12536 case V8HFmode: gen = gen_aarch64_extv8hf; break;
12537 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12538 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12539 case V2DImode: gen = gen_aarch64_extv2di; break;
12540 case V2DFmode: gen = gen_aarch64_extv2df; break;
12541 default:
12542 return false;
12545 /* Success! */
12546 if (d->testing_p)
12547 return true;
12549 /* The case where (location == 0) is a no-op for both big- and little-endian,
12550 and is removed by the mid-end at optimization levels -O1 and higher. */
12552 if (BYTES_BIG_ENDIAN && (location != 0))
12554 /* After setup, we want the high elements of the first vector (stored
12555 at the LSB end of the register), and the low elements of the second
12556 vector (stored at the MSB end of the register). So swap. */
12557 std::swap (d->op0, d->op1);
12558 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12559 location = nelt - location;
12562 offset = GEN_INT (location);
12563 emit_insn (gen (d->target, d->op0, d->op1, offset));
12564 return true;
12567 /* Recognize patterns for the REV insns. */
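/* E.g. (illustrative) a V4SImode selector { 1, 0, 3, 2 } has diff 1 and
   maps to REV64 (reversing the 32-bit elements within each 64-bit chunk),
   while a V8QImode selector { 3, 2, 1, 0, 7, 6, 5, 4 } has diff 3 and
   maps to REV32.  */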
12569 static bool
12570 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12572 unsigned int i, j, diff, nelt = d->nelt;
12573 rtx (*gen) (rtx, rtx);
12575 if (!d->one_vector_p)
12576 return false;
12578 diff = d->perm[0];
12579 switch (diff)
12581 case 7:
12582 switch (d->vmode)
12584 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12585 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12586 default:
12587 return false;
12589 break;
12590 case 3:
12591 switch (d->vmode)
12593 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12594 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
12595 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
12596 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
12597 default:
12598 return false;
12600 break;
12601 case 1:
12602 switch (d->vmode)
12604 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12605 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
12606 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
12607 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
12608 case V4SImode: gen = gen_aarch64_rev64v4si; break;
12609 case V2SImode: gen = gen_aarch64_rev64v2si; break;
12610 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
12611 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
12612 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
12613 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
12614 default:
12615 return false;
12617 break;
12618 default:
12619 return false;
12622 for (i = 0; i < nelt ; i += diff + 1)
12623 for (j = 0; j <= diff; j += 1)
12625 /* This is guaranteed to be true as the value of diff
12626 is 7, 3 or 1, and we should have enough elements in the
12627 queue to generate this.  Getting a vector mask with a
12628 value of diff other than these values implies that
12629 something is wrong by the time we get here. */
12630 gcc_assert (i + j < nelt);
12631 if (d->perm[i + j] != i + diff - j)
12632 return false;
12635 /* Success! */
12636 if (d->testing_p)
12637 return true;
12639 emit_insn (gen (d->target, d->op0));
12640 return true;
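/* Recognize patterns where every selector element is the same, which map
   to a DUP of a single lane, e.g. (illustrative) { 2, 2, 2, 2 } for
   V4SImode becomes a dup of lane 2 of the first operand.  */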
12643 static bool
12644 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12646 rtx (*gen) (rtx, rtx, rtx);
12647 rtx out = d->target;
12648 rtx in0;
12649 machine_mode vmode = d->vmode;
12650 unsigned int i, elt, nelt = d->nelt;
12651 rtx lane;
12653 elt = d->perm[0];
12654 for (i = 1; i < nelt; i++)
12656 if (elt != d->perm[i])
12657 return false;
12660 /* The generic preparation in aarch64_expand_vec_perm_const_1
12661 swaps the operand order and the permute indices if it finds
12662 d->perm[0] to be in the second operand. Thus, we can always
12663 use d->op0 and need not do any extra arithmetic to get the
12664 correct lane number. */
12665 in0 = d->op0;
12666 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
12668 switch (vmode)
12670 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12671 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12672 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12673 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12674 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12675 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12676 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12677 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12678 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12679 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12680 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12681 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12682 default:
12683 return false;
12686 emit_insn (gen (out, in0, lane));
12687 return true;
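/* Fall back to a table-based permute (TBL), materializing the selector as
   a byte vector.  Only V8QImode and V16QImode are handled here, since the
   generic code retries any constant permutation with QImode elements.  */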
12690 static bool
12691 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12693 rtx rperm[MAX_VECT_LEN], sel;
12694 machine_mode vmode = d->vmode;
12695 unsigned int i, nelt = d->nelt;
12697 if (d->testing_p)
12698 return true;
12700 /* Generic code will try constant permutation twice. Once with the
12701 original mode and again with the elements lowered to QImode.
12702 So wait and don't do the selector expansion ourselves. */
12703 if (vmode != V8QImode && vmode != V16QImode)
12704 return false;
12706 for (i = 0; i < nelt; ++i)
12708 int nunits = GET_MODE_NUNITS (vmode);
12710 /* If big-endian and two vectors we end up with a weird mixed-endian
12711 mode on NEON. Reverse the index within each word but not the word
12712 itself. */
12713 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12714 : d->perm[i]);
12716 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12717 sel = force_reg (vmode, sel);
12719 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12720 return true;
12723 static bool
12724 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12726 /* The pattern matching functions above are written to look for a small
12727 number to begin the sequence (0, 1, N/2). If we begin with an index
12728 from the second operand, we can swap the operands. */
12729 if (d->perm[0] >= d->nelt)
12731 unsigned i, nelt = d->nelt;
12733 gcc_assert (nelt == (nelt & -nelt));
12734 for (i = 0; i < nelt; ++i)
12735 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
12737 std::swap (d->op0, d->op1);
12740 if (TARGET_SIMD)
12742 if (aarch64_evpc_rev (d))
12743 return true;
12744 else if (aarch64_evpc_ext (d))
12745 return true;
12746 else if (aarch64_evpc_dup (d))
12747 return true;
12748 else if (aarch64_evpc_zip (d))
12749 return true;
12750 else if (aarch64_evpc_uzp (d))
12751 return true;
12752 else if (aarch64_evpc_trn (d))
12753 return true;
12754 return aarch64_evpc_tbl (d);
12756 return false;
12759 /* Expand a vec_perm_const pattern. */
12761 bool
12762 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12764 struct expand_vec_perm_d d;
12765 int i, nelt, which;
12767 d.target = target;
12768 d.op0 = op0;
12769 d.op1 = op1;
12771 d.vmode = GET_MODE (target);
12772 gcc_assert (VECTOR_MODE_P (d.vmode));
12773 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12774 d.testing_p = false;
12776 for (i = which = 0; i < nelt; ++i)
12778 rtx e = XVECEXP (sel, 0, i);
12779 int ei = INTVAL (e) & (2 * nelt - 1);
12780 which |= (ei < nelt ? 1 : 2);
12781 d.perm[i] = ei;
12784 switch (which)
12786 default:
12787 gcc_unreachable ();
12789 case 3:
12790 d.one_vector_p = false;
12791 if (!rtx_equal_p (op0, op1))
12792 break;
12794 /* The elements of PERM do not suggest that only the first operand
12795 is used, but both operands are identical. Allow easier matching
12796 of the permutation by folding the permutation into the single
12797 input vector. */
12798 /* Fall Through. */
12799 case 2:
12800 for (i = 0; i < nelt; ++i)
12801 d.perm[i] &= nelt - 1;
12802 d.op0 = op1;
12803 d.one_vector_p = true;
12804 break;
12806 case 1:
12807 d.op1 = op0;
12808 d.one_vector_p = true;
12809 break;
12812 return aarch64_expand_vec_perm_const_1 (&d);
12815 static bool
12816 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12817 const unsigned char *sel)
12819 struct expand_vec_perm_d d;
12820 unsigned int i, nelt, which;
12821 bool ret;
12823 d.vmode = vmode;
12824 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12825 d.testing_p = true;
12826 memcpy (d.perm, sel, nelt);
12828 /* Calculate whether all elements are in one vector. */
12829 for (i = which = 0; i < nelt; ++i)
12831 unsigned char e = d.perm[i];
12832 gcc_assert (e < 2 * nelt);
12833 which |= (e < nelt ? 1 : 2);
12836 /* If all elements are from the second vector, reindex as if from the
12837 first vector. */
12838 if (which == 2)
12839 for (i = 0; i < nelt; ++i)
12840 d.perm[i] -= nelt;
12842 /* Check whether the mask can be applied to a single vector. */
12843 d.one_vector_p = (which != 3);
12845 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
12846 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
12847 if (!d.one_vector_p)
12848 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
12850 start_sequence ();
12851 ret = aarch64_expand_vec_perm_const_1 (&d);
12852 end_sequence ();
12854 return ret;
12858 aarch64_reverse_mask (enum machine_mode mode)
12860 /* We have to reverse each vector because we don't have
12861 a permuted load that can reverse-load according to ABI rules. */
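  /* E.g. (illustrative) for V4SImode the mask built below is the byte
     selector { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
     i.e. the bytes of each 32-bit element are reversed in place.  */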
12862 rtx mask;
12863 rtvec v = rtvec_alloc (16);
12864 int i, j;
12865 int nunits = GET_MODE_NUNITS (mode);
12866 int usize = GET_MODE_UNIT_SIZE (mode);
12868 gcc_assert (BYTES_BIG_ENDIAN);
12869 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
12871 for (i = 0; i < nunits; i++)
12872 for (j = 0; j < usize; j++)
12873 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
12874 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
12875 return force_reg (V16QImode, mask);
12878 /* Implement MODES_TIEABLE_P. In principle we should always return true.
12879 However, due to issues with register allocation it is preferable to avoid
12880 tying integer scalar and FP scalar modes.  Executing integer operations
12881 in general registers is better than treating them as scalar vector
12882 operations. This reduces latency and avoids redundant int<->FP moves.
12883 So tie modes if they are either the same class, or vector modes with
12884 other vector modes, vector structs or any scalar mode.
12887 bool
12888 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
12890 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
12891 return true;
12893 /* We specifically want to allow elements of "structure" modes to
12894 be tieable to the structure. This more general condition allows
12895 other rarer situations too. */
12896 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
12897 return true;
12899 /* Also allow any scalar modes with vectors. */
12900 if (aarch64_vector_mode_supported_p (mode1)
12901 || aarch64_vector_mode_supported_p (mode2))
12902 return true;
12904 return false;
12907 /* Return a new RTX holding the result of moving POINTER forward by
12908 AMOUNT bytes. */
12910 static rtx
12911 aarch64_move_pointer (rtx pointer, int amount)
12913 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
12915 return adjust_automodify_address (pointer, GET_MODE (pointer),
12916 next, amount);
12919 /* Return a new RTX holding the result of moving POINTER forward by the
12920 size of the mode it points to. */
12922 static rtx
12923 aarch64_progress_pointer (rtx pointer)
12925 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
12927 return aarch64_move_pointer (pointer, amount);
12930 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
12931 MODE bytes. */
12933 static void
12934 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
12935 machine_mode mode)
12937 rtx reg = gen_reg_rtx (mode);
12939 /* "Cast" the pointers to the correct mode. */
12940 *src = adjust_address (*src, mode, 0);
12941 *dst = adjust_address (*dst, mode, 0);
12942 /* Emit the memcpy. */
12943 emit_move_insn (reg, *src);
12944 emit_move_insn (*dst, reg);
12945 /* Move the pointers forward. */
12946 *src = aarch64_progress_pointer (*src);
12947 *dst = aarch64_progress_pointer (*dst);
12950 /* Expand movmem, as if from a __builtin_memcpy. Return true if
12951 we succeed, otherwise return false. */
12953 bool
12954 aarch64_expand_movmem (rtx *operands)
12956 unsigned int n;
12957 rtx dst = operands[0];
12958 rtx src = operands[1];
12959 rtx base;
12960 bool speed_p = !optimize_function_for_size_p (cfun);
12962 /* When optimizing for size, give a better estimate of the length of a
12963 memcpy call, but use the default otherwise. */
12964 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
12966 /* We can't do anything smart if the amount to copy is not constant. */
12967 if (!CONST_INT_P (operands[2]))
12968 return false;
12970 n = UINTVAL (operands[2]);
12972 /* Try to keep the number of instructions low. For cases below 16 bytes we
12973 need to make at most two moves. For cases above 16 bytes it will be one
12974 move for each 16 byte chunk, then at most two additional moves. */
12975 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
12976 return false;
12978 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
12979 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
12981 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
12982 src = adjust_automodify_address (src, VOIDmode, base, 0);
12984 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
12985 1-byte chunk. */
12986 if (n < 4)
12988 if (n >= 2)
12990 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12991 n -= 2;
12994 if (n == 1)
12995 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12997 return true;
13000 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13001 4-byte chunk, partially overlapping with the previously copied chunk. */
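  /* E.g. (illustrative) for n == 7 this copies bytes 0-3, then backs the
     pointers up by one and copies bytes 3-6, so the two 4-byte accesses
     overlap on byte 3 instead of falling back to smaller copies.  */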
13002 if (n < 8)
13004 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13005 n -= 4;
13006 if (n > 0)
13008 int move = n - 4;
13010 src = aarch64_move_pointer (src, move);
13011 dst = aarch64_move_pointer (dst, move);
13012 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13014 return true;
13017 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13018 them, then (if applicable) an 8-byte chunk. */
13019 while (n >= 8)
13021 if (n / 16)
13023 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13024 n -= 16;
13026 else
13028 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13029 n -= 8;
13033 /* Finish the final bytes of the copy. We can always do this in one
13034 instruction. We either copy the exact amount we need, or partially
13035 overlap with the previous chunk we copied and copy 8 bytes. */
13036 if (n == 0)
13037 return true;
13038 else if (n == 1)
13039 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13040 else if (n == 2)
13041 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13042 else if (n == 4)
13043 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13044 else
13046 if (n == 3)
13048 src = aarch64_move_pointer (src, -1);
13049 dst = aarch64_move_pointer (dst, -1);
13050 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13052 else
13054 int move = n - 8;
13056 src = aarch64_move_pointer (src, move);
13057 dst = aarch64_move_pointer (dst, move);
13058 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13062 return true;
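/* E.g. (illustrative) a 25-byte copy expands to one 16-byte (TImode)
   block, one 8-byte (DImode) block and one trailing byte, i.e. three
   load/store pairs in total.  */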
13065 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13067 static unsigned HOST_WIDE_INT
13068 aarch64_asan_shadow_offset (void)
13070 return (HOST_WIDE_INT_1 << 36);
13073 static bool
13074 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13075 unsigned int align,
13076 enum by_pieces_operation op,
13077 bool speed_p)
13079 /* STORE_BY_PIECES can be used when copying a constant string, but
13080 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13081 For now we always fail this and let the move_by_pieces code copy
13082 the string from read-only memory. */
13083 if (op == STORE_BY_PIECES)
13084 return false;
13086 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13089 static rtx
13090 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
13091 int code, tree treeop0, tree treeop1)
13093 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13094 rtx op0, op1;
13095 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13096 insn_code icode;
13097 struct expand_operand ops[4];
13099 start_sequence ();
13100 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13102 op_mode = GET_MODE (op0);
13103 if (op_mode == VOIDmode)
13104 op_mode = GET_MODE (op1);
13106 switch (op_mode)
13108 case QImode:
13109 case HImode:
13110 case SImode:
13111 cmp_mode = SImode;
13112 icode = CODE_FOR_cmpsi;
13113 break;
13115 case DImode:
13116 cmp_mode = DImode;
13117 icode = CODE_FOR_cmpdi;
13118 break;
13120 case SFmode:
13121 cmp_mode = SFmode;
13122 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13123 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13124 break;
13126 case DFmode:
13127 cmp_mode = DFmode;
13128 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13129 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13130 break;
13132 default:
13133 end_sequence ();
13134 return NULL_RTX;
13137 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13138 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13139 if (!op0 || !op1)
13141 end_sequence ();
13142 return NULL_RTX;
13144 *prep_seq = get_insns ();
13145 end_sequence ();
13147 create_fixed_operand (&ops[0], op0);
13148 create_fixed_operand (&ops[1], op1);
13150 start_sequence ();
13151 if (!maybe_expand_insn (icode, 2, ops))
13153 end_sequence ();
13154 return NULL_RTX;
13156 *gen_seq = get_insns ();
13157 end_sequence ();
13159 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13160 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13163 static rtx
13164 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
13165 tree treeop0, tree treeop1, int bit_code)
13167 rtx op0, op1, target;
13168 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13169 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13170 insn_code icode;
13171 struct expand_operand ops[6];
13172 int aarch64_cond;
13174 push_to_sequence ((rtx_insn*) *prep_seq);
13175 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13177 op_mode = GET_MODE (op0);
13178 if (op_mode == VOIDmode)
13179 op_mode = GET_MODE (op1);
13181 switch (op_mode)
13183 case QImode:
13184 case HImode:
13185 case SImode:
13186 cmp_mode = SImode;
13187 icode = CODE_FOR_ccmpsi;
13188 break;
13190 case DImode:
13191 cmp_mode = DImode;
13192 icode = CODE_FOR_ccmpdi;
13193 break;
13195 case SFmode:
13196 cmp_mode = SFmode;
13197 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13198 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13199 break;
13201 case DFmode:
13202 cmp_mode = DFmode;
13203 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13204 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13205 break;
13207 default:
13208 end_sequence ();
13209 return NULL_RTX;
13212 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13213 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13214 if (!op0 || !op1)
13216 end_sequence ();
13217 return NULL_RTX;
13219 *prep_seq = get_insns ();
13220 end_sequence ();
13222 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13223 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13225 if (bit_code != AND)
13227 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13228 GET_MODE (XEXP (prev, 0))),
13229 VOIDmode, XEXP (prev, 0), const0_rtx);
13230 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13233 create_fixed_operand (&ops[0], XEXP (prev, 0));
13234 create_fixed_operand (&ops[1], target);
13235 create_fixed_operand (&ops[2], op0);
13236 create_fixed_operand (&ops[3], op1);
13237 create_fixed_operand (&ops[4], prev);
13238 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13240 push_to_sequence ((rtx_insn*) *gen_seq);
13241 if (!maybe_expand_insn (icode, 6, ops))
13243 end_sequence ();
13244 return NULL_RTX;
13247 *gen_seq = get_insns ();
13248 end_sequence ();
13250 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
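/* Taken together, aarch64_gen_ccmp_first and aarch64_gen_ccmp_next let the
   middle end expand a chained condition such as "a == 0 && b > 42"
   (illustrative) into a compare followed by a conditional compare,
   roughly:

     cmp   w0, 0
     ccmp  w1, 42, #<nzcv>, eq
     b.gt  ...

   where <nzcv> encodes the flag state substituted when the first
   comparison fails, so that the final condition then evaluates to
   false.  */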
13253 #undef TARGET_GEN_CCMP_FIRST
13254 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13256 #undef TARGET_GEN_CCMP_NEXT
13257 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13259 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13260 instruction fusion of some sort. */
13262 static bool
13263 aarch64_macro_fusion_p (void)
13265 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13269 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13270 should be kept together during scheduling. */
13272 static bool
13273 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13275 rtx set_dest;
13276 rtx prev_set = single_set (prev);
13277 rtx curr_set = single_set (curr);
13278 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13279 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13281 if (!aarch64_macro_fusion_p ())
13282 return false;
13284 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13286 /* We are trying to match:
13287 prev (mov) == (set (reg r0) (const_int imm16))
13288 curr (movk) == (set (zero_extract (reg r0)
13289 (const_int 16)
13290 (const_int 16))
13291 (const_int imm16_1)) */
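      /* In assembly terms this is (illustrative) a pair such as
           mov  x0, 0x1234
           movk x0, 0x5678, lsl 16
         which suitable cores can fuse into a single macro-op.  */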
13293 set_dest = SET_DEST (curr_set);
13295 if (GET_CODE (set_dest) == ZERO_EXTRACT
13296 && CONST_INT_P (SET_SRC (curr_set))
13297 && CONST_INT_P (SET_SRC (prev_set))
13298 && CONST_INT_P (XEXP (set_dest, 2))
13299 && INTVAL (XEXP (set_dest, 2)) == 16
13300 && REG_P (XEXP (set_dest, 0))
13301 && REG_P (SET_DEST (prev_set))
13302 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13304 return true;
13308 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13311 /* We're trying to match:
13312 prev (adrp) == (set (reg r1)
13313 (high (symbol_ref ("SYM"))))
13314 curr (add) == (set (reg r0)
13315 (lo_sum (reg r1)
13316 (symbol_ref ("SYM"))))
13317 Note that r0 need not necessarily be the same as r1, especially
13318 during pre-regalloc scheduling. */
13320 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13321 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13323 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13324 && REG_P (XEXP (SET_SRC (curr_set), 0))
13325 && REGNO (XEXP (SET_SRC (curr_set), 0))
13326 == REGNO (SET_DEST (prev_set))
13327 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13328 XEXP (SET_SRC (curr_set), 1)))
13329 return true;
13333 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13336 /* We're trying to match:
13337 prev (movk) == (set (zero_extract (reg r0)
13338 (const_int 16)
13339 (const_int 32))
13340 (const_int imm16_1))
13341 curr (movk) == (set (zero_extract (reg r0)
13342 (const_int 16)
13343 (const_int 48))
13344 (const_int imm16_2)) */
13346 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13347 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13348 && REG_P (XEXP (SET_DEST (prev_set), 0))
13349 && REG_P (XEXP (SET_DEST (curr_set), 0))
13350 && REGNO (XEXP (SET_DEST (prev_set), 0))
13351 == REGNO (XEXP (SET_DEST (curr_set), 0))
13352 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13353 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13354 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13355 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13356 && CONST_INT_P (SET_SRC (prev_set))
13357 && CONST_INT_P (SET_SRC (curr_set)))
13358 return true;
13361 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13363 /* We're trying to match:
13364 prev (adrp) == (set (reg r0)
13365 (high (symbol_ref ("SYM"))))
13366 curr (ldr) == (set (reg r1)
13367 (mem (lo_sum (reg r0)
13368 (symbol_ref ("SYM")))))
13370 curr (ldr) == (set (reg r1)
13371 (zero_extend (mem
13372 (lo_sum (reg r0)
13373 (symbol_ref ("SYM")))))) */
13374 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13375 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13377 rtx curr_src = SET_SRC (curr_set);
13379 if (GET_CODE (curr_src) == ZERO_EXTEND)
13380 curr_src = XEXP (curr_src, 0);
13382 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13383 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13384 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13385 == REGNO (SET_DEST (prev_set))
13386 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13387 XEXP (SET_SRC (prev_set), 0)))
13388 return true;
13392 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
13393 && aarch_crypto_can_dual_issue (prev, curr))
13394 return true;
13396 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
13397 && any_condjump_p (curr))
13399 enum attr_type prev_type = get_attr_type (prev);
13401 /* FIXME: this misses some instructions which are considered simple
13402 arithmetic instructions for ThunderX.  Simple shifts are missed here. */
13403 if (prev_type == TYPE_ALUS_SREG
13404 || prev_type == TYPE_ALUS_IMM
13405 || prev_type == TYPE_LOGICS_REG
13406 || prev_type == TYPE_LOGICS_IMM)
13407 return true;
13410 return false;
13413 /* Return true iff the instruction fusion described by OP is enabled. */
13415 bool
13416 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13418 return (aarch64_tune_params.fusible_ops & op) != 0;
13421 /* If MEM is in the form of [base+offset], extract the two parts
13422 of the address and store them in BASE and OFFSET; otherwise return false
13423 after clearing BASE and OFFSET. */
13425 bool
13426 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13428 rtx addr;
13430 gcc_assert (MEM_P (mem));
13432 addr = XEXP (mem, 0);
13434 if (REG_P (addr))
13436 *base = addr;
13437 *offset = const0_rtx;
13438 return true;
13441 if (GET_CODE (addr) == PLUS
13442 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13444 *base = XEXP (addr, 0);
13445 *offset = XEXP (addr, 1);
13446 return true;
13449 *base = NULL_RTX;
13450 *offset = NULL_RTX;
13452 return false;
13455 /* Types for scheduling fusion. */
13456 enum sched_fusion_type
13458 SCHED_FUSION_NONE = 0,
13459 SCHED_FUSION_LD_SIGN_EXTEND,
13460 SCHED_FUSION_LD_ZERO_EXTEND,
13461 SCHED_FUSION_LD,
13462 SCHED_FUSION_ST,
13463 SCHED_FUSION_NUM
13466 /* If INSN is a load or store whose address is in the form of [base+offset],
13467 extract the two parts and store them in BASE and OFFSET.  Return the
13468 scheduling fusion type of this INSN. */
13470 static enum sched_fusion_type
13471 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13473 rtx x, dest, src;
13474 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13476 gcc_assert (INSN_P (insn));
13477 x = PATTERN (insn);
13478 if (GET_CODE (x) != SET)
13479 return SCHED_FUSION_NONE;
13481 src = SET_SRC (x);
13482 dest = SET_DEST (x);
13484 machine_mode dest_mode = GET_MODE (dest);
13486 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13487 return SCHED_FUSION_NONE;
13489 if (GET_CODE (src) == SIGN_EXTEND)
13491 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13492 src = XEXP (src, 0);
13493 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13494 return SCHED_FUSION_NONE;
13496 else if (GET_CODE (src) == ZERO_EXTEND)
13498 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13499 src = XEXP (src, 0);
13500 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13501 return SCHED_FUSION_NONE;
13504 if (GET_CODE (src) == MEM && REG_P (dest))
13505 extract_base_offset_in_addr (src, base, offset);
13506 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13508 fusion = SCHED_FUSION_ST;
13509 extract_base_offset_in_addr (dest, base, offset);
13511 else
13512 return SCHED_FUSION_NONE;
13514 if (*base == NULL_RTX || *offset == NULL_RTX)
13515 fusion = SCHED_FUSION_NONE;
13517 return fusion;
13520 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13522 Currently we only support fusing ldr and str instructions, so FUSION_PRI
13523 and PRI are only calculated for these instructions.  For other instructions,
13524 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
13525 types of instruction fusion can be added by returning different priorities.
13527 It's important that irrelevant instructions get the largest FUSION_PRI. */
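/* E.g. (illustrative) "ldr w1, [x2, 4]" and "ldr w3, [x2, 8]" share the
   same fusion type and base register, so they get the same FUSION_PRI and
   nearby PRI values (the smaller offset getting the higher priority),
   which encourages the scheduler to keep them adjacent for later ldp
   formation.  */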
13529 static void
13530 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13531 int *fusion_pri, int *pri)
13533 int tmp, off_val;
13534 rtx base, offset;
13535 enum sched_fusion_type fusion;
13537 gcc_assert (INSN_P (insn));
13539 tmp = max_pri - 1;
13540 fusion = fusion_load_store (insn, &base, &offset);
13541 if (fusion == SCHED_FUSION_NONE)
13543 *pri = tmp;
13544 *fusion_pri = tmp;
13545 return;
13548 /* Set FUSION_PRI according to fusion type and base register. */
13549 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13551 /* Calculate PRI. */
13552 tmp /= 2;
13554 /* The INSN with the smaller offset goes first. */
13555 off_val = (int)(INTVAL (offset));
13556 if (off_val >= 0)
13557 tmp -= (off_val & 0xfffff);
13558 else
13559 tmp += ((- off_val) & 0xfffff);
13561 *pri = tmp;
13562 return;
13565 /* Given OPERANDS of consecutive load/store, check if we can merge
13566 them into ldp/stp. LOAD is true if they are load instructions.
13567 MODE is the mode of memory operands. */
13569 bool
13570 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13571 enum machine_mode mode)
13573 HOST_WIDE_INT offval_1, offval_2, msize;
13574 enum reg_class rclass_1, rclass_2;
13575 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13577 if (load)
13579 mem_1 = operands[1];
13580 mem_2 = operands[3];
13581 reg_1 = operands[0];
13582 reg_2 = operands[2];
13583 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13584 if (REGNO (reg_1) == REGNO (reg_2))
13585 return false;
13587 else
13589 mem_1 = operands[0];
13590 mem_2 = operands[2];
13591 reg_1 = operands[1];
13592 reg_2 = operands[3];
13595 /* The mems cannot be volatile. */
13596 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13597 return false;
13599 /* Check if the addresses are in the form of [base+offset]. */
13600 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13601 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13602 return false;
13603 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13604 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13605 return false;
13607 /* Check if the bases are the same. */
13608 if (!rtx_equal_p (base_1, base_2))
13609 return false;
13611 offval_1 = INTVAL (offset_1);
13612 offval_2 = INTVAL (offset_2);
13613 msize = GET_MODE_SIZE (mode);
13614 /* Check if the offsets are consecutive. */
13615 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13616 return false;
13618 /* Check if the addresses are clobbered by load. */
13619 if (load)
13621 if (reg_mentioned_p (reg_1, mem_1))
13622 return false;
13624 /* In increasing order, the last load can clobber the address. */
13625 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13626 return false;
13629 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13630 rclass_1 = FP_REGS;
13631 else
13632 rclass_1 = GENERAL_REGS;
13634 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13635 rclass_2 = FP_REGS;
13636 else
13637 rclass_2 = GENERAL_REGS;
13639 /* Check if the registers are of the same class. */
13640 if (rclass_1 != rclass_2)
13641 return false;
13643 return true;
13646 /* Given OPERANDS of consecutive load/store, check if we can merge
13647 them into ldp/stp by adjusting the offset. LOAD is true if they
13648 are load instructions. MODE is the mode of memory operands.
13650 Given below consecutive stores:
13652 str w1, [xb, 0x100]
13653 str w1, [xb, 0x104]
13654 str w1, [xb, 0x108]
13655 str w1, [xb, 0x10c]
13657 Though the offsets are out of the range supported by stp, we can
13658 still pair them after adjusting the offset, like:
13660 add scratch, xb, 0x100
13661 stp w1, w1, [scratch]
13662 stp w1, w1, [scratch, 0x8]
13664 The peephole patterns detecting this opportunity should guarantee
13665 the scratch register is available. */
13667 bool
13668 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13669 enum machine_mode mode)
13671 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13672 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13673 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13674 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13676 if (load)
13678 reg_1 = operands[0];
13679 mem_1 = operands[1];
13680 reg_2 = operands[2];
13681 mem_2 = operands[3];
13682 reg_3 = operands[4];
13683 mem_3 = operands[5];
13684 reg_4 = operands[6];
13685 mem_4 = operands[7];
13686 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13687 && REG_P (reg_3) && REG_P (reg_4));
13688 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13689 return false;
13691 else
13693 mem_1 = operands[0];
13694 reg_1 = operands[1];
13695 mem_2 = operands[2];
13696 reg_2 = operands[3];
13697 mem_3 = operands[4];
13698 reg_3 = operands[5];
13699 mem_4 = operands[6];
13700 reg_4 = operands[7];
13702 /* Skip if the memory operand is by itself valid for ldp/stp. */
13703 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13704 return false;
13706 /* The mems cannot be volatile. */
13707 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13708 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
13709 return false;
13711 /* Check if the addresses are in the form of [base+offset]. */
13712 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13713 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13714 return false;
13715 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13716 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13717 return false;
13718 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13719 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13720 return false;
13721 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13722 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13723 return false;
13725 /* Check if the bases are the same. */
13726 if (!rtx_equal_p (base_1, base_2)
13727 || !rtx_equal_p (base_2, base_3)
13728 || !rtx_equal_p (base_3, base_4))
13729 return false;
13731 offval_1 = INTVAL (offset_1);
13732 offval_2 = INTVAL (offset_2);
13733 offval_3 = INTVAL (offset_3);
13734 offval_4 = INTVAL (offset_4);
13735 msize = GET_MODE_SIZE (mode);
13736 /* Check if the offsets are consecutive. */
13737 if ((offval_1 != (offval_2 + msize)
13738 || offval_1 != (offval_3 + msize * 2)
13739 || offval_1 != (offval_4 + msize * 3))
13740 && (offval_4 != (offval_3 + msize)
13741 || offval_4 != (offval_2 + msize * 2)
13742 || offval_4 != (offval_1 + msize * 3)))
13743 return false;
13745 /* Check if the addresses are clobbered by load. */
13746 if (load)
13748 if (reg_mentioned_p (reg_1, mem_1)
13749 || reg_mentioned_p (reg_2, mem_2)
13750 || reg_mentioned_p (reg_3, mem_3))
13751 return false;
13753 /* In increasing order, the last load can clobber the address. */
13754 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13755 return false;
13758 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13759 rclass_1 = FP_REGS;
13760 else
13761 rclass_1 = GENERAL_REGS;
13763 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13764 rclass_2 = FP_REGS;
13765 else
13766 rclass_2 = GENERAL_REGS;
13768 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13769 rclass_3 = FP_REGS;
13770 else
13771 rclass_3 = GENERAL_REGS;
13773 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13774 rclass_4 = FP_REGS;
13775 else
13776 rclass_4 = GENERAL_REGS;
13778 /* Check if the registers are of the same class. */
13779 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
13780 return false;
13782 return true;
13785 /* Given OPERANDS of consecutive load/store, this function pairs them
13786 into ldp/stp after adjusting the offset. It depends on the fact
13787 that addresses of load/store instructions are in increasing order.
13788 MODE is the mode of memory operands. CODE is the rtl operator
13789 which should be applied to all memory operands; it is SIGN_EXTEND,
13790 ZERO_EXTEND or UNKNOWN. */
13792 bool
13793 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
13794 enum machine_mode mode, RTX_CODE code)
13796 rtx base, offset, t1, t2;
13797 rtx mem_1, mem_2, mem_3, mem_4;
13798 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
13800 if (load)
13802 mem_1 = operands[1];
13803 mem_2 = operands[3];
13804 mem_3 = operands[5];
13805 mem_4 = operands[7];
13807 else
13809 mem_1 = operands[0];
13810 mem_2 = operands[2];
13811 mem_3 = operands[4];
13812 mem_4 = operands[6];
13813 gcc_assert (code == UNKNOWN);
13816 extract_base_offset_in_addr (mem_1, &base, &offset);
13817 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
13819 /* Adjust the offset so it can fit in an ldp/stp instruction. */
13820 msize = GET_MODE_SIZE (mode);
13821 stp_off_limit = msize * 0x40;
13822 off_val = INTVAL (offset);
13823 abs_off = (off_val < 0) ? -off_val : off_val;
13824 new_off = abs_off % stp_off_limit;
13825 adj_off = abs_off - new_off;
13827 /* Further adjust to make sure all offsets are OK. */
13828 if ((new_off + msize * 2) >= stp_off_limit)
13830 adj_off += stp_off_limit;
13831 new_off -= stp_off_limit;
13834 /* Make sure the adjustment can be done with ADD/SUB instructions. */
13835 if (adj_off >= 0x1000)
13836 return false;
13838 if (off_val < 0)
13840 adj_off = -adj_off;
13841 new_off = -new_off;
13844 /* Create new memory references. */
13845 mem_1 = change_address (mem_1, VOIDmode,
13846 plus_constant (DImode, operands[8], new_off));
13848 /* Check if the adjusted address is OK for ldp/stp. */
13849 if (!aarch64_mem_pair_operand (mem_1, mode))
13850 return false;
13852 msize = GET_MODE_SIZE (mode);
13853 mem_2 = change_address (mem_2, VOIDmode,
13854 plus_constant (DImode,
13855 operands[8],
13856 new_off + msize));
13857 mem_3 = change_address (mem_3, VOIDmode,
13858 plus_constant (DImode,
13859 operands[8],
13860 new_off + msize * 2));
13861 mem_4 = change_address (mem_4, VOIDmode,
13862 plus_constant (DImode,
13863 operands[8],
13864 new_off + msize * 3));
13866 if (code == ZERO_EXTEND)
13868 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
13869 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
13870 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
13871 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
13873 else if (code == SIGN_EXTEND)
13875 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
13876 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
13877 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
13878 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
13881 if (load)
13883 operands[1] = mem_1;
13884 operands[3] = mem_2;
13885 operands[5] = mem_3;
13886 operands[7] = mem_4;
13888 else
13890 operands[0] = mem_1;
13891 operands[2] = mem_2;
13892 operands[4] = mem_3;
13893 operands[6] = mem_4;
13896 /* Emit adjusting instruction. */
13897 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
13898 /* Emit ldp/stp instructions. */
13899 t1 = gen_rtx_SET (operands[0], operands[1]);
13900 t2 = gen_rtx_SET (operands[2], operands[3]);
13901 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13902 t1 = gen_rtx_SET (operands[4], operands[5]);
13903 t2 = gen_rtx_SET (operands[6], operands[7]);
13904 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13905 return true;
13908 /* Return true if a pseudo register should be created and used to hold
13909 the GOT address for PIC code. */
13911 bool
13912 aarch64_use_pseudo_pic_reg (void)
13914 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
13917 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
13919 static int
13920 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
13922 switch (XINT (x, 1))
13924 case UNSPEC_GOTSMALLPIC:
13925 case UNSPEC_GOTSMALLPIC28K:
13926 case UNSPEC_GOTTINYPIC:
13927 return 0;
13928 default:
13929 break;
13932 return default_unspec_may_trap_p (x, flags);
13936 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
13937 return the log2 of that value. Otherwise return -1. */
13939 int
13940 aarch64_fpconst_pow_of_2 (rtx x)
13941 {
13942 const REAL_VALUE_TYPE *r;
13944 if (!CONST_DOUBLE_P (x))
13945 return -1;
13947 r = CONST_DOUBLE_REAL_VALUE (x);
13949 if (REAL_VALUE_NEGATIVE (*r)
13950 || REAL_VALUE_ISNAN (*r)
13951 || REAL_VALUE_ISINF (*r)
13952 || !real_isinteger (r, DFmode))
13953 return -1;
13955 return exact_log2 (real_to_integer (r));
13956 }
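/* Example results for aarch64_fpconst_pow_of_2 (illustrative, not
   exhaustive): a CONST_DOUBLE holding 8.0 gives 3 and 1.0 gives 0, while
   6.0, -4.0 and 0.5 all give -1, since they are not positive integral
   powers of 2. */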
13958 /* If X is a vector of equal CONST_DOUBLE values and that value is
13959 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
13961 int
13962 aarch64_vec_fpconst_pow_of_2 (rtx x)
13963 {
13964 if (GET_CODE (x) != CONST_VECTOR)
13965 return -1;
13967 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
13968 return -1;
13970 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
13971 if (firstval <= 0)
13972 return -1;
13974 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
13975 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
13976 return -1;
13978 return firstval;
13979 }
13981 /* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float. */
13982 static tree
13983 aarch64_promoted_type (const_tree t)
13984 {
13985 if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
13986 return float_type_node;
13987 return NULL_TREE;
13988 }
13990 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
13992 static bool
13993 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
13994 optimization_type opt_type)
13995 {
13996 switch (op)
13997 {
13998 case rsqrt_optab:
13999 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14001 default:
14002 return true;
14003 }
14004 }
14006 #undef TARGET_ADDRESS_COST
14007 #define TARGET_ADDRESS_COST aarch64_address_cost
14009 /* This hook determines whether unnamed bitfields affect the alignment
14010 of the containing structure. The hook returns true if the structure
14011 should inherit the alignment requirements of an unnamed bitfield's
14012 type. */
14013 #undef TARGET_ALIGN_ANON_BITFIELD
14014 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
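/* Illustrative consequence of returning true here (type and layout chosen
   only as an example): in
   struct s { char c; long long : 1; };
   the unnamed bit-field contributes the alignment of long long, so the
   structure is expected to be 8-byte aligned under the AAPCS64 rather than
   having the alignment of char alone. */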
14016 #undef TARGET_ASM_ALIGNED_DI_OP
14017 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14019 #undef TARGET_ASM_ALIGNED_HI_OP
14020 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14022 #undef TARGET_ASM_ALIGNED_SI_OP
14023 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14025 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14026 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14027 hook_bool_const_tree_hwi_hwi_const_tree_true
14029 #undef TARGET_ASM_FILE_START
14030 #define TARGET_ASM_FILE_START aarch64_start_file
14032 #undef TARGET_ASM_OUTPUT_MI_THUNK
14033 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14035 #undef TARGET_ASM_SELECT_RTX_SECTION
14036 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14038 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14039 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14041 #undef TARGET_BUILD_BUILTIN_VA_LIST
14042 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14044 #undef TARGET_CALLEE_COPIES
14045 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14047 #undef TARGET_CAN_ELIMINATE
14048 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14050 #undef TARGET_CAN_INLINE_P
14051 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14053 #undef TARGET_CANNOT_FORCE_CONST_MEM
14054 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14056 #undef TARGET_CASE_VALUES_THRESHOLD
14057 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14059 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14060 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14062 /* Only the least significant bit is used for initialization guard
14063 variables. */
14064 #undef TARGET_CXX_GUARD_MASK_BIT
14065 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14067 #undef TARGET_C_MODE_FOR_SUFFIX
14068 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14070 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14071 #undef TARGET_DEFAULT_TARGET_FLAGS
14072 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14073 #endif
14075 #undef TARGET_CLASS_MAX_NREGS
14076 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14078 #undef TARGET_BUILTIN_DECL
14079 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14081 #undef TARGET_BUILTIN_RECIPROCAL
14082 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14084 #undef TARGET_EXPAND_BUILTIN
14085 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14087 #undef TARGET_EXPAND_BUILTIN_VA_START
14088 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14090 #undef TARGET_FOLD_BUILTIN
14091 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14093 #undef TARGET_FUNCTION_ARG
14094 #define TARGET_FUNCTION_ARG aarch64_function_arg
14096 #undef TARGET_FUNCTION_ARG_ADVANCE
14097 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14099 #undef TARGET_FUNCTION_ARG_BOUNDARY
14100 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14102 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14103 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14105 #undef TARGET_FUNCTION_VALUE
14106 #define TARGET_FUNCTION_VALUE aarch64_function_value
14108 #undef TARGET_FUNCTION_VALUE_REGNO_P
14109 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14111 #undef TARGET_FRAME_POINTER_REQUIRED
14112 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14114 #undef TARGET_GIMPLE_FOLD_BUILTIN
14115 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14117 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14118 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14120 #undef TARGET_INIT_BUILTINS
14121 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14123 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14124 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14125 aarch64_ira_change_pseudo_allocno_class
14127 #undef TARGET_LEGITIMATE_ADDRESS_P
14128 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14130 #undef TARGET_LEGITIMATE_CONSTANT_P
14131 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14133 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14134 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14136 #undef TARGET_LRA_P
14137 #define TARGET_LRA_P hook_bool_void_true
14139 #undef TARGET_MANGLE_TYPE
14140 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14142 #undef TARGET_MEMORY_MOVE_COST
14143 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14145 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14146 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14148 #undef TARGET_MUST_PASS_IN_STACK
14149 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14151 /* This target hook should return true if accesses to volatile bitfields
14152 should use the narrowest mode possible. It should return false if these
14153 accesses should use the bitfield container type. */
14154 #undef TARGET_NARROW_VOLATILE_BITFIELD
14155 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
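/* Illustrative consequence of returning false here: a volatile
   unsigned-int bit-field is accessed through its 32-bit container, so
   reading a 3-bit field is expected to use a full 32-bit load rather than
   the narrowest byte or halfword access that would cover it. */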
14157 #undef TARGET_OPTION_OVERRIDE
14158 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14160 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14161 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14162 aarch64_override_options_after_change
14164 #undef TARGET_OPTION_SAVE
14165 #define TARGET_OPTION_SAVE aarch64_option_save
14167 #undef TARGET_OPTION_RESTORE
14168 #define TARGET_OPTION_RESTORE aarch64_option_restore
14170 #undef TARGET_OPTION_PRINT
14171 #define TARGET_OPTION_PRINT aarch64_option_print
14173 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14174 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14176 #undef TARGET_SET_CURRENT_FUNCTION
14177 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14179 #undef TARGET_PASS_BY_REFERENCE
14180 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14182 #undef TARGET_PREFERRED_RELOAD_CLASS
14183 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14185 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14186 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14188 #undef TARGET_PROMOTED_TYPE
14189 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14191 #undef TARGET_SECONDARY_RELOAD
14192 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14194 #undef TARGET_SHIFT_TRUNCATION_MASK
14195 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14197 #undef TARGET_SETUP_INCOMING_VARARGS
14198 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14200 #undef TARGET_STRUCT_VALUE_RTX
14201 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
14203 #undef TARGET_REGISTER_MOVE_COST
14204 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14206 #undef TARGET_RETURN_IN_MEMORY
14207 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14209 #undef TARGET_RETURN_IN_MSB
14210 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14212 #undef TARGET_RTX_COSTS
14213 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14215 #undef TARGET_SCHED_ISSUE_RATE
14216 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14218 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14219 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14220 aarch64_sched_first_cycle_multipass_dfa_lookahead
14222 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14223 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14224 aarch64_first_cycle_multipass_dfa_lookahead_guard
14226 #undef TARGET_TRAMPOLINE_INIT
14227 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14229 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14230 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14232 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14233 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14235 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14236 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
14238 #undef TARGET_VECTORIZE_ADD_STMT_COST
14239 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
14241 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
14242 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
14243 aarch64_builtin_vectorization_cost
14245 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
14246 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
14248 #undef TARGET_VECTORIZE_BUILTINS
14249 #define TARGET_VECTORIZE_BUILTINS
14251 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
14252 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
14253 aarch64_builtin_vectorized_function
14255 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14256 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14257 aarch64_autovectorize_vector_sizes
14259 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14260 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14261 aarch64_atomic_assign_expand_fenv
14263 /* Section anchor support. */
14265 #undef TARGET_MIN_ANCHOR_OFFSET
14266 #define TARGET_MIN_ANCHOR_OFFSET -256
14268 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14269 byte offset; we can do much more for larger data types, but have no way
14270 to determine the size of the access. We assume accesses are aligned. */
14271 #undef TARGET_MAX_ANCHOR_OFFSET
14272 #define TARGET_MAX_ANCHOR_OFFSET 4095
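/* These bounds track the AArch64 addressing modes (offered here as the
   assumed rationale rather than stated in the original comments): unscaled
   LDUR/STUR accept signed 9-bit offsets of -256..255, matching
   TARGET_MIN_ANCHOR_OFFSET above, while byte loads such as LDRB accept
   unsigned 12-bit offsets of 0..4095, matching the 4k-1 limit. */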
14274 #undef TARGET_VECTOR_ALIGNMENT
14275 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14277 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14278 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14279 aarch64_simd_vector_alignment_reachable
14281 /* vec_perm support. */
14283 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14284 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14285 aarch64_vectorize_vec_perm_const_ok
14287 #undef TARGET_INIT_LIBFUNCS
14288 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14290 #undef TARGET_FIXED_CONDITION_CODE_REGS
14291 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14293 #undef TARGET_FLAGS_REGNUM
14294 #define TARGET_FLAGS_REGNUM CC_REGNUM
14296 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
14297 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
14299 #undef TARGET_ASAN_SHADOW_OFFSET
14300 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
14302 #undef TARGET_LEGITIMIZE_ADDRESS
14303 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
14305 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
14306 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
14307 aarch64_use_by_pieces_infrastructure_p
14309 #undef TARGET_CAN_USE_DOLOOP_P
14310 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
14312 #undef TARGET_SCHED_MACRO_FUSION_P
14313 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
14315 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
14316 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
14318 #undef TARGET_SCHED_FUSION_PRIORITY
14319 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
14321 #undef TARGET_UNSPEC_MAY_TRAP_P
14322 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
14324 #undef TARGET_USE_PSEUDO_PIC_REG
14325 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
14327 #undef TARGET_PRINT_OPERAND
14328 #define TARGET_PRINT_OPERAND aarch64_print_operand
14330 #undef TARGET_PRINT_OPERAND_ADDRESS
14331 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
14333 #undef TARGET_OPTAB_SUPPORTED_P
14334 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
14336 #undef TARGET_OMIT_STRUCT_RETURN_REG
14337 #define TARGET_OMIT_STRUCT_RETURN_REG true
14339 struct gcc_target targetm = TARGET_INITIALIZER;
14341 #include "gt-aarch64.h"