[AArch64] Add ANDS pattern for CMP+ZERO_EXTEND
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob e813d66b40a6a9abce0a913f3d643153917bfd05
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2016 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "diagnostic.h"
40 #include "insn-attr.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "stor-layout.h"
44 #include "calls.h"
45 #include "varasm.h"
46 #include "output.h"
47 #include "flags.h"
48 #include "explow.h"
49 #include "expr.h"
50 #include "reload.h"
51 #include "langhooks.h"
52 #include "opts.h"
53 #include "params.h"
54 #include "gimplify.h"
55 #include "dwarf2.h"
56 #include "gimple-iterator.h"
57 #include "tree-vectorizer.h"
58 #include "aarch64-cost-tables.h"
59 #include "dumpfile.h"
60 #include "builtins.h"
61 #include "rtl-iter.h"
62 #include "tm-constrs.h"
63 #include "sched-int.h"
64 #include "cortex-a57-fma-steering.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
68 /* This file should be included last. */
69 #include "target-def.h"
71 /* Defined for convenience. */
72 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
74 /* Classifies an address.
76 ADDRESS_REG_IMM
77 A simple base register plus immediate offset.
79 ADDRESS_REG_WB
80 A base register indexed by immediate offset with writeback.
82 ADDRESS_REG_REG
83 A base register indexed by (optionally scaled) register.
85 ADDRESS_REG_UXTW
86 A base register indexed by (optionally scaled) zero-extended register.
88 ADDRESS_REG_SXTW
89 A base register indexed by (optionally scaled) sign-extended register.
91 ADDRESS_LO_SUM
92 A LO_SUM rtx with a base register and "LO12" symbol relocation.
94 ADDRESS_SYMBOLIC:
95 A constant symbolic address, in pc-relative literal pool. */
97 enum aarch64_address_type {
98 ADDRESS_REG_IMM,
99 ADDRESS_REG_WB,
100 ADDRESS_REG_REG,
101 ADDRESS_REG_UXTW,
102 ADDRESS_REG_SXTW,
103 ADDRESS_LO_SUM,
104 ADDRESS_SYMBOLIC
107 struct aarch64_address_info {
108 enum aarch64_address_type type;
109 rtx base;
110 rtx offset;
111 int shift;
112 enum aarch64_symbol_type symbol_type;
115 struct simd_immediate_info
117 rtx value;
118 int shift;
119 int element_width;
120 bool mvn;
121 bool msl;
124 /* The current code model. */
125 enum aarch64_code_model aarch64_cmodel;
127 #ifdef HAVE_AS_TLS
128 #undef TARGET_HAVE_TLS
129 #define TARGET_HAVE_TLS 1
130 #endif
132 static bool aarch64_composite_type_p (const_tree, machine_mode);
133 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
134 const_tree,
135 machine_mode *, int *,
136 bool *);
137 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
139 static void aarch64_override_options_after_change (void);
140 static bool aarch64_vector_mode_supported_p (machine_mode);
141 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
142 const unsigned char *sel);
143 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
145 /* Major revision number of the ARM Architecture implemented by the target. */
146 unsigned aarch64_architecture_version;
148 /* The processor for which instructions should be scheduled. */
149 enum aarch64_processor aarch64_tune = cortexa53;
151 /* Mask to specify which instruction scheduling options should be used. */
152 unsigned long aarch64_tune_flags = 0;
154 /* Global flag for PC relative loads. */
155 bool aarch64_pcrelative_literal_loads;
157 /* Support for command line parsing of boolean flags in the tuning
158 structures. */
159 struct aarch64_flag_desc
161 const char* name;
162 unsigned int flag;
165 #define AARCH64_FUSION_PAIR(name, internal_name) \
166 { name, AARCH64_FUSE_##internal_name },
167 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
169 { "none", AARCH64_FUSE_NOTHING },
170 #include "aarch64-fusion-pairs.def"
171 { "all", AARCH64_FUSE_ALL },
172 { NULL, AARCH64_FUSE_NOTHING }
 174 #undef AARCH64_FUSION_PAIR
176 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
177 { name, AARCH64_EXTRA_TUNE_##internal_name },
178 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
180 { "none", AARCH64_EXTRA_TUNE_NONE },
181 #include "aarch64-tuning-flags.def"
182 { "all", AARCH64_EXTRA_TUNE_ALL },
183 { NULL, AARCH64_EXTRA_TUNE_NONE }
185 #undef AARCH64_EXTRA_TUNING_OPTION
187 /* Tuning parameters. */
189 static const struct cpu_addrcost_table generic_addrcost_table =
192 0, /* hi */
193 0, /* si */
194 0, /* di */
195 0, /* ti */
197 0, /* pre_modify */
198 0, /* post_modify */
199 0, /* register_offset */
200 0, /* register_sextend */
201 0, /* register_zextend */
202 0 /* imm_offset */
205 static const struct cpu_addrcost_table cortexa57_addrcost_table =
208 1, /* hi */
209 0, /* si */
210 0, /* di */
211 1, /* ti */
213 0, /* pre_modify */
214 0, /* post_modify */
215 0, /* register_offset */
216 0, /* register_sextend */
217 0, /* register_zextend */
218 0, /* imm_offset */
221 static const struct cpu_addrcost_table exynosm1_addrcost_table =
224 0, /* hi */
225 0, /* si */
226 0, /* di */
227 2, /* ti */
229 0, /* pre_modify */
230 0, /* post_modify */
231 1, /* register_offset */
232 1, /* register_sextend */
233 2, /* register_zextend */
234 0, /* imm_offset */
237 static const struct cpu_addrcost_table xgene1_addrcost_table =
240 1, /* hi */
241 0, /* si */
242 0, /* di */
243 1, /* ti */
245 1, /* pre_modify */
246 0, /* post_modify */
247 0, /* register_offset */
248 1, /* register_sextend */
249 1, /* register_zextend */
250 0, /* imm_offset */
253 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
256 1, /* hi */
257 0, /* si */
258 0, /* di */
259 1, /* ti */
261 0, /* pre_modify */
262 0, /* post_modify */
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
266 0 /* imm_offset */
269 static const struct cpu_addrcost_table vulcan_addrcost_table =
272 0, /* hi */
273 0, /* si */
274 0, /* di */
275 2, /* ti */
277 0, /* pre_modify */
278 0, /* post_modify */
279 2, /* register_offset */
280 3, /* register_sextend */
281 3, /* register_zextend */
282 0, /* imm_offset */
285 static const struct cpu_regmove_cost generic_regmove_cost =
287 1, /* GP2GP */
288 /* Avoid the use of slow int<->fp moves for spilling by setting
289 their cost higher than memmov_cost. */
290 5, /* GP2FP */
291 5, /* FP2GP */
292 2 /* FP2FP */
295 static const struct cpu_regmove_cost cortexa57_regmove_cost =
297 1, /* GP2GP */
298 /* Avoid the use of slow int<->fp moves for spilling by setting
299 their cost higher than memmov_cost. */
300 5, /* GP2FP */
301 5, /* FP2GP */
302 2 /* FP2FP */
305 static const struct cpu_regmove_cost cortexa53_regmove_cost =
307 1, /* GP2GP */
308 /* Avoid the use of slow int<->fp moves for spilling by setting
309 their cost higher than memmov_cost. */
310 5, /* GP2FP */
311 5, /* FP2GP */
312 2 /* FP2FP */
315 static const struct cpu_regmove_cost exynosm1_regmove_cost =
317 1, /* GP2GP */
318 /* Avoid the use of slow int<->fp moves for spilling by setting
 319 their cost higher than memmov_cost (actually 4 and 9). */
320 9, /* GP2FP */
321 9, /* FP2GP */
322 1 /* FP2FP */
325 static const struct cpu_regmove_cost thunderx_regmove_cost =
327 2, /* GP2GP */
328 2, /* GP2FP */
329 6, /* FP2GP */
330 4 /* FP2FP */
333 static const struct cpu_regmove_cost xgene1_regmove_cost =
335 1, /* GP2GP */
336 /* Avoid the use of slow int<->fp moves for spilling by setting
337 their cost higher than memmov_cost. */
338 8, /* GP2FP */
339 8, /* FP2GP */
340 2 /* FP2FP */
343 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
345 2, /* GP2GP */
346 /* Avoid the use of int<->fp moves for spilling. */
347 6, /* GP2FP */
348 6, /* FP2GP */
349 4 /* FP2FP */
352 static const struct cpu_regmove_cost vulcan_regmove_cost =
354 1, /* GP2GP */
355 /* Avoid the use of int<->fp moves for spilling. */
356 8, /* GP2FP */
357 8, /* FP2GP */
358 4 /* FP2FP */
361 /* Generic costs for vector insn classes. */
362 static const struct cpu_vector_cost generic_vector_cost =
364 1, /* scalar_stmt_cost */
365 1, /* scalar_load_cost */
366 1, /* scalar_store_cost */
367 1, /* vec_stmt_cost */
368 2, /* vec_permute_cost */
369 1, /* vec_to_scalar_cost */
370 1, /* scalar_to_vec_cost */
371 1, /* vec_align_load_cost */
372 1, /* vec_unalign_load_cost */
373 1, /* vec_unalign_store_cost */
374 1, /* vec_store_cost */
375 3, /* cond_taken_branch_cost */
376 1 /* cond_not_taken_branch_cost */
379 /* ThunderX costs for vector insn classes. */
380 static const struct cpu_vector_cost thunderx_vector_cost =
382 1, /* scalar_stmt_cost */
383 3, /* scalar_load_cost */
384 1, /* scalar_store_cost */
385 4, /* vec_stmt_cost */
386 4, /* vec_permute_cost */
387 2, /* vec_to_scalar_cost */
388 2, /* scalar_to_vec_cost */
389 3, /* vec_align_load_cost */
390 10, /* vec_unalign_load_cost */
391 10, /* vec_unalign_store_cost */
392 1, /* vec_store_cost */
393 3, /* cond_taken_branch_cost */
394 3 /* cond_not_taken_branch_cost */
 397 /* Costs for vector insn classes for Cortex-A57. */
398 static const struct cpu_vector_cost cortexa57_vector_cost =
400 1, /* scalar_stmt_cost */
401 4, /* scalar_load_cost */
402 1, /* scalar_store_cost */
403 3, /* vec_stmt_cost */
404 3, /* vec_permute_cost */
405 8, /* vec_to_scalar_cost */
406 8, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
415 static const struct cpu_vector_cost exynosm1_vector_cost =
417 1, /* scalar_stmt_cost */
418 5, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 3, /* vec_stmt_cost */
421 3, /* vec_permute_cost */
422 3, /* vec_to_scalar_cost */
423 3, /* scalar_to_vec_cost */
424 5, /* vec_align_load_cost */
425 5, /* vec_unalign_load_cost */
426 1, /* vec_unalign_store_cost */
427 1, /* vec_store_cost */
428 1, /* cond_taken_branch_cost */
429 1 /* cond_not_taken_branch_cost */
 432 /* Costs for vector insn classes for X-Gene 1. */
433 static const struct cpu_vector_cost xgene1_vector_cost =
435 1, /* scalar_stmt_cost */
436 5, /* scalar_load_cost */
437 1, /* scalar_store_cost */
438 2, /* vec_stmt_cost */
439 2, /* vec_permute_cost */
440 4, /* vec_to_scalar_cost */
441 4, /* scalar_to_vec_cost */
442 10, /* vec_align_load_cost */
443 10, /* vec_unalign_load_cost */
444 2, /* vec_unalign_store_cost */
445 2, /* vec_store_cost */
446 2, /* cond_taken_branch_cost */
447 1 /* cond_not_taken_branch_cost */
450 /* Costs for vector insn classes for Vulcan. */
451 static const struct cpu_vector_cost vulcan_vector_cost =
453 6, /* scalar_stmt_cost */
454 4, /* scalar_load_cost */
455 1, /* scalar_store_cost */
456 6, /* vec_stmt_cost */
457 3, /* vec_permute_cost */
458 6, /* vec_to_scalar_cost */
459 5, /* scalar_to_vec_cost */
460 8, /* vec_align_load_cost */
461 8, /* vec_unalign_load_cost */
462 4, /* vec_unalign_store_cost */
463 4, /* vec_store_cost */
464 2, /* cond_taken_branch_cost */
465 1 /* cond_not_taken_branch_cost */
468 /* Generic costs for branch instructions. */
469 static const struct cpu_branch_cost generic_branch_cost =
471 2, /* Predictable. */
472 2 /* Unpredictable. */
475 /* Branch costs for Cortex-A57. */
476 static const struct cpu_branch_cost cortexa57_branch_cost =
478 1, /* Predictable. */
479 3 /* Unpredictable. */
482 /* Branch costs for Vulcan. */
483 static const struct cpu_branch_cost vulcan_branch_cost =
485 1, /* Predictable. */
486 3 /* Unpredictable. */
489 /* Generic approximation modes. */
490 static const cpu_approx_modes generic_approx_modes =
492 AARCH64_APPROX_NONE, /* division */
493 AARCH64_APPROX_NONE, /* sqrt */
494 AARCH64_APPROX_NONE /* recip_sqrt */
497 /* Approximation modes for Exynos M1. */
498 static const cpu_approx_modes exynosm1_approx_modes =
500 AARCH64_APPROX_NONE, /* division */
501 AARCH64_APPROX_ALL, /* sqrt */
502 AARCH64_APPROX_ALL /* recip_sqrt */
505 /* Approximation modes for X-Gene 1. */
506 static const cpu_approx_modes xgene1_approx_modes =
508 AARCH64_APPROX_NONE, /* division */
509 AARCH64_APPROX_NONE, /* sqrt */
510 AARCH64_APPROX_ALL /* recip_sqrt */
513 static const struct tune_params generic_tunings =
515 &cortexa57_extra_costs,
516 &generic_addrcost_table,
517 &generic_regmove_cost,
518 &generic_vector_cost,
519 &generic_branch_cost,
520 &generic_approx_modes,
521 4, /* memmov_cost */
522 2, /* issue_rate */
523 AARCH64_FUSE_NOTHING, /* fusible_ops */
524 8, /* function_align. */
525 8, /* jump_align. */
526 4, /* loop_align. */
527 2, /* int_reassoc_width. */
528 4, /* fp_reassoc_width. */
529 1, /* vec_reassoc_width. */
530 2, /* min_div_recip_mul_sf. */
531 2, /* min_div_recip_mul_df. */
532 0, /* max_case_values. */
533 0, /* cache_line_size. */
534 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
535 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
538 static const struct tune_params cortexa35_tunings =
540 &cortexa53_extra_costs,
541 &generic_addrcost_table,
542 &cortexa53_regmove_cost,
543 &generic_vector_cost,
544 &cortexa57_branch_cost,
545 &generic_approx_modes,
546 4, /* memmov_cost */
547 1, /* issue_rate */
548 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
549 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
550 16, /* function_align. */
551 8, /* jump_align. */
552 8, /* loop_align. */
553 2, /* int_reassoc_width. */
554 4, /* fp_reassoc_width. */
555 1, /* vec_reassoc_width. */
556 2, /* min_div_recip_mul_sf. */
557 2, /* min_div_recip_mul_df. */
558 0, /* max_case_values. */
559 0, /* cache_line_size. */
560 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
561 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
564 static const struct tune_params cortexa53_tunings =
566 &cortexa53_extra_costs,
567 &generic_addrcost_table,
568 &cortexa53_regmove_cost,
569 &generic_vector_cost,
570 &cortexa57_branch_cost,
571 &generic_approx_modes,
572 4, /* memmov_cost */
573 2, /* issue_rate */
574 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
575 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
576 16, /* function_align. */
577 8, /* jump_align. */
578 8, /* loop_align. */
579 2, /* int_reassoc_width. */
580 4, /* fp_reassoc_width. */
581 1, /* vec_reassoc_width. */
582 2, /* min_div_recip_mul_sf. */
583 2, /* min_div_recip_mul_df. */
584 0, /* max_case_values. */
585 0, /* cache_line_size. */
586 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
587 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
590 static const struct tune_params cortexa57_tunings =
592 &cortexa57_extra_costs,
593 &cortexa57_addrcost_table,
594 &cortexa57_regmove_cost,
595 &cortexa57_vector_cost,
596 &cortexa57_branch_cost,
597 &generic_approx_modes,
598 4, /* memmov_cost */
599 3, /* issue_rate */
600 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
601 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
602 16, /* function_align. */
603 8, /* jump_align. */
604 8, /* loop_align. */
605 2, /* int_reassoc_width. */
606 4, /* fp_reassoc_width. */
607 1, /* vec_reassoc_width. */
608 2, /* min_div_recip_mul_sf. */
609 2, /* min_div_recip_mul_df. */
610 0, /* max_case_values. */
611 0, /* cache_line_size. */
612 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
613 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
616 static const struct tune_params cortexa72_tunings =
618 &cortexa57_extra_costs,
619 &cortexa57_addrcost_table,
620 &cortexa57_regmove_cost,
621 &cortexa57_vector_cost,
622 &cortexa57_branch_cost,
623 &generic_approx_modes,
624 4, /* memmov_cost */
625 3, /* issue_rate */
626 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
627 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
628 16, /* function_align. */
629 8, /* jump_align. */
630 8, /* loop_align. */
631 2, /* int_reassoc_width. */
632 4, /* fp_reassoc_width. */
633 1, /* vec_reassoc_width. */
634 2, /* min_div_recip_mul_sf. */
635 2, /* min_div_recip_mul_df. */
636 0, /* max_case_values. */
637 0, /* cache_line_size. */
638 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
639 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
642 static const struct tune_params cortexa73_tunings =
644 &cortexa57_extra_costs,
645 &cortexa57_addrcost_table,
646 &cortexa57_regmove_cost,
647 &cortexa57_vector_cost,
648 &cortexa57_branch_cost,
649 &generic_approx_modes,
650 4, /* memmov_cost. */
651 2, /* issue_rate. */
652 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
653 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
654 16, /* function_align. */
655 8, /* jump_align. */
656 8, /* loop_align. */
657 2, /* int_reassoc_width. */
658 4, /* fp_reassoc_width. */
659 1, /* vec_reassoc_width. */
660 2, /* min_div_recip_mul_sf. */
661 2, /* min_div_recip_mul_df. */
662 0, /* max_case_values. */
663 0, /* cache_line_size. */
664 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
665 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
668 static const struct tune_params exynosm1_tunings =
670 &exynosm1_extra_costs,
671 &exynosm1_addrcost_table,
672 &exynosm1_regmove_cost,
673 &exynosm1_vector_cost,
674 &generic_branch_cost,
675 &exynosm1_approx_modes,
676 4, /* memmov_cost */
677 3, /* issue_rate */
678 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
679 4, /* function_align. */
680 4, /* jump_align. */
681 4, /* loop_align. */
682 2, /* int_reassoc_width. */
683 4, /* fp_reassoc_width. */
684 1, /* vec_reassoc_width. */
685 2, /* min_div_recip_mul_sf. */
686 2, /* min_div_recip_mul_df. */
687 48, /* max_case_values. */
688 64, /* cache_line_size. */
689 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
690 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
693 static const struct tune_params thunderx_tunings =
695 &thunderx_extra_costs,
696 &generic_addrcost_table,
697 &thunderx_regmove_cost,
698 &thunderx_vector_cost,
699 &generic_branch_cost,
700 &generic_approx_modes,
701 6, /* memmov_cost */
702 2, /* issue_rate */
703 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
704 8, /* function_align. */
705 8, /* jump_align. */
706 8, /* loop_align. */
707 2, /* int_reassoc_width. */
708 4, /* fp_reassoc_width. */
709 1, /* vec_reassoc_width. */
710 2, /* min_div_recip_mul_sf. */
711 2, /* min_div_recip_mul_df. */
712 0, /* max_case_values. */
713 0, /* cache_line_size. */
714 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
715 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
718 static const struct tune_params xgene1_tunings =
720 &xgene1_extra_costs,
721 &xgene1_addrcost_table,
722 &xgene1_regmove_cost,
723 &xgene1_vector_cost,
724 &generic_branch_cost,
725 &xgene1_approx_modes,
726 6, /* memmov_cost */
727 4, /* issue_rate */
728 AARCH64_FUSE_NOTHING, /* fusible_ops */
729 16, /* function_align. */
730 8, /* jump_align. */
731 16, /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 0, /* cache_line_size. */
739 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
743 static const struct tune_params qdf24xx_tunings =
745 &qdf24xx_extra_costs,
746 &qdf24xx_addrcost_table,
747 &qdf24xx_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
751 4, /* memmov_cost */
752 4, /* issue_rate */
753 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 754 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
755 16, /* function_align. */
756 8, /* jump_align. */
757 16, /* loop_align. */
758 2, /* int_reassoc_width. */
759 4, /* fp_reassoc_width. */
760 1, /* vec_reassoc_width. */
761 2, /* min_div_recip_mul_sf. */
762 2, /* min_div_recip_mul_df. */
763 0, /* max_case_values. */
764 64, /* cache_line_size. */
765 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
769 static const struct tune_params vulcan_tunings =
771 &vulcan_extra_costs,
772 &vulcan_addrcost_table,
773 &vulcan_regmove_cost,
774 &vulcan_vector_cost,
775 &vulcan_branch_cost,
776 &generic_approx_modes,
777 4, /* memmov_cost. */
778 4, /* issue_rate. */
 779 AARCH64_FUSE_NOTHING, /* fusible_ops. */
780 16, /* function_align. */
781 8, /* jump_align. */
782 16, /* loop_align. */
783 3, /* int_reassoc_width. */
784 2, /* fp_reassoc_width. */
785 2, /* vec_reassoc_width. */
786 2, /* min_div_recip_mul_sf. */
787 2, /* min_div_recip_mul_df. */
788 0, /* max_case_values. */
789 64, /* cache_line_size. */
790 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
791 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
794 /* Support for fine-grained override of the tuning structures. */
795 struct aarch64_tuning_override_function
797 const char* name;
798 void (*parse_override)(const char*, struct tune_params*);
801 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
802 static void aarch64_parse_tune_string (const char*, struct tune_params*);
804 static const struct aarch64_tuning_override_function
805 aarch64_tuning_override_functions[] =
807 { "fuse", aarch64_parse_fuse_string },
808 { "tune", aarch64_parse_tune_string },
809 { NULL, NULL }
812 /* A processor implementing AArch64. */
813 struct processor
815 const char *const name;
816 enum aarch64_processor ident;
817 enum aarch64_processor sched_core;
818 enum aarch64_arch arch;
819 unsigned architecture_version;
820 const unsigned long flags;
821 const struct tune_params *const tune;
824 /* Architectures implementing AArch64. */
825 static const struct processor all_architectures[] =
827 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
828 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
829 #include "aarch64-arches.def"
830 #undef AARCH64_ARCH
831 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
834 /* Processor cores implementing AArch64. */
835 static const struct processor all_cores[] =
837 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
838 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
839 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
840 FLAGS, &COSTS##_tunings},
841 #include "aarch64-cores.def"
842 #undef AARCH64_CORE
843 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
844 AARCH64_FL_FOR_ARCH8, &generic_tunings},
845 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
849 /* Target specification. These are populated by the -march, -mtune, -mcpu
850 handling code or by target attributes. */
851 static const struct processor *selected_arch;
852 static const struct processor *selected_cpu;
853 static const struct processor *selected_tune;
855 /* The current tuning set. */
856 struct tune_params aarch64_tune_params = generic_tunings;
858 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
860 /* An ISA extension in the co-processor and main instruction set space. */
861 struct aarch64_option_extension
863 const char *const name;
864 const unsigned long flags_on;
865 const unsigned long flags_off;
868 typedef enum aarch64_cond_code
870 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
871 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
872 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
874 aarch64_cc;
876 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
878 /* The condition codes of the processor, and the inverse function. */
879 static const char * const aarch64_condition_codes[] =
881 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
882 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
885 /* Generate code to enable conditional branches in functions over 1 MiB. */
886 const char *
887 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
888 const char * branch_format)
890 rtx_code_label * tmp_label = gen_label_rtx ();
891 char label_buf[256];
892 char buffer[128];
893 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
894 CODE_LABEL_NUMBER (tmp_label));
895 const char *label_ptr = targetm.strip_name_encoding (label_buf);
896 rtx dest_label = operands[pos_label];
897 operands[pos_label] = tmp_label;
899 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
900 output_asm_insn (buffer, operands);
902 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
903 operands[pos_label] = dest_label;
904 output_asm_insn (buffer, operands);
905 return "";
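/* A worked example, assuming the .md patterns pass the inverted condition in
   BRANCH_FORMAT: an out-of-range "cbz w0, .Ltarget" is emitted as
     cbnz  w0, .Lcb0
     b     .Ltarget
   .Lcb0:
   i.e. a short branch of the opposite sense skips over an unconditional
   branch that can reach the distant label.  */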
908 void
909 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
911 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
912 if (TARGET_GENERAL_REGS_ONLY)
913 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
914 else
915 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
918 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
919 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
920 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
921 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
922 cost (in this case the best class is the lowest cost one). Using ALL_REGS
 923 irrespective of its cost results in bad allocations with many redundant
924 int<->FP moves which are expensive on various cores.
925 To avoid this we don't allow ALL_REGS as the allocno class, but force a
926 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
927 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
928 Otherwise set the allocno class depending on the mode.
929 The result of this is that it is no longer inefficient to have a higher
930 memory move cost than the register move cost.
933 static reg_class_t
934 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
935 reg_class_t best_class)
937 enum machine_mode mode;
939 if (allocno_class != ALL_REGS)
940 return allocno_class;
942 if (best_class != ALL_REGS)
943 return best_class;
945 mode = PSEUDO_REGNO_MODE (regno);
946 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
949 static unsigned int
950 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
952 if (GET_MODE_UNIT_SIZE (mode) == 4)
953 return aarch64_tune_params.min_div_recip_mul_sf;
954 return aarch64_tune_params.min_div_recip_mul_df;
957 static int
958 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
959 enum machine_mode mode)
961 if (VECTOR_MODE_P (mode))
962 return aarch64_tune_params.vec_reassoc_width;
963 if (INTEGRAL_MODE_P (mode))
964 return aarch64_tune_params.int_reassoc_width;
965 if (FLOAT_MODE_P (mode))
966 return aarch64_tune_params.fp_reassoc_width;
967 return 1;
970 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
971 unsigned
972 aarch64_dbx_register_number (unsigned regno)
974 if (GP_REGNUM_P (regno))
975 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
976 else if (regno == SP_REGNUM)
977 return AARCH64_DWARF_SP;
978 else if (FP_REGNUM_P (regno))
979 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
981 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
982 equivalent DWARF register. */
983 return DWARF_FRAME_REGISTERS;
986 /* Return TRUE if MODE is any of the large INT modes. */
987 static bool
988 aarch64_vect_struct_mode_p (machine_mode mode)
990 return mode == OImode || mode == CImode || mode == XImode;
993 /* Return TRUE if MODE is any of the vector modes. */
994 static bool
995 aarch64_vector_mode_p (machine_mode mode)
997 return aarch64_vector_mode_supported_p (mode)
998 || aarch64_vect_struct_mode_p (mode);
1001 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1002 static bool
1003 aarch64_array_mode_supported_p (machine_mode mode,
1004 unsigned HOST_WIDE_INT nelems)
1006 if (TARGET_SIMD
1007 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1008 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1009 && (nelems >= 2 && nelems <= 4))
1010 return true;
1012 return false;
1015 /* Implement HARD_REGNO_NREGS. */
1018 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1020 switch (aarch64_regno_regclass (regno))
1022 case FP_REGS:
1023 case FP_LO_REGS:
1024 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1025 default:
1026 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1028 gcc_unreachable ();
1031 /* Implement HARD_REGNO_MODE_OK. */
1034 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1036 if (GET_MODE_CLASS (mode) == MODE_CC)
1037 return regno == CC_REGNUM;
1039 if (regno == SP_REGNUM)
1040 /* The purpose of comparing with ptr_mode is to support the
1041 global register variable associated with the stack pointer
1042 register via the syntax of asm ("wsp") in ILP32. */
1043 return mode == Pmode || mode == ptr_mode;
1045 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1046 return mode == Pmode;
1048 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1049 return 1;
1051 if (FP_REGNUM_P (regno))
1053 if (aarch64_vect_struct_mode_p (mode))
1054 return
1055 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1056 else
1057 return 1;
1060 return 0;
1063 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1064 machine_mode
1065 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1066 machine_mode mode)
1068 /* Handle modes that fit within single registers. */
1069 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1071 if (GET_MODE_SIZE (mode) >= 4)
1072 return mode;
1073 else
1074 return SImode;
1076 /* Fall back to generic for multi-reg and very large modes. */
1077 else
1078 return choose_hard_reg_mode (regno, nregs, false);
1081 /* Return true if calls to DECL should be treated as
1082 long-calls (ie called via a register). */
1083 static bool
1084 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1086 return false;
1089 /* Return true if calls to symbol-ref SYM should be treated as
1090 long-calls (ie called via a register). */
1091 bool
1092 aarch64_is_long_call_p (rtx sym)
1094 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1097 /* Return true if calls to symbol-ref SYM should not go through
1098 plt stubs. */
1100 bool
1101 aarch64_is_noplt_call_p (rtx sym)
1103 const_tree decl = SYMBOL_REF_DECL (sym);
1105 if (flag_pic
1106 && decl
1107 && (!flag_plt
1108 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1109 && !targetm.binds_local_p (decl))
1110 return true;
1112 return false;
1115 /* Return true if the offsets to a zero/sign-extract operation
1116 represent an expression that matches an extend operation. The
 1117 operands represent the parameters from
1119 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1120 bool
1121 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1122 rtx extract_imm)
1124 HOST_WIDE_INT mult_val, extract_val;
1126 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1127 return false;
1129 mult_val = INTVAL (mult_imm);
1130 extract_val = INTVAL (extract_imm);
1132 if (extract_val > 8
1133 && extract_val < GET_MODE_BITSIZE (mode)
1134 && exact_log2 (extract_val & ~7) > 0
1135 && (extract_val & 7) <= 4
1136 && mult_val == (1 << (extract_val & 7)))
1137 return true;
1139 return false;
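/* For example, in DImode a MULT_IMM of 4 with an EXTRACT_IMM of 34 passes the
   checks above: the low 34 bits of (reg * 4) equal (reg & 0xffffffff) << 2,
   i.e. a zero-extend of the low 32 bits followed by a left shift by 2, which
   is the extend form that a "uxtw #2" style operand provides.  */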
1142 /* Emit an insn that's a simple single-set. Both the operands must be
1143 known to be valid. */
1144 inline static rtx
1145 emit_set_insn (rtx x, rtx y)
1147 return emit_insn (gen_rtx_SET (x, y));
1150 /* X and Y are two things to compare using CODE. Emit the compare insn and
1151 return the rtx for register 0 in the proper mode. */
1153 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1155 machine_mode mode = SELECT_CC_MODE (code, x, y);
1156 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1158 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1159 return cc_reg;
1162 /* Build the SYMBOL_REF for __tls_get_addr. */
1164 static GTY(()) rtx tls_get_addr_libfunc;
1167 aarch64_tls_get_addr (void)
1169 if (!tls_get_addr_libfunc)
1170 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1171 return tls_get_addr_libfunc;
1174 /* Return the TLS model to use for ADDR. */
1176 static enum tls_model
1177 tls_symbolic_operand_type (rtx addr)
1179 enum tls_model tls_kind = TLS_MODEL_NONE;
1180 rtx sym, addend;
1182 if (GET_CODE (addr) == CONST)
1184 split_const (addr, &sym, &addend);
1185 if (GET_CODE (sym) == SYMBOL_REF)
1186 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1188 else if (GET_CODE (addr) == SYMBOL_REF)
1189 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1191 return tls_kind;
1194 /* We'll allow lo_sum's in addresses in our legitimate addresses
1195 so that combine would take care of combining addresses where
1196 necessary, but for generation purposes, we'll generate the address
1197 as :
1198 RTL Absolute
1199 tmp = hi (symbol_ref); adrp x1, foo
1200 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1203 PIC TLS
1204 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1205 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1206 bl __tls_get_addr
1209 Load TLS symbol, depending on TLS mechanism and TLS access model.
1211 Global Dynamic - Traditional TLS:
1212 adrp tmp, :tlsgd:imm
1213 add dest, tmp, #:tlsgd_lo12:imm
1214 bl __tls_get_addr
1216 Global Dynamic - TLS Descriptors:
1217 adrp dest, :tlsdesc:imm
1218 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1219 add dest, dest, #:tlsdesc_lo12:imm
1220 blr tmp
1221 mrs tp, tpidr_el0
1222 add dest, dest, tp
1224 Initial Exec:
1225 mrs tp, tpidr_el0
1226 adrp tmp, :gottprel:imm
1227 ldr dest, [tmp, #:gottprel_lo12:imm]
1228 add dest, dest, tp
1230 Local Exec:
1231 mrs tp, tpidr_el0
1232 add t0, tp, #:tprel_hi12:imm, lsl #12
1233 add t0, t0, #:tprel_lo12_nc:imm
1236 static void
1237 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1238 enum aarch64_symbol_type type)
1240 switch (type)
1242 case SYMBOL_SMALL_ABSOLUTE:
1244 /* In ILP32, the mode of dest can be either SImode or DImode. */
1245 rtx tmp_reg = dest;
1246 machine_mode mode = GET_MODE (dest);
1248 gcc_assert (mode == Pmode || mode == ptr_mode);
1250 if (can_create_pseudo_p ())
1251 tmp_reg = gen_reg_rtx (mode);
1253 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1254 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1255 return;
1258 case SYMBOL_TINY_ABSOLUTE:
1259 emit_insn (gen_rtx_SET (dest, imm));
1260 return;
1262 case SYMBOL_SMALL_GOT_28K:
1264 machine_mode mode = GET_MODE (dest);
1265 rtx gp_rtx = pic_offset_table_rtx;
1266 rtx insn;
1267 rtx mem;
1269 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1270 here before rtl expand. Tree IVOPT will generate rtl pattern to
1271 decide rtx costs, in which case pic_offset_table_rtx is not
1272 initialized. For that case no need to generate the first adrp
1273 instruction as the final cost for global variable access is
1274 one instruction. */
1275 if (gp_rtx != NULL)
 1277 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
 1278 use the page base as the GOT base, the first page may be wasted;
 1279 in the worst scenario there is only 28K of space for the GOT).
 1281 The generated instruction sequence for accessing a global variable is:
 1284 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
 1286 Only one instruction is needed. But we must initialize
 1287 pic_offset_table_rtx properly. We generate an initialization insn for
 1288 every global access, and allow CSE to remove all redundant ones.
 1290 The final instruction sequence will look like the following
 1291 for multiple global variable accesses.
1293 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1295 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1296 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1297 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1298 ... */
1300 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1301 crtl->uses_pic_offset_table = 1;
1302 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1304 if (mode != GET_MODE (gp_rtx))
1305 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1308 if (mode == ptr_mode)
1310 if (mode == DImode)
1311 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1312 else
1313 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1315 mem = XVECEXP (SET_SRC (insn), 0, 0);
1317 else
1319 gcc_assert (mode == Pmode);
1321 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1322 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1325 /* The operand is expected to be MEM. Whenever the related insn
1326 pattern changed, above code which calculate mem should be
1327 updated. */
1328 gcc_assert (GET_CODE (mem) == MEM);
1329 MEM_READONLY_P (mem) = 1;
1330 MEM_NOTRAP_P (mem) = 1;
1331 emit_insn (insn);
1332 return;
1335 case SYMBOL_SMALL_GOT_4G:
1337 /* In ILP32, the mode of dest can be either SImode or DImode,
1338 while the got entry is always of SImode size. The mode of
1339 dest depends on how dest is used: if dest is assigned to a
1340 pointer (e.g. in the memory), it has SImode; it may have
 1341 DImode if dest is dereferenced to access the memory.
1342 This is why we have to handle three different ldr_got_small
1343 patterns here (two patterns for ILP32). */
1345 rtx insn;
1346 rtx mem;
1347 rtx tmp_reg = dest;
1348 machine_mode mode = GET_MODE (dest);
1350 if (can_create_pseudo_p ())
1351 tmp_reg = gen_reg_rtx (mode);
1353 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1354 if (mode == ptr_mode)
1356 if (mode == DImode)
1357 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1358 else
1359 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1361 mem = XVECEXP (SET_SRC (insn), 0, 0);
1363 else
1365 gcc_assert (mode == Pmode);
1367 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1368 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1371 gcc_assert (GET_CODE (mem) == MEM);
1372 MEM_READONLY_P (mem) = 1;
1373 MEM_NOTRAP_P (mem) = 1;
1374 emit_insn (insn);
1375 return;
1378 case SYMBOL_SMALL_TLSGD:
1380 rtx_insn *insns;
1381 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1383 start_sequence ();
1384 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1385 insns = get_insns ();
1386 end_sequence ();
1388 RTL_CONST_CALL_P (insns) = 1;
1389 emit_libcall_block (insns, dest, result, imm);
1390 return;
1393 case SYMBOL_SMALL_TLSDESC:
1395 machine_mode mode = GET_MODE (dest);
1396 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1397 rtx tp;
1399 gcc_assert (mode == Pmode || mode == ptr_mode);
1401 /* In ILP32, the got entry is always of SImode size. Unlike
1402 small GOT, the dest is fixed at reg 0. */
1403 if (TARGET_ILP32)
1404 emit_insn (gen_tlsdesc_small_si (imm));
1405 else
1406 emit_insn (gen_tlsdesc_small_di (imm));
1407 tp = aarch64_load_tp (NULL);
1409 if (mode != Pmode)
1410 tp = gen_lowpart (mode, tp);
1412 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1413 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1414 return;
1417 case SYMBOL_SMALL_TLSIE:
1419 /* In ILP32, the mode of dest can be either SImode or DImode,
1420 while the got entry is always of SImode size. The mode of
1421 dest depends on how dest is used: if dest is assigned to a
1422 pointer (e.g. in the memory), it has SImode; it may have
 1423 DImode if dest is dereferenced to access the memory.
1424 This is why we have to handle three different tlsie_small
1425 patterns here (two patterns for ILP32). */
1426 machine_mode mode = GET_MODE (dest);
1427 rtx tmp_reg = gen_reg_rtx (mode);
1428 rtx tp = aarch64_load_tp (NULL);
1430 if (mode == ptr_mode)
1432 if (mode == DImode)
1433 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1434 else
1436 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1437 tp = gen_lowpart (mode, tp);
1440 else
1442 gcc_assert (mode == Pmode);
1443 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1446 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1447 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1448 return;
1451 case SYMBOL_TLSLE12:
1452 case SYMBOL_TLSLE24:
1453 case SYMBOL_TLSLE32:
1454 case SYMBOL_TLSLE48:
1456 machine_mode mode = GET_MODE (dest);
1457 rtx tp = aarch64_load_tp (NULL);
1459 if (mode != Pmode)
1460 tp = gen_lowpart (mode, tp);
1462 switch (type)
1464 case SYMBOL_TLSLE12:
1465 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1466 (dest, tp, imm));
1467 break;
1468 case SYMBOL_TLSLE24:
1469 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1470 (dest, tp, imm));
1471 break;
1472 case SYMBOL_TLSLE32:
1473 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1474 (dest, imm));
1475 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1476 (dest, dest, tp));
1477 break;
1478 case SYMBOL_TLSLE48:
1479 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1480 (dest, imm));
1481 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1482 (dest, dest, tp));
1483 break;
1484 default:
1485 gcc_unreachable ();
1488 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1489 return;
1492 case SYMBOL_TINY_GOT:
1493 emit_insn (gen_ldr_got_tiny (dest, imm));
1494 return;
1496 case SYMBOL_TINY_TLSIE:
1498 machine_mode mode = GET_MODE (dest);
1499 rtx tp = aarch64_load_tp (NULL);
1501 if (mode == ptr_mode)
1503 if (mode == DImode)
1504 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1505 else
1507 tp = gen_lowpart (mode, tp);
1508 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1511 else
1513 gcc_assert (mode == Pmode);
1514 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1517 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1518 return;
1521 default:
1522 gcc_unreachable ();
1526 /* Emit a move from SRC to DEST. Assume that the move expanders can
1527 handle all moves if !can_create_pseudo_p (). The distinction is
1528 important because, unlike emit_move_insn, the move expanders know
1529 how to force Pmode objects into the constant pool even when the
1530 constant pool address is not itself legitimate. */
1531 static rtx
1532 aarch64_emit_move (rtx dest, rtx src)
1534 return (can_create_pseudo_p ()
1535 ? emit_move_insn (dest, src)
1536 : emit_move_insn_1 (dest, src));
1539 /* Split a 128-bit move operation into two 64-bit move operations,
1540 taking care to handle partial overlap of register to register
1541 copies. Special cases are needed when moving between GP regs and
1542 FP regs. SRC can be a register, constant or memory; DST a register
1543 or memory. If either operand is memory it must not have any side
1544 effects. */
1545 void
1546 aarch64_split_128bit_move (rtx dst, rtx src)
1548 rtx dst_lo, dst_hi;
1549 rtx src_lo, src_hi;
1551 machine_mode mode = GET_MODE (dst);
1553 gcc_assert (mode == TImode || mode == TFmode);
1554 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1555 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1557 if (REG_P (dst) && REG_P (src))
1559 int src_regno = REGNO (src);
1560 int dst_regno = REGNO (dst);
1562 /* Handle FP <-> GP regs. */
1563 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1565 src_lo = gen_lowpart (word_mode, src);
1566 src_hi = gen_highpart (word_mode, src);
1568 if (mode == TImode)
1570 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1571 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1573 else
1575 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1576 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1578 return;
1580 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1582 dst_lo = gen_lowpart (word_mode, dst);
1583 dst_hi = gen_highpart (word_mode, dst);
1585 if (mode == TImode)
1587 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1588 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1590 else
1592 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1593 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1595 return;
1599 dst_lo = gen_lowpart (word_mode, dst);
1600 dst_hi = gen_highpart (word_mode, dst);
1601 src_lo = gen_lowpart (word_mode, src);
1602 src_hi = gen_highpart_mode (word_mode, mode, src);
1604 /* At most one pairing may overlap. */
1605 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1607 aarch64_emit_move (dst_hi, src_hi);
1608 aarch64_emit_move (dst_lo, src_lo);
1610 else
1612 aarch64_emit_move (dst_lo, src_lo);
1613 aarch64_emit_move (dst_hi, src_hi);
1617 bool
1618 aarch64_split_128bit_move_p (rtx dst, rtx src)
1620 return (! REG_P (src)
1621 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1624 /* Split a complex SIMD combine. */
1626 void
1627 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1629 machine_mode src_mode = GET_MODE (src1);
1630 machine_mode dst_mode = GET_MODE (dst);
1632 gcc_assert (VECTOR_MODE_P (dst_mode));
1634 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1636 rtx (*gen) (rtx, rtx, rtx);
1638 switch (src_mode)
1640 case V8QImode:
1641 gen = gen_aarch64_simd_combinev8qi;
1642 break;
1643 case V4HImode:
1644 gen = gen_aarch64_simd_combinev4hi;
1645 break;
1646 case V2SImode:
1647 gen = gen_aarch64_simd_combinev2si;
1648 break;
1649 case V4HFmode:
1650 gen = gen_aarch64_simd_combinev4hf;
1651 break;
1652 case V2SFmode:
1653 gen = gen_aarch64_simd_combinev2sf;
1654 break;
1655 case DImode:
1656 gen = gen_aarch64_simd_combinedi;
1657 break;
1658 case DFmode:
1659 gen = gen_aarch64_simd_combinedf;
1660 break;
1661 default:
1662 gcc_unreachable ();
1665 emit_insn (gen (dst, src1, src2));
1666 return;
1670 /* Split a complex SIMD move. */
1672 void
1673 aarch64_split_simd_move (rtx dst, rtx src)
1675 machine_mode src_mode = GET_MODE (src);
1676 machine_mode dst_mode = GET_MODE (dst);
1678 gcc_assert (VECTOR_MODE_P (dst_mode));
1680 if (REG_P (dst) && REG_P (src))
1682 rtx (*gen) (rtx, rtx);
1684 gcc_assert (VECTOR_MODE_P (src_mode));
1686 switch (src_mode)
1688 case V16QImode:
1689 gen = gen_aarch64_split_simd_movv16qi;
1690 break;
1691 case V8HImode:
1692 gen = gen_aarch64_split_simd_movv8hi;
1693 break;
1694 case V4SImode:
1695 gen = gen_aarch64_split_simd_movv4si;
1696 break;
1697 case V2DImode:
1698 gen = gen_aarch64_split_simd_movv2di;
1699 break;
1700 case V8HFmode:
1701 gen = gen_aarch64_split_simd_movv8hf;
1702 break;
1703 case V4SFmode:
1704 gen = gen_aarch64_split_simd_movv4sf;
1705 break;
1706 case V2DFmode:
1707 gen = gen_aarch64_split_simd_movv2df;
1708 break;
1709 default:
1710 gcc_unreachable ();
1713 emit_insn (gen (dst, src));
1714 return;
1718 bool
1719 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1720 machine_mode ymode, rtx y)
1722 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1723 gcc_assert (r != NULL);
1724 return rtx_equal_p (x, r);
1728 static rtx
1729 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1731 if (can_create_pseudo_p ())
1732 return force_reg (mode, value);
1733 else
1735 x = aarch64_emit_move (x, value);
1736 return x;
1741 static rtx
1742 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1744 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1746 rtx high;
1747 /* Load the full offset into a register. This
1748 might be improvable in the future. */
1749 high = GEN_INT (offset);
1750 offset = 0;
1751 high = aarch64_force_temporary (mode, temp, high);
1752 reg = aarch64_force_temporary (mode, temp,
1753 gen_rtx_PLUS (mode, high, reg));
1755 return plus_constant (mode, reg, offset);
1758 static int
1759 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1760 machine_mode mode)
1762 int i;
1763 unsigned HOST_WIDE_INT val, val2, mask;
1764 int one_match, zero_match;
1765 int num_insns;
1767 val = INTVAL (imm);
1769 if (aarch64_move_imm (val, mode))
1771 if (generate)
1772 emit_insn (gen_rtx_SET (dest, imm));
1773 return 1;
1776 if ((val >> 32) == 0 || mode == SImode)
1778 if (generate)
1780 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1781 if (mode == SImode)
1782 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1783 GEN_INT ((val >> 16) & 0xffff)));
1784 else
1785 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1786 GEN_INT ((val >> 16) & 0xffff)));
1788 return 2;
1791 /* Remaining cases are all for DImode. */
1793 mask = 0xffff;
1794 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1795 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1796 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1797 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1799 if (zero_match != 2 && one_match != 2)
1801 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1802 For a 64-bit bitmask try whether changing 16 bits to all ones or
1803 zeroes creates a valid bitmask. To check any repeated bitmask,
1804 try using 16 bits from the other 32-bit half of val. */
1806 for (i = 0; i < 64; i += 16, mask <<= 16)
1808 val2 = val & ~mask;
1809 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1810 break;
1811 val2 = val | mask;
1812 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1813 break;
1814 val2 = val2 & ~mask;
1815 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1816 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1817 break;
1819 if (i != 64)
1821 if (generate)
1823 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1824 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1825 GEN_INT ((val >> i) & 0xffff)));
1827 return 2;
1831 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1832 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1833 otherwise skip zero bits. */
1835 num_insns = 1;
1836 mask = 0xffff;
1837 val2 = one_match > zero_match ? ~val : val;
1838 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1840 if (generate)
1841 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1842 ? (val | ~(mask << i))
1843 : (val & (mask << i)))));
1844 for (i += 16; i < 64; i += 16)
1846 if ((val2 & (mask << i)) == 0)
1847 continue;
1848 if (generate)
1849 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1850 GEN_INT ((val >> i) & 0xffff)));
1851 num_insns ++;
1854 return num_insns;
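/* As a concrete example of the code above: 0x1234000056780000 has two all-zero
   16-bit chunks, so zero_match == 2 and the bitmask/movk special case is
   skipped; the value is built as a MOVZ of 0x5678 shifted left by 16 followed
   by a MOVK of 0x1234 into the top 16 bits, 2 instructions in total.  */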
1858 void
1859 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1861 machine_mode mode = GET_MODE (dest);
1863 gcc_assert (mode == SImode || mode == DImode);
1865 /* Check on what type of symbol it is. */
1866 if (GET_CODE (imm) == SYMBOL_REF
1867 || GET_CODE (imm) == LABEL_REF
1868 || GET_CODE (imm) == CONST)
1870 rtx mem, base, offset;
1871 enum aarch64_symbol_type sty;
1873 /* If we have (const (plus symbol offset)), separate out the offset
1874 before we start classifying the symbol. */
1875 split_const (imm, &base, &offset);
1877 sty = aarch64_classify_symbol (base, offset);
1878 switch (sty)
1880 case SYMBOL_FORCE_TO_MEM:
1881 if (offset != const0_rtx
1882 && targetm.cannot_force_const_mem (mode, imm))
1884 gcc_assert (can_create_pseudo_p ());
1885 base = aarch64_force_temporary (mode, dest, base);
1886 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1887 aarch64_emit_move (dest, base);
1888 return;
1891 mem = force_const_mem (ptr_mode, imm);
1892 gcc_assert (mem);
1894 /* If we aren't generating PC relative literals, then
1895 we need to expand the literal pool access carefully.
1896 This is something that needs to be done in a number
1897 of places, so could well live as a separate function. */
1898 if (!aarch64_pcrelative_literal_loads)
1900 gcc_assert (can_create_pseudo_p ());
1901 base = gen_reg_rtx (ptr_mode);
1902 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1903 mem = gen_rtx_MEM (ptr_mode, base);
1906 if (mode != ptr_mode)
1907 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1909 emit_insn (gen_rtx_SET (dest, mem));
1911 return;
1913 case SYMBOL_SMALL_TLSGD:
1914 case SYMBOL_SMALL_TLSDESC:
1915 case SYMBOL_SMALL_TLSIE:
1916 case SYMBOL_SMALL_GOT_28K:
1917 case SYMBOL_SMALL_GOT_4G:
1918 case SYMBOL_TINY_GOT:
1919 case SYMBOL_TINY_TLSIE:
1920 if (offset != const0_rtx)
1922 gcc_assert(can_create_pseudo_p ());
1923 base = aarch64_force_temporary (mode, dest, base);
1924 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1925 aarch64_emit_move (dest, base);
1926 return;
1928 /* FALLTHRU */
1930 case SYMBOL_SMALL_ABSOLUTE:
1931 case SYMBOL_TINY_ABSOLUTE:
1932 case SYMBOL_TLSLE12:
1933 case SYMBOL_TLSLE24:
1934 case SYMBOL_TLSLE32:
1935 case SYMBOL_TLSLE48:
1936 aarch64_load_symref_appropriately (dest, imm, sty);
1937 return;
1939 default:
1940 gcc_unreachable ();
1944 if (!CONST_INT_P (imm))
1946 if (GET_CODE (imm) == HIGH)
1947 emit_insn (gen_rtx_SET (dest, imm));
1948 else
1950 rtx mem = force_const_mem (mode, imm);
1951 gcc_assert (mem);
1952 emit_insn (gen_rtx_SET (dest, mem));
1955 return;
1958 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
 1961 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold an
 1962 intermediate value if necessary.
1964 This function is sometimes used to adjust the stack pointer, so we must
1965 ensure that it can never cause transient stack deallocation by writing an
1966 invalid value into REGNUM. */
1968 static void
1969 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
1970 HOST_WIDE_INT delta, bool frame_related_p)
1972 HOST_WIDE_INT mdelta = abs_hwi (delta);
1973 rtx this_rtx = gen_rtx_REG (mode, regnum);
1974 rtx_insn *insn;
1976 /* Do nothing if mdelta is zero. */
1977 if (!mdelta)
1978 return;
 1980 /* We only need a single instruction if the offset fits into an add/sub. */
1981 if (aarch64_uimm12_shift (mdelta))
1983 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
1984 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1985 return;
 1988 /* We need two add/sub instructions, each one performing part of the
 1989 calculation. Don't do this if the addend can be loaded into a register with
 1990 a single instruction; in that case we prefer a move to a scratch register
 1991 followed by an addition. */
1992 if (mdelta < 0x1000000 && !aarch64_move_imm (delta, mode))
1994 HOST_WIDE_INT low_off = mdelta & 0xfff;
1996 low_off = delta < 0 ? -low_off : low_off;
1997 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
1998 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1999 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2000 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2001 return;
2004 /* Otherwise use generic function to handle all other situations. */
2005 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2006 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode);
2007 insn = emit_insn (gen_add2_insn (this_rtx, scratch_rtx));
2008 if (frame_related_p)
2010 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2011 rtx adj = plus_constant (mode, this_rtx, delta);
2012 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
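/* For instance, a delta of 0x100800 is neither a valid add/sub immediate nor
   a single-instruction move immediate, but it is below 0x1000000, so the code
   above splits it into two additions to the destination register: #0x800
   first, then #0x100000 (0x100 shifted left by 12).  */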
2016 static bool
2017 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2018 tree exp ATTRIBUTE_UNUSED)
2020 /* Currently, always true. */
2021 return true;
2024 /* Implement TARGET_PASS_BY_REFERENCE. */
2026 static bool
2027 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2028 machine_mode mode,
2029 const_tree type,
2030 bool named ATTRIBUTE_UNUSED)
2032 HOST_WIDE_INT size;
2033 machine_mode dummymode;
2034 int nregs;
2036 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2037 size = (mode == BLKmode && type)
2038 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2040 /* Aggregates are passed by reference based on their size. */
2041 if (type && AGGREGATE_TYPE_P (type))
2043 size = int_size_in_bytes (type);
 2046 /* Variable sized arguments are always passed by reference. */
2047 if (size < 0)
2048 return true;
2050 /* Can this be a candidate to be passed in fp/simd register(s)? */
2051 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2052 &dummymode, &nregs,
2053 NULL))
2054 return false;
2056 /* Arguments which are variable sized or larger than 2 registers are
2057 passed by reference unless they are a homogeneous floating-point
2058 aggregate. */
2059 return size > 2 * UNITS_PER_WORD;
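/* Illustrative outcomes of the rules above (LP64, so UNITS_PER_WORD is 8):

     int64_t                        ->  passed by value (one register)
     struct { int64_t a, b; }       ->  passed by value (16 bytes)
     struct { int64_t a, b, c; }    ->  passed by reference (24 bytes)
     struct { double a, b, c, d; }  ->  passed by value in fp/simd registers,
                                        despite being 32 bytes, because it
                                        is an HFA.  */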
2062 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2063 static bool
2064 aarch64_return_in_msb (const_tree valtype)
2066 machine_mode dummy_mode;
2067 int dummy_int;
2069 /* Never happens in little-endian mode. */
2070 if (!BYTES_BIG_ENDIAN)
2071 return false;
2073 /* Only composite types smaller than or equal to 16 bytes can
2074 be potentially returned in registers. */
2075 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2076 || int_size_in_bytes (valtype) <= 0
2077 || int_size_in_bytes (valtype) > 16)
2078 return false;
2080 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2081 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2082 is always passed/returned in the least significant bits of fp/simd
2083 register(s). */
2084 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2085 &dummy_mode, &dummy_int, NULL))
2086 return false;
2088 return true;
2091 /* Implement TARGET_FUNCTION_VALUE.
2092 Define how to find the value returned by a function. */
2094 static rtx
2095 aarch64_function_value (const_tree type, const_tree func,
2096 bool outgoing ATTRIBUTE_UNUSED)
2098 machine_mode mode;
2099 int unsignedp;
2100 int count;
2101 machine_mode ag_mode;
2103 mode = TYPE_MODE (type);
2104 if (INTEGRAL_TYPE_P (type))
2105 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2107 if (aarch64_return_in_msb (type))
2109 HOST_WIDE_INT size = int_size_in_bytes (type);
2111 if (size % UNITS_PER_WORD != 0)
2113 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2114 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2118 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2119 &ag_mode, &count, NULL))
2121 if (!aarch64_composite_type_p (type, mode))
2123 gcc_assert (count == 1 && mode == ag_mode);
2124 return gen_rtx_REG (mode, V0_REGNUM);
2126 else
2128 int i;
2129 rtx par;
2131 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2132 for (i = 0; i < count; i++)
2134 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2135 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2136 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2137 XVECEXP (par, 0, i) = tmp;
2139 return par;
2142 else
2143 return gen_rtx_REG (mode, R0_REGNUM);
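/* For example (illustrative), a return type of struct { float x, y; } is
   an HFA with two members, so the code above builds

     (parallel [(expr_list (reg:SF v0) (const_int 0))
                (expr_list (reg:SF v1) (const_int 4))])

   whereas a plain int is returned in w0, i.e. gen_rtx_REG (SImode, R0_REGNUM). */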
2146 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2147 Return true if REGNO is the number of a hard register in which the values
2148 of called function may come back. */
2150 static bool
2151 aarch64_function_value_regno_p (const unsigned int regno)
2153 /* A maximum of 16 bytes can be returned in the general registers. Examples
2154 of 16-byte return values are: 128-bit integers and 16-byte small
2155 structures (excluding homogeneous floating-point aggregates). */
2156 if (regno == R0_REGNUM || regno == R1_REGNUM)
2157 return true;
2159 /* Up to four fp/simd registers can return a function value, e.g. a
2160 homogeneous floating-point aggregate having four members. */
2161 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2162 return TARGET_FLOAT;
2164 return false;
2167 /* Implement TARGET_RETURN_IN_MEMORY.
2169 If the type T of the result of a function is such that
2170 void func (T arg)
2171 would require that arg be passed as a value in a register (or set of
2172 registers) according to the parameter passing rules, then the result
2173 is returned in the same registers as would be used for such an
2174 argument. */
2176 static bool
2177 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2179 HOST_WIDE_INT size;
2180 machine_mode ag_mode;
2181 int count;
2183 if (!AGGREGATE_TYPE_P (type)
2184 && TREE_CODE (type) != COMPLEX_TYPE
2185 && TREE_CODE (type) != VECTOR_TYPE)
2186 /* Simple scalar types are always returned in registers. */
2187 return false;
2189 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2190 type,
2191 &ag_mode,
2192 &count,
2193 NULL))
2194 return false;
2196 /* Types larger than 2 registers are returned in memory. */
2197 size = int_size_in_bytes (type);
2198 return (size < 0 || size > 2 * UNITS_PER_WORD);
2201 static bool
2202 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2203 const_tree type, int *nregs)
2205 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2206 return aarch64_vfp_is_call_or_return_candidate (mode,
2207 type,
2208 &pcum->aapcs_vfp_rmode,
2209 nregs,
2210 NULL);
2213 /* Given MODE and TYPE of a function argument, return the alignment in
2214 bits. The idea is to suppress any stronger alignment requested by
2215 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2216 This is a helper function for local use only. */
2218 static unsigned int
2219 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2221 if (!type)
2222 return GET_MODE_ALIGNMENT (mode);
2223 if (integer_zerop (TYPE_SIZE (type)))
2224 return 0;
2226 gcc_assert (TYPE_MODE (type) == mode);
2228 if (!AGGREGATE_TYPE_P (type))
2229 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2231 if (TREE_CODE (type) == ARRAY_TYPE)
2232 return TYPE_ALIGN (TREE_TYPE (type));
2234 unsigned int alignment = 0;
2236 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2237 alignment = std::max (alignment, DECL_ALIGN (field));
2239 return alignment;
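/* Illustrative natural alignments computed above:

     int                             ->  32 bits
     struct { long long x; int y; }  ->  64 bits  (maximum of the fields)
     __int128                        -> 128 bits  (triggers rule C.8 below,
                                                   which rounds the NGRN up
                                                   to an even number)  */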
2242 /* Layout a function argument according to the AAPCS64 rules. The rule
2243 numbers refer to the rule numbers in the AAPCS64. */
2245 static void
2246 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2247 const_tree type,
2248 bool named ATTRIBUTE_UNUSED)
2250 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2251 int ncrn, nvrn, nregs;
2252 bool allocate_ncrn, allocate_nvrn;
2253 HOST_WIDE_INT size;
2255 /* We need to do this once per argument. */
2256 if (pcum->aapcs_arg_processed)
2257 return;
2259 pcum->aapcs_arg_processed = true;
2261 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
2262 size
2263 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2264 UNITS_PER_WORD);
2266 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2267 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2268 mode,
2269 type,
2270 &nregs);
2272 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2273 The following code thus handles passing by SIMD/FP registers first. */
2275 nvrn = pcum->aapcs_nvrn;
2277 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2278 and homogeneous short-vector aggregates (HVA). */
2279 if (allocate_nvrn)
2281 if (!TARGET_FLOAT)
2282 aarch64_err_no_fpadvsimd (mode, "argument");
2284 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2286 pcum->aapcs_nextnvrn = nvrn + nregs;
2287 if (!aarch64_composite_type_p (type, mode))
2289 gcc_assert (nregs == 1);
2290 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2292 else
2294 rtx par;
2295 int i;
2296 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2297 for (i = 0; i < nregs; i++)
2299 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2300 V0_REGNUM + nvrn + i);
2301 tmp = gen_rtx_EXPR_LIST
2302 (VOIDmode, tmp,
2303 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2304 XVECEXP (par, 0, i) = tmp;
2306 pcum->aapcs_reg = par;
2308 return;
2310 else
2312 /* C.3 NSRN is set to 8. */
2313 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2314 goto on_stack;
2318 ncrn = pcum->aapcs_ncrn;
2319 nregs = size / UNITS_PER_WORD;
2321 /* C6 - C9, though the sign and zero extension semantics are
2322 handled elsewhere. This is the case where the argument fits
2323 entirely in general registers. */
2324 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2326 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2328 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2330 /* C.8 if the argument has an alignment of 16 then the NGRN is
2331 rounded up to the next even number. */
2332 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2334 ++ncrn;
2335 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2337 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2338 A reg is still generated for it, but the caller should be smart
2339 enough not to use it. */
2340 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2342 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2344 else
2346 rtx par;
2347 int i;
2349 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2350 for (i = 0; i < nregs; i++)
2352 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2353 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2354 GEN_INT (i * UNITS_PER_WORD));
2355 XVECEXP (par, 0, i) = tmp;
2357 pcum->aapcs_reg = par;
2360 pcum->aapcs_nextncrn = ncrn + nregs;
2361 return;
2364 /* C.11 */
2365 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2367 /* The argument is passed on the stack; record the needed number of words for
2368 this argument and align the total size if necessary. */
2369 on_stack:
2370 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2371 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2372 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2373 16 / UNITS_PER_WORD);
2374 return;
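/* A worked example of the allocation above (illustrative; LP64, nothing
   allocated yet) for f (int a, __int128 b, double c, struct { float x, y,
   z, w; } d):

     a -> w0                      NGRN 0 -> 1
     b -> x2/x3                   16-byte alignment, so C.8 rounds NGRN up
                                  from 1 to 2 before allocating
     c -> v0                      fp/simd candidate, NSRN 0 -> 1
     d -> v1..v4 (as a PARALLEL)  HFA with four float members.  */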
2377 /* Implement TARGET_FUNCTION_ARG. */
2379 static rtx
2380 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2381 const_tree type, bool named)
2383 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2384 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2386 if (mode == VOIDmode)
2387 return NULL_RTX;
2389 aarch64_layout_arg (pcum_v, mode, type, named);
2390 return pcum->aapcs_reg;
2393 void
2394 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2395 const_tree fntype ATTRIBUTE_UNUSED,
2396 rtx libname ATTRIBUTE_UNUSED,
2397 const_tree fndecl ATTRIBUTE_UNUSED,
2398 unsigned n_named ATTRIBUTE_UNUSED)
2400 pcum->aapcs_ncrn = 0;
2401 pcum->aapcs_nvrn = 0;
2402 pcum->aapcs_nextncrn = 0;
2403 pcum->aapcs_nextnvrn = 0;
2404 pcum->pcs_variant = ARM_PCS_AAPCS64;
2405 pcum->aapcs_reg = NULL_RTX;
2406 pcum->aapcs_arg_processed = false;
2407 pcum->aapcs_stack_words = 0;
2408 pcum->aapcs_stack_size = 0;
2410 if (!TARGET_FLOAT
2411 && fndecl && TREE_PUBLIC (fndecl)
2412 && fntype && fntype != error_mark_node)
2414 const_tree type = TREE_TYPE (fntype);
2415 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2416 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2417 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2418 &mode, &nregs, NULL))
2419 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2421 return;
2424 static void
2425 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2426 machine_mode mode,
2427 const_tree type,
2428 bool named)
2430 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2431 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2433 aarch64_layout_arg (pcum_v, mode, type, named);
2434 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2435 != (pcum->aapcs_stack_words != 0));
2436 pcum->aapcs_arg_processed = false;
2437 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2438 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2439 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2440 pcum->aapcs_stack_words = 0;
2441 pcum->aapcs_reg = NULL_RTX;
2445 bool
2446 aarch64_function_arg_regno_p (unsigned regno)
2448 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2449 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2452 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2453 PARM_BOUNDARY bits of alignment, but will be given anything up
2454 to STACK_BOUNDARY bits if the type requires it. This makes sure
2455 that both before and after the layout of each argument, the Next
2456 Stacked Argument Address (NSAA) will have a minimum alignment of
2457 8 bytes. */
2459 static unsigned int
2460 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2462 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2464 if (alignment < PARM_BOUNDARY)
2465 alignment = PARM_BOUNDARY;
2466 if (alignment > STACK_BOUNDARY)
2467 alignment = STACK_BOUNDARY;
2468 return alignment;
2471 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2473 Return true if an argument passed on the stack should be padded upwards,
2474 i.e. if the least-significant byte of the stack slot has useful data.
2476 Small aggregate types are placed at the lowest memory address.
2478 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2480 bool
2481 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2483 /* On little-endian targets, the least significant byte of every stack
2484 argument is passed at the lowest byte address of the stack slot. */
2485 if (!BYTES_BIG_ENDIAN)
2486 return true;
2488 /* Otherwise, integral, floating-point and pointer types are padded downward:
2489 the least significant byte of a stack argument is passed at the highest
2490 byte address of the stack slot. */
2491 if (type
2492 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2493 || POINTER_TYPE_P (type))
2494 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2495 return false;
2497 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2498 return true;
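/* For example (illustrative), on a big-endian target a 3-byte struct
   passed on the stack is padded upward (its bytes occupy the lowest
   addresses of the 8-byte slot), while a short is padded downward and
   occupies the highest addresses of its slot.  */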
2501 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2503 It specifies padding for the last (and possibly the only)
2504 element of a block move between registers and memory. Assuming
2505 the block is in memory, padding upward means that the last
2506 element is padded after its most significant byte, while in
2507 downward padding the last element is padded on its least
2508 significant byte side.
2510 Small aggregates and small complex types are always padded
2511 upwards.
2513 We don't need to worry about homogeneous floating-point or
2514 short-vector aggregates; their move is not affected by the
2515 padding direction determined here. Regardless of endianness,
2516 each element of such an aggregate is put in the least
2517 significant bits of a fp/simd register.
2519 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2520 register has useful data, and return the opposite if the most
2521 significant byte does. */
2523 bool
2524 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2525 bool first ATTRIBUTE_UNUSED)
2528 /* Small composite types are always padded upward. */
2529 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2531 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2532 : GET_MODE_SIZE (mode));
2533 if (size < 2 * UNITS_PER_WORD)
2534 return true;
2537 /* Otherwise, use the default padding. */
2538 return !BYTES_BIG_ENDIAN;
2541 static machine_mode
2542 aarch64_libgcc_cmp_return_mode (void)
2544 return SImode;
2547 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2549 /* We use the 12-bit shifted immediate arithmetic instructions so values
2550 must be a multiple of (1 << 12), i.e. 4096. */
2551 #define ARITH_FACTOR 4096
2553 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2554 #error Cannot use simple address calculation for stack probing
2555 #endif
2557 /* The pair of scratch registers used for stack probing. */
2558 #define PROBE_STACK_FIRST_REG 9
2559 #define PROBE_STACK_SECOND_REG 10
2561 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2562 inclusive. These are offsets from the current stack pointer. */
2564 static void
2565 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2567 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2569 /* See the same assertion on PROBE_INTERVAL above. */
2570 gcc_assert ((first % ARITH_FACTOR) == 0);
2572 /* See if we have a constant small number of probes to generate. If so,
2573 that's the easy case. */
2574 if (size <= PROBE_INTERVAL)
2576 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2578 emit_set_insn (reg1,
2579 plus_constant (ptr_mode,
2580 stack_pointer_rtx, -(first + base)));
2581 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2584 /* The run-time loop is made up of 8 insns in the generic case while the
2585 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2586 else if (size <= 4 * PROBE_INTERVAL)
2588 HOST_WIDE_INT i, rem;
2590 emit_set_insn (reg1,
2591 plus_constant (ptr_mode,
2592 stack_pointer_rtx,
2593 -(first + PROBE_INTERVAL)));
2594 emit_stack_probe (reg1);
2596 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2597 it exceeds SIZE. If only two probes are needed, this will not
2598 generate any code. Then probe at FIRST + SIZE. */
2599 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2601 emit_set_insn (reg1,
2602 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2603 emit_stack_probe (reg1);
2606 rem = size - (i - PROBE_INTERVAL);
2607 if (rem > 256)
2609 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2611 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2612 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2614 else
2615 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2618 /* Otherwise, do the same as above, but in a loop. Note that we must be
2619 extra careful with variables wrapping around because we might be at
2620 the very top (or the very bottom) of the address space and we have
2621 to be able to handle this case properly; in particular, we use an
2622 equality test for the loop condition. */
2623 else
2625 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2627 /* Step 1: round SIZE to the previous multiple of the interval. */
2629 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2632 /* Step 2: compute initial and final value of the loop counter. */
2634 /* TEST_ADDR = SP + FIRST. */
2635 emit_set_insn (reg1,
2636 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2638 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2639 emit_set_insn (reg2,
2640 plus_constant (ptr_mode, stack_pointer_rtx,
2641 -(first + rounded_size)));
2644 /* Step 3: the loop
2648 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2649 probe at TEST_ADDR
2651 while (TEST_ADDR != LAST_ADDR)
2653 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2654 until it is equal to ROUNDED_SIZE. */
2656 if (ptr_mode == DImode)
2657 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2658 else
2659 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2662 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2663 that SIZE is equal to ROUNDED_SIZE. */
2665 if (size != rounded_size)
2667 HOST_WIDE_INT rem = size - rounded_size;
2669 if (rem > 256)
2671 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2673 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2674 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2676 else
2677 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2681 /* Make sure nothing is scheduled before we are done. */
2682 emit_insn (gen_blockage ());
2685 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2686 absolute addresses. */
2688 const char *
2689 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2691 static int labelno = 0;
2692 char loop_lab[32];
2693 rtx xops[2];
2695 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2697 /* Loop. */
2698 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2700 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2701 xops[0] = reg1;
2702 xops[1] = GEN_INT (PROBE_INTERVAL);
2703 output_asm_insn ("sub\t%0, %0, %1", xops);
2705 /* Probe at TEST_ADDR. */
2706 output_asm_insn ("str\txzr, [%0]", xops);
2708 /* Test if TEST_ADDR == LAST_ADDR. */
2709 xops[1] = reg2;
2710 output_asm_insn ("cmp\t%0, %1", xops);
2712 /* Branch. */
2713 fputs ("\tb.ne\t", asm_out_file);
2714 assemble_name_raw (asm_out_file, loop_lab);
2715 fputc ('\n', asm_out_file);
2717 return "";
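/* The loop above produces assembly along these lines (illustrative only;
   PROBE_INTERVAL == 4096 and hard registers 9/10 are the probing scratch
   registers chosen above):

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0  */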
2720 static bool
2721 aarch64_frame_pointer_required (void)
2723 /* In aarch64_override_options_after_change
2724 flag_omit_leaf_frame_pointer turns off the frame pointer by
2725 default. Turn it back on now if we do not have a leaf
2726 function. */
2727 if (flag_omit_leaf_frame_pointer
2728 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2729 return true;
2731 return false;
2734 /* Mark the registers that need to be saved by the callee and calculate
2735 the size of the callee-saved registers area and frame record (both FP
2736 and LR may be omitted). */
2737 static void
2738 aarch64_layout_frame (void)
2740 HOST_WIDE_INT offset = 0;
2741 int regno;
2743 if (reload_completed && cfun->machine->frame.laid_out)
2744 return;
2746 #define SLOT_NOT_REQUIRED (-2)
2747 #define SLOT_REQUIRED (-1)
2749 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2750 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2752 /* First mark all the registers that really need to be saved... */
2753 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2754 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2756 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2757 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2759 /* ... that includes the eh data registers (if needed)... */
2760 if (crtl->calls_eh_return)
2761 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2762 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2763 = SLOT_REQUIRED;
2765 /* ... and any callee saved register that dataflow says is live. */
2766 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2767 if (df_regs_ever_live_p (regno)
2768 && (regno == R30_REGNUM
2769 || !call_used_regs[regno]))
2770 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2772 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2773 if (df_regs_ever_live_p (regno)
2774 && !call_used_regs[regno])
2775 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2777 if (frame_pointer_needed)
2779 /* FP and LR are placed in the linkage record. */
2780 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2781 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2782 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2783 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2784 offset += 2 * UNITS_PER_WORD;
2787 /* Now assign stack slots for them. */
2788 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2789 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2791 cfun->machine->frame.reg_offset[regno] = offset;
2792 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2793 cfun->machine->frame.wb_candidate1 = regno;
2794 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2795 cfun->machine->frame.wb_candidate2 = regno;
2796 offset += UNITS_PER_WORD;
2799 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2800 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2802 cfun->machine->frame.reg_offset[regno] = offset;
2803 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2804 cfun->machine->frame.wb_candidate1 = regno;
2805 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2806 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2807 cfun->machine->frame.wb_candidate2 = regno;
2808 offset += UNITS_PER_WORD;
2811 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2813 cfun->machine->frame.saved_regs_size = offset;
2815 HOST_WIDE_INT varargs_and_saved_regs_size
2816 = offset + cfun->machine->frame.saved_varargs_size;
2818 cfun->machine->frame.hard_fp_offset
2819 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2820 STACK_BOUNDARY / BITS_PER_UNIT);
2822 cfun->machine->frame.frame_size
2823 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2824 + crtl->outgoing_args_size,
2825 STACK_BOUNDARY / BITS_PER_UNIT);
2827 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2829 cfun->machine->frame.initial_adjust = 0;
2830 cfun->machine->frame.final_adjust = 0;
2831 cfun->machine->frame.callee_adjust = 0;
2832 cfun->machine->frame.callee_offset = 0;
2834 HOST_WIDE_INT max_push_offset = 0;
2835 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2836 max_push_offset = 512;
2837 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2838 max_push_offset = 256;
2840 if (cfun->machine->frame.frame_size < max_push_offset
2841 && crtl->outgoing_args_size == 0)
2843 /* Simple, small frame with no outgoing arguments:
2844 stp reg1, reg2, [sp, -frame_size]!
2845 stp reg3, reg4, [sp, 16] */
2846 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2848 else if ((crtl->outgoing_args_size
2849 + cfun->machine->frame.saved_regs_size < 512)
2850 && !(cfun->calls_alloca
2851 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2853 /* Frame with small outgoing arguments:
2854 sub sp, sp, frame_size
2855 stp reg1, reg2, [sp, outgoing_args_size]
2856 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2857 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2858 cfun->machine->frame.callee_offset
2859 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2861 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2863 /* Frame with large outgoing arguments but a small local area:
2864 stp reg1, reg2, [sp, -hard_fp_offset]!
2865 stp reg3, reg4, [sp, 16]
2866 sub sp, sp, outgoing_args_size */
2867 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2868 cfun->machine->frame.final_adjust
2869 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2871 else if (!frame_pointer_needed
2872 && varargs_and_saved_regs_size < max_push_offset)
2874 /* Frame with large local area and outgoing arguments (this pushes the
2875 callee-saves first, followed by the locals and outgoing area):
2876 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2877 stp reg3, reg4, [sp, 16]
2878 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2879 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2880 cfun->machine->frame.final_adjust
2881 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2882 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2883 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2885 else
2887 /* Frame with large local area and outgoing arguments using frame pointer:
2888 sub sp, sp, hard_fp_offset
2889 stp x29, x30, [sp, 0]
2890 add x29, sp, 0
2891 stp reg3, reg4, [sp, 16]
2892 sub sp, sp, outgoing_args_size */
2893 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2894 cfun->machine->frame.final_adjust
2895 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2898 cfun->machine->frame.laid_out = true;
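/* A worked example of the layout above (illustrative): a function that
   needs a frame pointer, saves only x29/x30, has 16 bytes of locals and
   no outgoing arguments gets

     saved_regs_size = 16, hard_fp_offset = 32, frame_size = 32

   and, since frame_size < max_push_offset, falls into the first case:

     callee_adjust = 32     ("stp x29, x30, [sp, -32]!")
     initial_adjust = final_adjust = callee_offset = 0.  */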
2901 static bool
2902 aarch64_register_saved_on_entry (int regno)
2904 return cfun->machine->frame.reg_offset[regno] >= 0;
2907 static unsigned
2908 aarch64_next_callee_save (unsigned regno, unsigned limit)
2910 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2911 regno ++;
2912 return regno;
2915 static void
2916 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2917 HOST_WIDE_INT adjustment)
2919 rtx base_rtx = stack_pointer_rtx;
2920 rtx insn, reg, mem;
2922 reg = gen_rtx_REG (mode, regno);
2923 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2924 plus_constant (Pmode, base_rtx, -adjustment));
2925 mem = gen_rtx_MEM (mode, mem);
2927 insn = emit_move_insn (mem, reg);
2928 RTX_FRAME_RELATED_P (insn) = 1;
2931 static rtx
2932 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2933 HOST_WIDE_INT adjustment)
2935 switch (mode)
2937 case DImode:
2938 return gen_storewb_pairdi_di (base, base, reg, reg2,
2939 GEN_INT (-adjustment),
2940 GEN_INT (UNITS_PER_WORD - adjustment));
2941 case DFmode:
2942 return gen_storewb_pairdf_di (base, base, reg, reg2,
2943 GEN_INT (-adjustment),
2944 GEN_INT (UNITS_PER_WORD - adjustment));
2945 default:
2946 gcc_unreachable ();
2950 static void
2951 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
2953 rtx_insn *insn;
2954 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
2956 if (regno2 == INVALID_REGNUM)
2957 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
2959 rtx reg1 = gen_rtx_REG (mode, regno1);
2960 rtx reg2 = gen_rtx_REG (mode, regno2);
2962 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2963 reg2, adjustment));
2964 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2965 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2966 RTX_FRAME_RELATED_P (insn) = 1;
2969 static rtx
2970 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2971 HOST_WIDE_INT adjustment)
2973 switch (mode)
2975 case DImode:
2976 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2977 GEN_INT (UNITS_PER_WORD));
2978 case DFmode:
2979 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2980 GEN_INT (UNITS_PER_WORD));
2981 default:
2982 gcc_unreachable ();
2986 static void
2987 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
2988 rtx *cfi_ops)
2990 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
2991 rtx reg1 = gen_rtx_REG (mode, regno1);
2993 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
2995 if (regno2 == INVALID_REGNUM)
2997 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
2998 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2999 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3001 else
3003 rtx reg2 = gen_rtx_REG (mode, regno2);
3004 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3005 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3006 reg2, adjustment));
3010 static rtx
3011 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3012 rtx reg2)
3014 switch (mode)
3016 case DImode:
3017 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3019 case DFmode:
3020 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3022 default:
3023 gcc_unreachable ();
3027 static rtx
3028 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3029 rtx mem2)
3031 switch (mode)
3033 case DImode:
3034 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3036 case DFmode:
3037 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3039 default:
3040 gcc_unreachable ();
3045 static void
3046 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3047 unsigned start, unsigned limit, bool skip_wb)
3049 rtx_insn *insn;
3050 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3051 ? gen_frame_mem : gen_rtx_MEM);
3052 unsigned regno;
3053 unsigned regno2;
3055 for (regno = aarch64_next_callee_save (start, limit);
3056 regno <= limit;
3057 regno = aarch64_next_callee_save (regno + 1, limit))
3059 rtx reg, mem;
3060 HOST_WIDE_INT offset;
3062 if (skip_wb
3063 && (regno == cfun->machine->frame.wb_candidate1
3064 || regno == cfun->machine->frame.wb_candidate2))
3065 continue;
3067 reg = gen_rtx_REG (mode, regno);
3068 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3069 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3070 offset));
3072 regno2 = aarch64_next_callee_save (regno + 1, limit);
3074 if (regno2 <= limit
3075 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3076 == cfun->machine->frame.reg_offset[regno2]))
3079 rtx reg2 = gen_rtx_REG (mode, regno2);
3080 rtx mem2;
3082 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3083 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3084 offset));
3085 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3086 reg2));
3088 /* The first part of a frame-related parallel insn is
3089 always assumed to be relevant to the frame
3090 calculations; subsequent parts are only
3091 frame-related if explicitly marked. */
3092 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3093 regno = regno2;
3095 else
3096 insn = emit_move_insn (mem, reg);
3098 RTX_FRAME_RELATED_P (insn) = 1;
3102 static void
3103 aarch64_restore_callee_saves (machine_mode mode,
3104 HOST_WIDE_INT start_offset, unsigned start,
3105 unsigned limit, bool skip_wb, rtx *cfi_ops)
3107 rtx base_rtx = stack_pointer_rtx;
3108 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3109 ? gen_frame_mem : gen_rtx_MEM);
3110 unsigned regno;
3111 unsigned regno2;
3112 HOST_WIDE_INT offset;
3114 for (regno = aarch64_next_callee_save (start, limit);
3115 regno <= limit;
3116 regno = aarch64_next_callee_save (regno + 1, limit))
3118 rtx reg, mem;
3120 if (skip_wb
3121 && (regno == cfun->machine->frame.wb_candidate1
3122 || regno == cfun->machine->frame.wb_candidate2))
3123 continue;
3125 reg = gen_rtx_REG (mode, regno);
3126 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3127 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3129 regno2 = aarch64_next_callee_save (regno + 1, limit);
3131 if (regno2 <= limit
3132 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3133 == cfun->machine->frame.reg_offset[regno2]))
3135 rtx reg2 = gen_rtx_REG (mode, regno2);
3136 rtx mem2;
3138 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3139 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3140 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3142 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3143 regno = regno2;
3145 else
3146 emit_move_insn (reg, mem);
3147 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3151 /* AArch64 stack frames generated by this compiler look like:
3153 +-------------------------------+
3155 | incoming stack arguments |
3157 +-------------------------------+
3158 | | <-- incoming stack pointer (aligned)
3159 | callee-allocated save area |
3160 | for register varargs |
3162 +-------------------------------+
3163 | local variables | <-- frame_pointer_rtx
3165 +-------------------------------+
3166 | padding0 | \
3167 +-------------------------------+ |
3168 | callee-saved registers | | frame.saved_regs_size
3169 +-------------------------------+ |
3170 | LR' | |
3171 +-------------------------------+ |
3172 | FP' | / <- hard_frame_pointer_rtx (aligned)
3173 +-------------------------------+
3174 | dynamic allocation |
3175 +-------------------------------+
3176 | padding |
3177 +-------------------------------+
3178 | outgoing stack arguments | <-- arg_pointer
3180 +-------------------------------+
3181 | | <-- stack_pointer_rtx (aligned)
3183 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3184 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3185 unchanged. */
3187 /* Generate the prologue instructions for entry into a function.
3188 Establish the stack frame by decreasing the stack pointer with a
3189 properly calculated size and, if necessary, create a frame record
3190 filled with the values of LR and previous frame pointer. The
3191 current FP is also set up if it is in use. */
3193 void
3194 aarch64_expand_prologue (void)
3196 aarch64_layout_frame ();
3198 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3199 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3200 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3201 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3202 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3203 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3204 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3205 rtx_insn *insn;
3207 if (flag_stack_usage_info)
3208 current_function_static_stack_size = frame_size;
3210 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3212 if (crtl->is_leaf && !cfun->calls_alloca)
3214 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3215 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3216 frame_size - STACK_CHECK_PROTECT);
3218 else if (frame_size > 0)
3219 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3222 aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -initial_adjust, true);
3224 if (callee_adjust != 0)
3225 aarch64_push_regs (reg1, reg2, callee_adjust);
3227 if (frame_pointer_needed)
3229 if (callee_adjust == 0)
3230 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3231 R30_REGNUM, false);
3232 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3233 stack_pointer_rtx,
3234 GEN_INT (callee_offset)));
3235 RTX_FRAME_RELATED_P (insn) = 1;
3236 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3239 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3240 callee_adjust != 0 || frame_pointer_needed);
3241 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3242 callee_adjust != 0 || frame_pointer_needed);
3243 aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, -final_adjust,
3244 !frame_pointer_needed);
3247 /* Return TRUE if we can use a simple_return insn.
3249 This function checks whether the callee saved stack is empty, which
3250 means no restore actions are needed. The pro_and_epilogue pass will use
3251 this to check whether the shrink-wrapping optimization is feasible. */
3253 bool
3254 aarch64_use_return_insn_p (void)
3256 if (!reload_completed)
3257 return false;
3259 if (crtl->profile)
3260 return false;
3262 aarch64_layout_frame ();
3264 return cfun->machine->frame.frame_size == 0;
3267 /* Generate the epilogue instructions for returning from a function.
3268 This is almost exactly the reverse of the prologue sequence, except
3269 that we need to insert barriers to avoid scheduling loads that read
3270 from a deallocated stack, and we optimize the unwind records by
3271 emitting them all together if possible. */
3272 void
3273 aarch64_expand_epilogue (bool for_sibcall)
3275 aarch64_layout_frame ();
3277 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3278 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3279 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3280 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3281 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3282 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3283 rtx cfi_ops = NULL;
3284 rtx_insn *insn;
3286 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3287 bool need_barrier_p = (get_frame_size ()
3288 + cfun->machine->frame.saved_varargs_size) != 0;
3290 /* Emit a barrier to prevent loads from a deallocated stack. */
3291 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca)
3293 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3294 need_barrier_p = false;
3297 /* Restore the stack pointer from the frame pointer if it may not
3298 be the same as the stack pointer. */
3299 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3301 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3302 hard_frame_pointer_rtx,
3303 GEN_INT (-callee_offset)));
3304 /* If writeback is used when restoring callee-saves, the CFA
3305 is restored on the instruction doing the writeback. */
3306 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3308 else
3309 aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, final_adjust, true);
3311 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3312 callee_adjust != 0, &cfi_ops);
3313 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3314 callee_adjust != 0, &cfi_ops);
3316 if (need_barrier_p)
3317 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3319 if (callee_adjust != 0)
3320 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3322 if (callee_adjust != 0 || initial_adjust > 65536)
3324 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3325 insn = get_last_insn ();
3326 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3327 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3328 RTX_FRAME_RELATED_P (insn) = 1;
3329 cfi_ops = NULL;
3332 aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, initial_adjust, true);
3334 if (cfi_ops)
3336 /* Emit delayed restores and reset the CFA to be SP. */
3337 insn = get_last_insn ();
3338 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3339 REG_NOTES (insn) = cfi_ops;
3340 RTX_FRAME_RELATED_P (insn) = 1;
3343 /* Stack adjustment for exception handler. */
3344 if (crtl->calls_eh_return)
3346 /* We need to unwind the stack by the offset computed by
3347 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3348 to be SP; letting the CFA move during this adjustment
3349 is just as correct as retaining the CFA from the body
3350 of the function. Therefore, do nothing special. */
3351 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3354 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3355 if (!for_sibcall)
3356 emit_jump_insn (ret_rtx);
3359 /* Return the place to copy the exception unwinding return address to.
3360 This will probably be a stack slot, but could (in theory) be the
3361 return register. */
3363 aarch64_final_eh_return_addr (void)
3365 HOST_WIDE_INT fp_offset;
3367 aarch64_layout_frame ();
3369 fp_offset = cfun->machine->frame.frame_size
3370 - cfun->machine->frame.hard_fp_offset;
3372 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
3373 return gen_rtx_REG (DImode, LR_REGNUM);
3375 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
3376 result in a store to save LR introduced by builtin_eh_return () being
3377 incorrectly deleted because the alias is not detected.
3378 So in the calculation of the address to copy the exception unwinding
3379 return address to, we note 2 cases.
3380 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
3381 we return a SP-relative location since all the addresses are SP-relative
3382 in this case. This prevents the store from being optimized away.
3383 If the fp_offset is not 0, then the addresses will be FP-relative and
3384 therefore we return a FP-relative location. */
3386 if (frame_pointer_needed)
3388 if (fp_offset)
3389 return gen_frame_mem (DImode,
3390 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3391 else
3392 return gen_frame_mem (DImode,
3393 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
3396 /* If FP is not needed, we calculate the location of LR, which would be
3397 at the top of the saved registers block. */
3399 return gen_frame_mem (DImode,
3400 plus_constant (Pmode,
3401 stack_pointer_rtx,
3402 fp_offset
3403 + cfun->machine->frame.saved_regs_size
3404 - 2 * UNITS_PER_WORD));
3407 /* Output code to add DELTA to the first argument, and then jump
3408 to FUNCTION. Used for C++ multiple inheritance. */
3409 static void
3410 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3411 HOST_WIDE_INT delta,
3412 HOST_WIDE_INT vcall_offset,
3413 tree function)
3415 /* The this pointer is always in x0. Note that this differs from
3416 Arm where the this pointer may be bumped to r1 if r0 is required
3417 to return a pointer to an aggregate. On AArch64 a result value
3418 pointer will be in x8. */
3419 int this_regno = R0_REGNUM;
3420 rtx this_rtx, temp0, temp1, addr, funexp;
3421 rtx_insn *insn;
3423 reload_completed = 1;
3424 emit_note (NOTE_INSN_PROLOGUE_END);
3426 if (vcall_offset == 0)
3427 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta, false);
3428 else
3430 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3432 this_rtx = gen_rtx_REG (Pmode, this_regno);
3433 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3434 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3436 addr = this_rtx;
3437 if (delta != 0)
3439 if (delta >= -256 && delta < 256)
3440 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3441 plus_constant (Pmode, this_rtx, delta));
3442 else
3443 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta, false);
3446 if (Pmode == ptr_mode)
3447 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3448 else
3449 aarch64_emit_move (temp0,
3450 gen_rtx_ZERO_EXTEND (Pmode,
3451 gen_rtx_MEM (ptr_mode, addr)));
3453 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3454 addr = plus_constant (Pmode, temp0, vcall_offset);
3455 else
3457 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3458 Pmode);
3459 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3462 if (Pmode == ptr_mode)
3463 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3464 else
3465 aarch64_emit_move (temp1,
3466 gen_rtx_SIGN_EXTEND (Pmode,
3467 gen_rtx_MEM (ptr_mode, addr)));
3469 emit_insn (gen_add2_insn (this_rtx, temp1));
3472 /* Generate a tail call to the target function. */
3473 if (!TREE_USED (function))
3475 assemble_external (function);
3476 TREE_USED (function) = 1;
3478 funexp = XEXP (DECL_RTL (function), 0);
3479 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3480 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3481 SIBLING_CALL_P (insn) = 1;
3483 insn = get_insns ();
3484 shorten_branches (insn);
3485 final_start_function (insn, file, 1);
3486 final (insn, file, 1);
3487 final_end_function ();
3489 /* Stop pretending to be a post-reload pass. */
3490 reload_completed = 0;
3493 static bool
3494 aarch64_tls_referenced_p (rtx x)
3496 if (!TARGET_HAVE_TLS)
3497 return false;
3498 subrtx_iterator::array_type array;
3499 FOR_EACH_SUBRTX (iter, array, x, ALL)
3501 const_rtx x = *iter;
3502 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3503 return true;
3504 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3505 TLS offsets, not real symbol references. */
3506 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3507 iter.skip_subrtxes ();
3509 return false;
3513 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3514 a left shift of 0 or 12 bits. */
3515 bool
3516 aarch64_uimm12_shift (HOST_WIDE_INT val)
3518 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3519 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
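/* For example (illustrative), 0xabc and 0xfff000 satisfy this test
   (shift 0 and shift 12 respectively), whereas 0x1001 does not, since
   its set bits straddle the two 12-bit fields.  */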
3524 /* Return true if val is an immediate that can be loaded into a
3525 register by a MOVZ instruction. */
3526 static bool
3527 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3529 if (GET_MODE_SIZE (mode) > 4)
3531 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3532 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3533 return 1;
3535 else
3537 /* Ignore sign extension. */
3538 val &= (HOST_WIDE_INT) 0xffffffff;
3540 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3541 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
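/* For example (illustrative), 0x1234, 0x12340000 and, for 64-bit modes,
   0x0123000000000000 are all MOVZ immediates (a single 16-bit group of
   set bits at an aligned position), whereas 0x12345 is not.  */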
3544 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3546 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3548 0x0000000100000001ull,
3549 0x0001000100010001ull,
3550 0x0101010101010101ull,
3551 0x1111111111111111ull,
3552 0x5555555555555555ull,
3556 /* Return true if val is a valid bitmask immediate. */
3558 bool
3559 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3561 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3562 int bits;
3564 /* Check for a single sequence of one bits and return quickly if so.
3565 The special cases of all ones and all zeroes return false. */
3566 val = (unsigned HOST_WIDE_INT) val_in;
3567 tmp = val + (val & -val);
3569 if (tmp == (tmp & -tmp))
3570 return (val + 1) > 1;
3572 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3573 if (mode == SImode)
3574 val = (val << 32) | (val & 0xffffffff);
3576 /* Invert if the immediate doesn't start with a zero bit - this means we
3577 only need to search for sequences of one bits. */
3578 if (val & 1)
3579 val = ~val;
3581 /* Find the first set bit and set tmp to val with the first sequence of one
3582 bits removed. Return success if there is a single sequence of ones. */
3583 first_one = val & -val;
3584 tmp = val & (val + first_one);
3586 if (tmp == 0)
3587 return true;
3589 /* Find the next set bit and compute the difference in bit position. */
3590 next_one = tmp & -tmp;
3591 bits = clz_hwi (first_one) - clz_hwi (next_one);
3592 mask = val ^ tmp;
3594 /* Check the bit position difference is a power of 2, and that the first
3595 sequence of one bits fits within 'bits' bits. */
3596 if ((mask >> bits) != 0 || bits != (bits & -bits))
3597 return false;
3599 /* Check the sequence of one bits is repeated 64/bits times. */
3600 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
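/* For example (illustrative), 0x00ff00ff00ff00ff (an 8-bit run of ones
   repeated every 16 bits) and 0x0003fffc (a single contiguous run of
   ones) are valid bitmask immediates, whereas 0, ~0 and 0x12345 are
   not.  */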
3604 /* Return true if val is an immediate that can be loaded into a
3605 register in a single instruction. */
3606 bool
3607 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3609 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3610 return 1;
3611 return aarch64_bitmask_imm (val, mode);
3614 static bool
3615 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3617 rtx base, offset;
3619 if (GET_CODE (x) == HIGH)
3620 return true;
3622 split_const (x, &base, &offset);
3623 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3625 if (aarch64_classify_symbol (base, offset)
3626 != SYMBOL_FORCE_TO_MEM)
3627 return true;
3628 else
3629 /* Avoid generating a 64-bit relocation in ILP32; leave it
3630 to aarch64_expand_mov_immediate to handle properly. */
3631 return mode != ptr_mode;
3634 return aarch64_tls_referenced_p (x);
3637 /* Implement TARGET_CASE_VALUES_THRESHOLD.
3638 The expansion for a table switch is quite expensive due to the number
3639 of instructions, the table lookup and the hard-to-predict indirect jump.
3640 When optimizing for speed with -O3 enabled, use the per-core tuning if
3641 set; otherwise use tables for more than 16 cases as a tradeoff between size and
3642 performance. When optimizing for size, use the default setting. */
3644 static unsigned int
3645 aarch64_case_values_threshold (void)
3647 /* Use the specified limit for the number of cases before using jump
3648 tables at higher optimization levels. */
3649 if (optimize > 2
3650 && selected_cpu->tune->max_case_values != 0)
3651 return selected_cpu->tune->max_case_values;
3652 else
3653 return optimize_size ? default_case_values_threshold () : 17;
3656 /* Return true if register REGNO is a valid index register.
3657 STRICT_P is true if REG_OK_STRICT is in effect. */
3659 bool
3660 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3662 if (!HARD_REGISTER_NUM_P (regno))
3664 if (!strict_p)
3665 return true;
3667 if (!reg_renumber)
3668 return false;
3670 regno = reg_renumber[regno];
3672 return GP_REGNUM_P (regno);
3675 /* Return true if register REGNO is a valid base register.
3676 STRICT_P is true if REG_OK_STRICT is in effect. */
3678 bool
3679 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3681 if (!HARD_REGISTER_NUM_P (regno))
3683 if (!strict_p)
3684 return true;
3686 if (!reg_renumber)
3687 return false;
3689 regno = reg_renumber[regno];
3692 /* The fake registers will be eliminated to either the stack or
3693 hard frame pointer, both of which are usually valid base registers.
3694 Reload deals with the cases where the eliminated form isn't valid. */
3695 return (GP_REGNUM_P (regno)
3696 || regno == SP_REGNUM
3697 || regno == FRAME_POINTER_REGNUM
3698 || regno == ARG_POINTER_REGNUM);
3701 /* Return true if X is a valid base register.
3702 STRICT_P is true if REG_OK_STRICT is in effect. */
3704 static bool
3705 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3707 if (!strict_p && GET_CODE (x) == SUBREG)
3708 x = SUBREG_REG (x);
3710 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3713 /* Return true if address offset is a valid index. If it is, fill in INFO
3714 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3716 static bool
3717 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3718 machine_mode mode, bool strict_p)
3720 enum aarch64_address_type type;
3721 rtx index;
3722 int shift;
3724 /* (reg:P) */
3725 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3726 && GET_MODE (x) == Pmode)
3728 type = ADDRESS_REG_REG;
3729 index = x;
3730 shift = 0;
3732 /* (sign_extend:DI (reg:SI)) */
3733 else if ((GET_CODE (x) == SIGN_EXTEND
3734 || GET_CODE (x) == ZERO_EXTEND)
3735 && GET_MODE (x) == DImode
3736 && GET_MODE (XEXP (x, 0)) == SImode)
3738 type = (GET_CODE (x) == SIGN_EXTEND)
3739 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3740 index = XEXP (x, 0);
3741 shift = 0;
3743 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3744 else if (GET_CODE (x) == MULT
3745 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3746 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3747 && GET_MODE (XEXP (x, 0)) == DImode
3748 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3749 && CONST_INT_P (XEXP (x, 1)))
3751 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3752 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3753 index = XEXP (XEXP (x, 0), 0);
3754 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3756 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3757 else if (GET_CODE (x) == ASHIFT
3758 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3759 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3760 && GET_MODE (XEXP (x, 0)) == DImode
3761 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3762 && CONST_INT_P (XEXP (x, 1)))
3764 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3765 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3766 index = XEXP (XEXP (x, 0), 0);
3767 shift = INTVAL (XEXP (x, 1));
3769 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3770 else if ((GET_CODE (x) == SIGN_EXTRACT
3771 || GET_CODE (x) == ZERO_EXTRACT)
3772 && GET_MODE (x) == DImode
3773 && GET_CODE (XEXP (x, 0)) == MULT
3774 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3775 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3777 type = (GET_CODE (x) == SIGN_EXTRACT)
3778 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3779 index = XEXP (XEXP (x, 0), 0);
3780 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3781 if (INTVAL (XEXP (x, 1)) != 32 + shift
3782 || INTVAL (XEXP (x, 2)) != 0)
3783 shift = -1;
3785 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3786 (const_int 0xffffffff<<shift)) */
3787 else if (GET_CODE (x) == AND
3788 && GET_MODE (x) == DImode
3789 && GET_CODE (XEXP (x, 0)) == MULT
3790 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3791 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3792 && CONST_INT_P (XEXP (x, 1)))
3794 type = ADDRESS_REG_UXTW;
3795 index = XEXP (XEXP (x, 0), 0);
3796 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3797 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3798 shift = -1;
3800 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3801 else if ((GET_CODE (x) == SIGN_EXTRACT
3802 || GET_CODE (x) == ZERO_EXTRACT)
3803 && GET_MODE (x) == DImode
3804 && GET_CODE (XEXP (x, 0)) == ASHIFT
3805 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3806 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3808 type = (GET_CODE (x) == SIGN_EXTRACT)
3809 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3810 index = XEXP (XEXP (x, 0), 0);
3811 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3812 if (INTVAL (XEXP (x, 1)) != 32 + shift
3813 || INTVAL (XEXP (x, 2)) != 0)
3814 shift = -1;
3816 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3817 (const_int 0xffffffff<<shift)) */
3818 else if (GET_CODE (x) == AND
3819 && GET_MODE (x) == DImode
3820 && GET_CODE (XEXP (x, 0)) == ASHIFT
3821 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3822 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3823 && CONST_INT_P (XEXP (x, 1)))
3825 type = ADDRESS_REG_UXTW;
3826 index = XEXP (XEXP (x, 0), 0);
3827 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3828 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3829 shift = -1;
3831 /* (mult:P (reg:P) (const_int scale)) */
3832 else if (GET_CODE (x) == MULT
3833 && GET_MODE (x) == Pmode
3834 && GET_MODE (XEXP (x, 0)) == Pmode
3835 && CONST_INT_P (XEXP (x, 1)))
3837 type = ADDRESS_REG_REG;
3838 index = XEXP (x, 0);
3839 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3841 /* (ashift:P (reg:P) (const_int shift)) */
3842 else if (GET_CODE (x) == ASHIFT
3843 && GET_MODE (x) == Pmode
3844 && GET_MODE (XEXP (x, 0)) == Pmode
3845 && CONST_INT_P (XEXP (x, 1)))
3847 type = ADDRESS_REG_REG;
3848 index = XEXP (x, 0);
3849 shift = INTVAL (XEXP (x, 1));
3851 else
3852 return false;
3854 if (GET_CODE (index) == SUBREG)
3855 index = SUBREG_REG (index);
3857 if ((shift == 0 ||
3858 (shift > 0 && shift <= 3
3859 && (1 << shift) == GET_MODE_SIZE (mode)))
3860 && REG_P (index)
3861 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3863 info->type = type;
3864 info->offset = index;
3865 info->shift = shift;
3866 return true;
3869 return false;
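/* For example (illustrative), the index of the SImode address
   [x0, w1, sxtw #2], i.e.

     (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 4))

   is recognised above as ADDRESS_REG_SXTW with shift 2, which matches
   GET_MODE_SIZE (SImode).  */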
3872 bool
3873 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3875 return (offset >= -64 * GET_MODE_SIZE (mode)
3876 && offset < 64 * GET_MODE_SIZE (mode)
3877 && offset % GET_MODE_SIZE (mode) == 0);
3880 static inline bool
3881 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3882 HOST_WIDE_INT offset)
3884 return offset >= -256 && offset < 256;
3887 static inline bool
3888 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3890 return (offset >= 0
3891 && offset < 4096 * GET_MODE_SIZE (mode)
3892 && offset % GET_MODE_SIZE (mode) == 0);
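/* Worked examples of the three range checks above for a DImode (8-byte)
   access, purely illustrative:
     aarch64_offset_7bit_signed_scaled_p:  -512 .. 504, multiples of 8
     offset_9bit_signed_unscaled_p:        -256 .. 255, any alignment
     offset_12bit_unsigned_scaled_p:          0 .. 32760, multiples of 8  */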
3895 /* Return true if MODE is one of the modes for which we
3896 support LDP/STP operations. */
3898 static bool
3899 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3901 return mode == SImode || mode == DImode
3902 || mode == SFmode || mode == DFmode
3903 || (aarch64_vector_mode_supported_p (mode)
3904 && GET_MODE_SIZE (mode) == 8);
3907 /* Return true if REGNO is a virtual pointer register, or an eliminable
3908 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
3909 include stack_pointer or hard_frame_pointer. */
3910 static bool
3911 virt_or_elim_regno_p (unsigned regno)
3913 return ((regno >= FIRST_VIRTUAL_REGISTER
3914 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
3915 || regno == FRAME_POINTER_REGNUM
3916 || regno == ARG_POINTER_REGNUM);
3919 /* Return true if X is a valid address for machine mode MODE. If it is,
3920 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3921 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3923 static bool
3924 aarch64_classify_address (struct aarch64_address_info *info,
3925 rtx x, machine_mode mode,
3926 RTX_CODE outer_code, bool strict_p)
3928 enum rtx_code code = GET_CODE (x);
3929 rtx op0, op1;
3931 /* On BE, we use load/store pair for all large int mode load/stores. */
3932 bool load_store_pair_p = (outer_code == PARALLEL
3933 || (BYTES_BIG_ENDIAN
3934 && aarch64_vect_struct_mode_p (mode)));
3936 bool allow_reg_index_p =
3937 !load_store_pair_p
3938 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3939 && !aarch64_vect_struct_mode_p (mode);
3941 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3942 REG addressing. */
3943 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3944 && (code != POST_INC && code != REG))
3945 return false;
3947 switch (code)
3949 case REG:
3950 case SUBREG:
3951 info->type = ADDRESS_REG_IMM;
3952 info->base = x;
3953 info->offset = const0_rtx;
3954 return aarch64_base_register_rtx_p (x, strict_p);
3956 case PLUS:
3957 op0 = XEXP (x, 0);
3958 op1 = XEXP (x, 1);
3960 if (! strict_p
3961 && REG_P (op0)
3962 && virt_or_elim_regno_p (REGNO (op0))
3963 && CONST_INT_P (op1))
3965 info->type = ADDRESS_REG_IMM;
3966 info->base = op0;
3967 info->offset = op1;
3969 return true;
3972 if (GET_MODE_SIZE (mode) != 0
3973 && CONST_INT_P (op1)
3974 && aarch64_base_register_rtx_p (op0, strict_p))
3976 HOST_WIDE_INT offset = INTVAL (op1);
3978 info->type = ADDRESS_REG_IMM;
3979 info->base = op0;
3980 info->offset = op1;
3982 /* TImode and TFmode values are allowed in both pairs of X
3983 registers and individual Q registers. The available
3984 address modes are:
3985 X,X: 7-bit signed scaled offset
3986 Q: 9-bit signed offset
3987 We conservatively require an offset representable in either mode.
3988 When performing the check for pairs of X registers i.e. LDP/STP
3989 pass down DImode since that is the natural size of the LDP/STP
3990 instruction memory accesses. */
3991 if (mode == TImode || mode == TFmode)
3992 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
3993 && offset_9bit_signed_unscaled_p (mode, offset));
3995 /* A 7-bit offset check because OImode will emit an ldp/stp
3996 instruction (only big endian will get here).
3997 For ldp/stp instructions, the offset is scaled by the size of a
3998 single element of the pair. */
3999 if (mode == OImode)
4000 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4002 /* Three 9/12-bit offset checks because CImode will emit three
4003 ldr/str instructions (only big endian will get here). */
4004 if (mode == CImode)
4005 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4006 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4007 || offset_12bit_unsigned_scaled_p (V16QImode,
4008 offset + 32)));
4010 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4011 instructions (only big endian will get here). */
4012 if (mode == XImode)
4013 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4014 && aarch64_offset_7bit_signed_scaled_p (TImode,
4015 offset + 32));
4017 if (load_store_pair_p)
4018 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4019 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4020 else
4021 return (offset_9bit_signed_unscaled_p (mode, offset)
4022 || offset_12bit_unsigned_scaled_p (mode, offset));
4025 if (allow_reg_index_p)
4027 /* Look for base + (scaled/extended) index register. */
4028 if (aarch64_base_register_rtx_p (op0, strict_p)
4029 && aarch64_classify_index (info, op1, mode, strict_p))
4031 info->base = op0;
4032 return true;
4034 if (aarch64_base_register_rtx_p (op1, strict_p)
4035 && aarch64_classify_index (info, op0, mode, strict_p))
4037 info->base = op1;
4038 return true;
4042 return false;
4044 case POST_INC:
4045 case POST_DEC:
4046 case PRE_INC:
4047 case PRE_DEC:
4048 info->type = ADDRESS_REG_WB;
4049 info->base = XEXP (x, 0);
4050 info->offset = NULL_RTX;
4051 return aarch64_base_register_rtx_p (info->base, strict_p);
4053 case POST_MODIFY:
4054 case PRE_MODIFY:
4055 info->type = ADDRESS_REG_WB;
4056 info->base = XEXP (x, 0);
4057 if (GET_CODE (XEXP (x, 1)) == PLUS
4058 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4059 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4060 && aarch64_base_register_rtx_p (info->base, strict_p))
4062 HOST_WIDE_INT offset;
4063 info->offset = XEXP (XEXP (x, 1), 1);
4064 offset = INTVAL (info->offset);
4066 /* TImode and TFmode values are allowed in both pairs of X
4067 registers and individual Q registers. The available
4068 address modes are:
4069 X,X: 7-bit signed scaled offset
4070 Q: 9-bit signed offset
4071 We conservatively require an offset representable in either mode.  */
4073 if (mode == TImode || mode == TFmode)
4074 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4075 && offset_9bit_signed_unscaled_p (mode, offset));
4077 if (load_store_pair_p)
4078 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4079 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4080 else
4081 return offset_9bit_signed_unscaled_p (mode, offset);
4083 return false;
4085 case CONST:
4086 case SYMBOL_REF:
4087 case LABEL_REF:
4088 /* load literal: pc-relative constant pool entry. Only supported
4089 for SI mode or larger. */
4090 info->type = ADDRESS_SYMBOLIC;
4092 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4094 rtx sym, addend;
4096 split_const (x, &sym, &addend);
4097 return ((GET_CODE (sym) == LABEL_REF
4098 || (GET_CODE (sym) == SYMBOL_REF
4099 && CONSTANT_POOL_ADDRESS_P (sym)
4100 && aarch64_pcrelative_literal_loads)));
4102 return false;
4104 case LO_SUM:
4105 info->type = ADDRESS_LO_SUM;
4106 info->base = XEXP (x, 0);
4107 info->offset = XEXP (x, 1);
4108 if (allow_reg_index_p
4109 && aarch64_base_register_rtx_p (info->base, strict_p))
4111 rtx sym, offs;
4112 split_const (info->offset, &sym, &offs);
4113 if (GET_CODE (sym) == SYMBOL_REF
4114 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4116 /* The symbol and offset must be aligned to the access size. */
4117 unsigned int align;
4118 unsigned int ref_size;
4120 if (CONSTANT_POOL_ADDRESS_P (sym))
4121 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4122 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4124 tree exp = SYMBOL_REF_DECL (sym);
4125 align = TYPE_ALIGN (TREE_TYPE (exp));
4126 align = CONSTANT_ALIGNMENT (exp, align);
4128 else if (SYMBOL_REF_DECL (sym))
4129 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4130 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4131 && SYMBOL_REF_BLOCK (sym) != NULL)
4132 align = SYMBOL_REF_BLOCK (sym)->alignment;
4133 else
4134 align = BITS_PER_UNIT;
4136 ref_size = GET_MODE_SIZE (mode);
4137 if (ref_size == 0)
4138 ref_size = GET_MODE_SIZE (DImode);
4140 return ((INTVAL (offs) & (ref_size - 1)) == 0
4141 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4144 return false;
4146 default:
4147 return false;
4151 bool
4152 aarch64_symbolic_address_p (rtx x)
4154 rtx offset;
4156 split_const (x, &x, &offset);
4157 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4160 /* Classify the base of symbolic expression X. */
4162 enum aarch64_symbol_type
4163 aarch64_classify_symbolic_expression (rtx x)
4165 rtx offset;
4167 split_const (x, &x, &offset);
4168 return aarch64_classify_symbol (x, offset);
4172 /* Return TRUE if X is a legitimate address for accessing memory in
4173 mode MODE. */
4174 static bool
4175 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4177 struct aarch64_address_info addr;
4179 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4182 /* Return TRUE if X is a legitimate address for accessing memory in
4183 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4184 pair operation. */
4185 bool
4186 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4187 RTX_CODE outer_code, bool strict_p)
4189 struct aarch64_address_info addr;
4191 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4194 /* Return TRUE if rtx X is immediate constant 0.0 */
4195 bool
4196 aarch64_float_const_zero_rtx_p (rtx x)
4198 if (GET_MODE (x) == VOIDmode)
4199 return false;
4201 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4202 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4203 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4206 /* Return the fixed registers used for condition codes. */
4208 static bool
4209 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4211 *p1 = CC_REGNUM;
4212 *p2 = INVALID_REGNUM;
4213 return true;
4216 /* Emit call insn with PAT and do aarch64-specific handling. */
4218 void
4219 aarch64_emit_call_insn (rtx pat)
4221 rtx insn = emit_call_insn (pat);
4223 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4224 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4225 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4228 machine_mode
4229 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4231 /* All floating point compares return CCFP if it is an equality
4232 comparison, and CCFPE otherwise. */
4233 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4235 switch (code)
4237 case EQ:
4238 case NE:
4239 case UNORDERED:
4240 case ORDERED:
4241 case UNLT:
4242 case UNLE:
4243 case UNGT:
4244 case UNGE:
4245 case UNEQ:
4246 case LTGT:
4247 return CCFPmode;
4249 case LT:
4250 case LE:
4251 case GT:
4252 case GE:
4253 return CCFPEmode;
4255 default:
4256 gcc_unreachable ();
4260 /* Equality comparisons of short modes against zero can be performed
4261 using the TST instruction with the appropriate bitmask. */
4262 if (y == const0_rtx && REG_P (x)
4263 && (code == EQ || code == NE)
4264 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4265 return CC_NZmode;
4267 /* Similarly, comparisons of zero_extends from shorter modes can
4268 be performed using an ANDS with an immediate mask. */
4269 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4270 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4271 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4272 && (code == EQ || code == NE))
4273 return CC_NZmode;
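  /* The case above lets, for example, a comparison of
     (zero_extend:SI (reg:QI)) against zero be implemented as
     TST Wn, #255 (an ANDS that writes only the flags), so just the
     N and Z flags are meaningful and CC_NZmode is the right choice.  */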
4275 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4276 && y == const0_rtx
4277 && (code == EQ || code == NE || code == LT || code == GE)
4278 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4279 || GET_CODE (x) == NEG
4280 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4281 && CONST_INT_P (XEXP (x, 2)))))
4282 return CC_NZmode;
4284 /* A compare with a shifted operand. Because of canonicalization,
4285 the comparison will have to be swapped when we emit the assembly
4286 code. */
4287 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4288 && (REG_P (y) || GET_CODE (y) == SUBREG)
4289 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4290 || GET_CODE (x) == LSHIFTRT
4291 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4292 return CC_SWPmode;
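  /* For example (illustrative): (lt (ashift:DI (reg:DI x1) (const_int 3))
     (reg:DI x2)) gets CC_SWPmode; the compare is emitted with the operands
     swapped, as CMP x2, x1, lsl 3, and the LT condition is then printed
     as GT.  */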
4294 /* Similarly for a negated operand, but we can only do this for
4295 equalities. */
4296 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4297 && (REG_P (y) || GET_CODE (y) == SUBREG)
4298 && (code == EQ || code == NE)
4299 && GET_CODE (x) == NEG)
4300 return CC_Zmode;
4302 /* A test for unsigned overflow. */
4303 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4304 && code == NE
4305 && GET_CODE (x) == PLUS
4306 && GET_CODE (y) == ZERO_EXTEND)
4307 return CC_Cmode;
4309 /* For everything else, return CCmode. */
4310 return CCmode;
4313 static int
4314 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4317 aarch64_get_condition_code (rtx x)
4319 machine_mode mode = GET_MODE (XEXP (x, 0));
4320 enum rtx_code comp_code = GET_CODE (x);
4322 if (GET_MODE_CLASS (mode) != MODE_CC)
4323 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4324 return aarch64_get_condition_code_1 (mode, comp_code);
4327 static int
4328 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4330 switch (mode)
4332 case CCFPmode:
4333 case CCFPEmode:
4334 switch (comp_code)
4336 case GE: return AARCH64_GE;
4337 case GT: return AARCH64_GT;
4338 case LE: return AARCH64_LS;
4339 case LT: return AARCH64_MI;
4340 case NE: return AARCH64_NE;
4341 case EQ: return AARCH64_EQ;
4342 case ORDERED: return AARCH64_VC;
4343 case UNORDERED: return AARCH64_VS;
4344 case UNLT: return AARCH64_LT;
4345 case UNLE: return AARCH64_LE;
4346 case UNGT: return AARCH64_HI;
4347 case UNGE: return AARCH64_PL;
4348 default: return -1;
4350 break;
4352 case CCmode:
4353 switch (comp_code)
4355 case NE: return AARCH64_NE;
4356 case EQ: return AARCH64_EQ;
4357 case GE: return AARCH64_GE;
4358 case GT: return AARCH64_GT;
4359 case LE: return AARCH64_LE;
4360 case LT: return AARCH64_LT;
4361 case GEU: return AARCH64_CS;
4362 case GTU: return AARCH64_HI;
4363 case LEU: return AARCH64_LS;
4364 case LTU: return AARCH64_CC;
4365 default: return -1;
4367 break;
4369 case CC_SWPmode:
4370 switch (comp_code)
4372 case NE: return AARCH64_NE;
4373 case EQ: return AARCH64_EQ;
4374 case GE: return AARCH64_LE;
4375 case GT: return AARCH64_LT;
4376 case LE: return AARCH64_GE;
4377 case LT: return AARCH64_GT;
4378 case GEU: return AARCH64_LS;
4379 case GTU: return AARCH64_CC;
4380 case LEU: return AARCH64_CS;
4381 case LTU: return AARCH64_HI;
4382 default: return -1;
4384 break;
4386 case CC_NZmode:
4387 switch (comp_code)
4389 case NE: return AARCH64_NE;
4390 case EQ: return AARCH64_EQ;
4391 case GE: return AARCH64_PL;
4392 case LT: return AARCH64_MI;
4393 default: return -1;
4395 break;
4397 case CC_Zmode:
4398 switch (comp_code)
4400 case NE: return AARCH64_NE;
4401 case EQ: return AARCH64_EQ;
4402 default: return -1;
4404 break;
4406 case CC_Cmode:
4407 switch (comp_code)
4409 case NE: return AARCH64_CS;
4410 case EQ: return AARCH64_CC;
4411 default: return -1;
4413 break;
4415 default:
4416 return -1;
4417 break;
4420 return -1;
4423 bool
4424 aarch64_const_vec_all_same_in_range_p (rtx x,
4425 HOST_WIDE_INT minval,
4426 HOST_WIDE_INT maxval)
4428 HOST_WIDE_INT firstval;
4429 int count, i;
4431 if (GET_CODE (x) != CONST_VECTOR
4432 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4433 return false;
4435 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4436 if (firstval < minval || firstval > maxval)
4437 return false;
4439 count = CONST_VECTOR_NUNITS (x);
4440 for (i = 1; i < count; i++)
4441 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4442 return false;
4444 return true;
4447 bool
4448 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4450 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4454 /* N Z C V. */
4455 #define AARCH64_CC_V 1
4456 #define AARCH64_CC_C (1 << 1)
4457 #define AARCH64_CC_Z (1 << 2)
4458 #define AARCH64_CC_N (1 << 3)
4460 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4461 static const int aarch64_nzcv_codes[] =
4463 0, /* EQ, Z == 1. */
4464 AARCH64_CC_Z, /* NE, Z == 0. */
4465 0, /* CS, C == 1. */
4466 AARCH64_CC_C, /* CC, C == 0. */
4467 0, /* MI, N == 1. */
4468 AARCH64_CC_N, /* PL, N == 0. */
4469 0, /* VS, V == 1. */
4470 AARCH64_CC_V, /* VC, V == 0. */
4471 0, /* HI, C == 1 && Z == 0. */
4472 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4473 AARCH64_CC_V, /* GE, N == V. */
4474 0, /* LT, N != V. */
4475 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4476 0, /* LE, !(Z == 0 && N == V). */
4477 0, /* AL, Any. */
4478 0 /* NV, Any. */
4481 static void
4482 aarch64_print_operand (FILE *f, rtx x, int code)
4484 switch (code)
4486 /* An integer or symbol address without a preceding # sign. */
4487 case 'c':
4488 switch (GET_CODE (x))
4490 case CONST_INT:
4491 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4492 break;
4494 case SYMBOL_REF:
4495 output_addr_const (f, x);
4496 break;
4498 case CONST:
4499 if (GET_CODE (XEXP (x, 0)) == PLUS
4500 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4502 output_addr_const (f, x);
4503 break;
4505 /* Fall through. */
4507 default:
4508 output_operand_lossage ("Unsupported operand for code '%c'", code);
4510 break;
4512 case 'e':
4513 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4515 int n;
4517 if (!CONST_INT_P (x)
4518 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4520 output_operand_lossage ("invalid operand for '%%%c'", code);
4521 return;
4524 switch (n)
4526 case 3:
4527 fputc ('b', f);
4528 break;
4529 case 4:
4530 fputc ('h', f);
4531 break;
4532 case 5:
4533 fputc ('w', f);
4534 break;
4535 default:
4536 output_operand_lossage ("invalid operand for '%%%c'", code);
4537 return;
4540 break;
4542 case 'p':
4544 int n;
4546 /* Print N such that 2^N == X. */
4547 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4549 output_operand_lossage ("invalid operand for '%%%c'", code);
4550 return;
4553 asm_fprintf (f, "%d", n);
4555 break;
4557 case 'P':
4558 /* Print the number of non-zero bits in X (a const_int). */
4559 if (!CONST_INT_P (x))
4561 output_operand_lossage ("invalid operand for '%%%c'", code);
4562 return;
4565 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4566 break;
4568 case 'H':
4569 /* Print the higher numbered register of a pair (TImode) of regs. */
4570 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4572 output_operand_lossage ("invalid operand for '%%%c'", code);
4573 return;
4576 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4577 break;
4579 case 'M':
4580 case 'm':
4582 int cond_code;
4583 /* Print a condition (eq, ne, etc) or its inverse. */
4585 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4586 if (x == const_true_rtx)
4588 if (code == 'M')
4589 fputs ("nv", f);
4590 return;
4593 if (!COMPARISON_P (x))
4595 output_operand_lossage ("invalid operand for '%%%c'", code);
4596 return;
4599 cond_code = aarch64_get_condition_code (x);
4600 gcc_assert (cond_code >= 0);
4601 if (code == 'M')
4602 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4603 fputs (aarch64_condition_codes[cond_code], f);
4605 break;
4607 case 'b':
4608 case 'h':
4609 case 's':
4610 case 'd':
4611 case 'q':
4612 /* Print a scalar FP/SIMD register name. */
4613 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4615 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4616 return;
4618 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4619 break;
4621 case 'S':
4622 case 'T':
4623 case 'U':
4624 case 'V':
4625 /* Print the first FP/SIMD register name in a list. */
4626 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4628 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4629 return;
4631 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4632 break;
4634 case 'R':
4635 /* Print a scalar FP/SIMD register name + 1. */
4636 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4638 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4639 return;
4641 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4642 break;
4644 case 'X':
4645 /* Print bottom 16 bits of integer constant in hex. */
4646 if (!CONST_INT_P (x))
4648 output_operand_lossage ("invalid operand for '%%%c'", code);
4649 return;
4651 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4652 break;
4654 case 'w':
4655 case 'x':
4656 /* Print a general register name or the zero register (32-bit or
4657 64-bit). */
4658 if (x == const0_rtx
4659 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4661 asm_fprintf (f, "%czr", code);
4662 break;
4665 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4667 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4668 break;
4671 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4673 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4674 break;
4677 /* Fall through */
4679 case 0:
4680 /* Print a normal operand, if it's a general register, then we
4681 assume DImode. */
4682 if (x == NULL)
4684 output_operand_lossage ("missing operand");
4685 return;
4688 switch (GET_CODE (x))
4690 case REG:
4691 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4692 break;
4694 case MEM:
4695 output_address (GET_MODE (x), XEXP (x, 0));
4696 break;
4698 case CONST:
4699 case LABEL_REF:
4700 case SYMBOL_REF:
4701 output_addr_const (asm_out_file, x);
4702 break;
4704 case CONST_INT:
4705 asm_fprintf (f, "%wd", INTVAL (x));
4706 break;
4708 case CONST_VECTOR:
4709 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4711 gcc_assert (
4712 aarch64_const_vec_all_same_in_range_p (x,
4713 HOST_WIDE_INT_MIN,
4714 HOST_WIDE_INT_MAX));
4715 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4717 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4719 fputc ('0', f);
4721 else
4722 gcc_unreachable ();
4723 break;
4725 case CONST_DOUBLE:
4726 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4727 be getting CONST_DOUBLEs holding integers. */
4728 gcc_assert (GET_MODE (x) != VOIDmode);
4729 if (aarch64_float_const_zero_rtx_p (x))
4731 fputc ('0', f);
4732 break;
4734 else if (aarch64_float_const_representable_p (x))
4736 #define buf_size 20
4737 char float_buf[buf_size] = {'\0'};
4738 real_to_decimal_for_mode (float_buf,
4739 CONST_DOUBLE_REAL_VALUE (x),
4740 buf_size, buf_size,
4741 1, GET_MODE (x));
4742 asm_fprintf (asm_out_file, "%s", float_buf);
4743 break;
4744 #undef buf_size
4746 output_operand_lossage ("invalid constant");
4747 return;
4748 default:
4749 output_operand_lossage ("invalid operand");
4750 return;
4752 break;
4754 case 'A':
4755 if (GET_CODE (x) == HIGH)
4756 x = XEXP (x, 0);
4758 switch (aarch64_classify_symbolic_expression (x))
4760 case SYMBOL_SMALL_GOT_4G:
4761 asm_fprintf (asm_out_file, ":got:");
4762 break;
4764 case SYMBOL_SMALL_TLSGD:
4765 asm_fprintf (asm_out_file, ":tlsgd:");
4766 break;
4768 case SYMBOL_SMALL_TLSDESC:
4769 asm_fprintf (asm_out_file, ":tlsdesc:");
4770 break;
4772 case SYMBOL_SMALL_TLSIE:
4773 asm_fprintf (asm_out_file, ":gottprel:");
4774 break;
4776 case SYMBOL_TLSLE24:
4777 asm_fprintf (asm_out_file, ":tprel:");
4778 break;
4780 case SYMBOL_TINY_GOT:
4781 gcc_unreachable ();
4782 break;
4784 default:
4785 break;
4787 output_addr_const (asm_out_file, x);
4788 break;
4790 case 'L':
4791 switch (aarch64_classify_symbolic_expression (x))
4793 case SYMBOL_SMALL_GOT_4G:
4794 asm_fprintf (asm_out_file, ":lo12:");
4795 break;
4797 case SYMBOL_SMALL_TLSGD:
4798 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4799 break;
4801 case SYMBOL_SMALL_TLSDESC:
4802 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4803 break;
4805 case SYMBOL_SMALL_TLSIE:
4806 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4807 break;
4809 case SYMBOL_TLSLE12:
4810 asm_fprintf (asm_out_file, ":tprel_lo12:");
4811 break;
4813 case SYMBOL_TLSLE24:
4814 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4815 break;
4817 case SYMBOL_TINY_GOT:
4818 asm_fprintf (asm_out_file, ":got:");
4819 break;
4821 case SYMBOL_TINY_TLSIE:
4822 asm_fprintf (asm_out_file, ":gottprel:");
4823 break;
4825 default:
4826 break;
4828 output_addr_const (asm_out_file, x);
4829 break;
4831 case 'G':
4833 switch (aarch64_classify_symbolic_expression (x))
4835 case SYMBOL_TLSLE24:
4836 asm_fprintf (asm_out_file, ":tprel_hi12:");
4837 break;
4838 default:
4839 break;
4841 output_addr_const (asm_out_file, x);
4842 break;
4844 case 'k':
4846 HOST_WIDE_INT cond_code;
4847 /* Print nzcv. */
4849 if (!CONST_INT_P (x))
4851 output_operand_lossage ("invalid operand for '%%%c'", code);
4852 return;
4855 cond_code = INTVAL (x);
4856 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
4857 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
4859 break;
4861 default:
4862 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4863 return;
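/* A few illustrative uses of the modifiers above (operand numbers and
   registers are arbitrary): with operand 0 being (reg:DI x3), "%x0" prints
   "x3" and "%w0" prints "w3"; with (const_int 16), "%p0" prints "4"; with
   (const_int 8), "%e0" prints "b"; for an EQ comparison, "%m0" prints "eq"
   while "%M0" prints the inverse, "ne".  */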
4867 static void
4868 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4870 struct aarch64_address_info addr;
4872 if (aarch64_classify_address (&addr, x, mode, MEM, true))
4873 switch (addr.type)
4875 case ADDRESS_REG_IMM:
4876 if (addr.offset == const0_rtx)
4877 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4878 else
4879 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4880 INTVAL (addr.offset));
4881 return;
4883 case ADDRESS_REG_REG:
4884 if (addr.shift == 0)
4885 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4886 reg_names [REGNO (addr.offset)]);
4887 else
4888 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4889 reg_names [REGNO (addr.offset)], addr.shift);
4890 return;
4892 case ADDRESS_REG_UXTW:
4893 if (addr.shift == 0)
4894 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4895 REGNO (addr.offset) - R0_REGNUM);
4896 else
4897 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4898 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4899 return;
4901 case ADDRESS_REG_SXTW:
4902 if (addr.shift == 0)
4903 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4904 REGNO (addr.offset) - R0_REGNUM);
4905 else
4906 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4907 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4908 return;
4910 case ADDRESS_REG_WB:
4911 switch (GET_CODE (x))
4913 case PRE_INC:
4914 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4915 GET_MODE_SIZE (mode));
4916 return;
4917 case POST_INC:
4918 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4919 GET_MODE_SIZE (mode));
4920 return;
4921 case PRE_DEC:
4922 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4923 GET_MODE_SIZE (mode));
4924 return;
4925 case POST_DEC:
4926 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4927 GET_MODE_SIZE (mode));
4928 return;
4929 case PRE_MODIFY:
4930 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4931 INTVAL (addr.offset));
4932 return;
4933 case POST_MODIFY:
4934 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4935 INTVAL (addr.offset));
4936 return;
4937 default:
4938 break;
4940 break;
4942 case ADDRESS_LO_SUM:
4943 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4944 output_addr_const (f, addr.offset);
4945 asm_fprintf (f, "]");
4946 return;
4948 case ADDRESS_SYMBOLIC:
4949 break;
4952 output_addr_const (f, x);
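/* Illustrative outputs from the classification above (registers arbitrary):
   a reg+immediate DImode address prints as "[x0, 16]", a zero-extended and
   scaled index as "[x0, w1, uxtw 2]", a LO_SUM as "[x0, #:lo12:sym]", and a
   post-increment of an 8-byte access as "[x0], 8".  */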
4955 bool
4956 aarch64_label_mentioned_p (rtx x)
4958 const char *fmt;
4959 int i;
4961 if (GET_CODE (x) == LABEL_REF)
4962 return true;
4964 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4965 referencing instruction, but they are constant offsets, not
4966 symbols. */
4967 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4968 return false;
4970 fmt = GET_RTX_FORMAT (GET_CODE (x));
4971 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4973 if (fmt[i] == 'E')
4975 int j;
4977 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4978 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4979 return 1;
4981 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4982 return 1;
4985 return 0;
4988 /* Implement REGNO_REG_CLASS. */
4990 enum reg_class
4991 aarch64_regno_regclass (unsigned regno)
4993 if (GP_REGNUM_P (regno))
4994 return GENERAL_REGS;
4996 if (regno == SP_REGNUM)
4997 return STACK_REG;
4999 if (regno == FRAME_POINTER_REGNUM
5000 || regno == ARG_POINTER_REGNUM)
5001 return POINTER_REGS;
5003 if (FP_REGNUM_P (regno))
5004 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5006 return NO_REGS;
5009 static rtx
5010 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5012 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5013 where mask is selected by alignment and size of the offset.
5014 We try to pick as large a range for the offset as possible to
5015 maximize the chance of a CSE. However, for aligned addresses
5016 we limit the range to 4k so that structures with different sized
5017 elements are likely to use the same base. We need to be careful
5018 not to split a CONST for some forms of address expression, otherwise
5019 it will generate sub-optimal code. */
5021 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5023 rtx base = XEXP (x, 0);
5024 rtx offset_rtx = XEXP (x, 1);
5025 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5027 if (GET_CODE (base) == PLUS)
5029 rtx op0 = XEXP (base, 0);
5030 rtx op1 = XEXP (base, 1);
5032 /* Force any scaling into a temp for CSE. */
5033 op0 = force_reg (Pmode, op0);
5034 op1 = force_reg (Pmode, op1);
5036 /* Let the pointer register be in op0. */
5037 if (REG_POINTER (op1))
5038 std::swap (op0, op1);
5040 /* If the pointer is virtual or frame related, then we know that
5041 virtual register instantiation or register elimination is going
5042 to apply a second constant. We want the two constants folded
5043 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5044 if (virt_or_elim_regno_p (REGNO (op0)))
5046 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5047 NULL_RTX, true, OPTAB_DIRECT);
5048 return gen_rtx_PLUS (Pmode, base, op1);
5051 /* Otherwise, in order to encourage CSE (and thence loop strength
5052 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5053 base = expand_binop (Pmode, add_optab, op0, op1,
5054 NULL_RTX, true, OPTAB_DIRECT);
5055 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5058 /* Does it look like we'll need a load/store-pair operation? */
5059 HOST_WIDE_INT base_offset;
5060 if (GET_MODE_SIZE (mode) > 16
5061 || mode == TImode)
5062 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
5063 & ~((128 * GET_MODE_SIZE (mode)) - 1));
5064 /* For offsets that aren't a multiple of the access size, the limit is
5065 -256...255. */
5066 else if (offset & (GET_MODE_SIZE (mode) - 1))
5067 base_offset = (offset + 0x100) & ~0x1ff;
5068 else
5069 base_offset = offset & ~0xfff;
5071 if (base_offset != 0)
5073 base = plus_constant (Pmode, base, base_offset);
5074 base = force_operand (base, NULL_RTX);
5075 return plus_constant (Pmode, base, offset - base_offset);
5079 return x;
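/* Worked example (illustrative): legitimizing (plus (reg) (const_int 0x4008))
   for a DImode access takes the aligned path, so base_offset is
   0x4008 & ~0xfff == 0x4000; the code forces reg+0x4000 into a temporary and
   returns (plus tmp (const_int 8)), letting nearby accesses share and CSE
   the rebased pointer.  */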
5082 /* Return the reload icode required for a constant pool in mode. */
5083 static enum insn_code
5084 aarch64_constant_pool_reload_icode (machine_mode mode)
5086 switch (mode)
5088 case SFmode:
5089 return CODE_FOR_aarch64_reload_movcpsfdi;
5091 case DFmode:
5092 return CODE_FOR_aarch64_reload_movcpdfdi;
5094 case TFmode:
5095 return CODE_FOR_aarch64_reload_movcptfdi;
5097 case V8QImode:
5098 return CODE_FOR_aarch64_reload_movcpv8qidi;
5100 case V16QImode:
5101 return CODE_FOR_aarch64_reload_movcpv16qidi;
5103 case V4HImode:
5104 return CODE_FOR_aarch64_reload_movcpv4hidi;
5106 case V8HImode:
5107 return CODE_FOR_aarch64_reload_movcpv8hidi;
5109 case V2SImode:
5110 return CODE_FOR_aarch64_reload_movcpv2sidi;
5112 case V4SImode:
5113 return CODE_FOR_aarch64_reload_movcpv4sidi;
5115 case V2DImode:
5116 return CODE_FOR_aarch64_reload_movcpv2didi;
5118 case V2DFmode:
5119 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5121 default:
5122 gcc_unreachable ();
5125 gcc_unreachable ();
5127 static reg_class_t
5128 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5129 reg_class_t rclass,
5130 machine_mode mode,
5131 secondary_reload_info *sri)
5134 /* If we have to disable direct literal pool loads and stores because the
5135 function is too big, then we need a scratch register. */
5136 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5137 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5138 || targetm.vector_mode_supported_p (GET_MODE (x)))
5139 && !aarch64_pcrelative_literal_loads)
5141 sri->icode = aarch64_constant_pool_reload_icode (mode);
5142 return NO_REGS;
5145 /* Without the TARGET_SIMD instructions we cannot move a Q register
5146 to a Q register directly. We need a scratch. */
5147 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5148 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5149 && reg_class_subset_p (rclass, FP_REGS))
5151 if (mode == TFmode)
5152 sri->icode = CODE_FOR_aarch64_reload_movtf;
5153 else if (mode == TImode)
5154 sri->icode = CODE_FOR_aarch64_reload_movti;
5155 return NO_REGS;
5158 /* A TFmode or TImode memory access should be handled via an FP_REG
5159 because AArch64 has richer addressing modes for LDR/STR instructions
5160 than LDP/STP instructions. */
5161 if (TARGET_FLOAT && rclass == GENERAL_REGS
5162 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5163 return FP_REGS;
5165 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5166 return GENERAL_REGS;
5168 return NO_REGS;
5171 static bool
5172 aarch64_can_eliminate (const int from, const int to)
5174 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5175 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5177 if (frame_pointer_needed)
5179 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5180 return true;
5181 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5182 return false;
5183 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5184 && !cfun->calls_alloca)
5185 return true;
5186 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5187 return true;
5189 return false;
5191 else
5193 /* If we decided that we didn't need a leaf frame pointer but then used
5194 LR in the function, then we'll want a frame pointer after all, so
5195 prevent this elimination to ensure a frame pointer is used. */
5196 if (to == STACK_POINTER_REGNUM
5197 && flag_omit_leaf_frame_pointer
5198 && df_regs_ever_live_p (LR_REGNUM))
5199 return false;
5202 return true;
5205 HOST_WIDE_INT
5206 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5208 aarch64_layout_frame ();
5210 if (to == HARD_FRAME_POINTER_REGNUM)
5212 if (from == ARG_POINTER_REGNUM)
5213 return cfun->machine->frame.hard_fp_offset;
5215 if (from == FRAME_POINTER_REGNUM)
5216 return cfun->machine->frame.hard_fp_offset
5217 - cfun->machine->frame.locals_offset;
5220 if (to == STACK_POINTER_REGNUM)
5222 if (from == FRAME_POINTER_REGNUM)
5223 return cfun->machine->frame.frame_size
5224 - cfun->machine->frame.locals_offset;
5227 return cfun->machine->frame.frame_size;
5230 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5231 previous frame. */
5234 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5236 if (count != 0)
5237 return const0_rtx;
5238 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5242 static void
5243 aarch64_asm_trampoline_template (FILE *f)
5245 if (TARGET_ILP32)
5247 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5248 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5250 else
5252 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5253 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5255 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5256 assemble_aligned_integer (4, const0_rtx);
5257 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5258 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5261 static void
5262 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5264 rtx fnaddr, mem, a_tramp;
5265 const int tramp_code_sz = 16;
5267 /* Don't need to copy the trailing D-words, we fill those in below. */
5268 emit_block_move (m_tramp, assemble_trampoline_template (),
5269 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5270 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5271 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5272 if (GET_MODE (fnaddr) != ptr_mode)
5273 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5274 emit_move_insn (mem, fnaddr);
5276 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5277 emit_move_insn (mem, chain_value);
5279 /* XXX We should really define a "clear_cache" pattern and use
5280 gen_clear_cache(). */
5281 a_tramp = XEXP (m_tramp, 0);
5282 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5283 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5284 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5285 ptr_mode);
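/* The initialized trampoline is laid out roughly as: 16 bytes of code copied
   from the template above, then the target function address, then the static
   chain value, each POINTER_BYTES wide.  The template's PC-relative loads
   pick up those two words and the final BR jumps to the target.  */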
5288 static unsigned char
5289 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5291 switch (regclass)
5293 case CALLER_SAVE_REGS:
5294 case POINTER_REGS:
5295 case GENERAL_REGS:
5296 case ALL_REGS:
5297 case FP_REGS:
5298 case FP_LO_REGS:
5299 return
5300 aarch64_vector_mode_p (mode)
5301 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5302 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5303 case STACK_REG:
5304 return 1;
5306 case NO_REGS:
5307 return 0;
5309 default:
5310 break;
5312 gcc_unreachable ();
5315 static reg_class_t
5316 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5318 if (regclass == POINTER_REGS)
5319 return GENERAL_REGS;
5321 if (regclass == STACK_REG)
5323 if (REG_P(x)
5324 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5325 return regclass;
5327 return NO_REGS;
5330 /* If it's an integer immediate that MOVI can't handle, then
5331 FP_REGS is not an option, so we return NO_REGS instead. */
5332 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5333 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5334 return NO_REGS;
5336 /* Register elimination can result in a request for
5337 SP+constant->FP_REGS. We cannot support such operations, which
5338 use SP as a source and an FP_REG as a destination, so reject them
5339 right now. */
5340 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5342 rtx lhs = XEXP (x, 0);
5344 /* Look through a possible SUBREG introduced by ILP32. */
5345 if (GET_CODE (lhs) == SUBREG)
5346 lhs = SUBREG_REG (lhs);
5348 gcc_assert (REG_P (lhs));
5349 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5350 POINTER_REGS));
5351 return NO_REGS;
5354 return regclass;
5357 void
5358 aarch64_asm_output_labelref (FILE* f, const char *name)
5360 asm_fprintf (f, "%U%s", name);
5363 static void
5364 aarch64_elf_asm_constructor (rtx symbol, int priority)
5366 if (priority == DEFAULT_INIT_PRIORITY)
5367 default_ctor_section_asm_out_constructor (symbol, priority);
5368 else
5370 section *s;
5371 char buf[18];
5372 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5373 s = get_section (buf, SECTION_WRITE, NULL);
5374 switch_to_section (s);
5375 assemble_align (POINTER_SIZE);
5376 assemble_aligned_integer (POINTER_BYTES, symbol);
5380 static void
5381 aarch64_elf_asm_destructor (rtx symbol, int priority)
5383 if (priority == DEFAULT_INIT_PRIORITY)
5384 default_dtor_section_asm_out_destructor (symbol, priority);
5385 else
5387 section *s;
5388 char buf[18];
5389 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5390 s = get_section (buf, SECTION_WRITE, NULL);
5391 switch_to_section (s);
5392 assemble_align (POINTER_SIZE);
5393 assemble_aligned_integer (POINTER_BYTES, symbol);
5397 const char*
5398 aarch64_output_casesi (rtx *operands)
5400 char buf[100];
5401 char label[100];
5402 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5403 int index;
5404 static const char *const patterns[4][2] =
5407 "ldrb\t%w3, [%0,%w1,uxtw]",
5408 "add\t%3, %4, %w3, sxtb #2"
5411 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5412 "add\t%3, %4, %w3, sxth #2"
5415 "ldr\t%w3, [%0,%w1,uxtw #2]",
5416 "add\t%3, %4, %w3, sxtw #2"
5418 /* We assume that DImode is only generated when not optimizing and
5419 that we don't really need 64-bit address offsets. That would
5420 imply an object file with 8GB of code in a single function! */
5422 "ldr\t%w3, [%0,%w1,uxtw #2]",
5423 "add\t%3, %4, %w3, sxtw #2"
5427 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5429 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5431 gcc_assert (index >= 0 && index <= 3);
5433 /* Need to implement table size reduction, by changing the code below. */
5434 output_asm_insn (patterns[index][0], operands);
5435 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5436 snprintf (buf, sizeof (buf),
5437 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5438 output_asm_insn (buf, operands);
5439 output_asm_insn (patterns[index][1], operands);
5440 output_asm_insn ("br\t%3", operands);
5441 assemble_label (asm_out_file, label);
5442 return "";
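/* Illustrative output for a byte-sized dispatch table (index 0 above), with
   operands in x0/w1 and scratch registers x3/x4:
       ldrb    w3, [x0,w1,uxtw]
       adr     x4, .Lrtx<N>
       add     x3, x4, w3, sxtb #2
       br      x3
     .Lrtx<N>:
   The table entries are emitted relative to .Lrtx<N>, and the ADD scales the
   loaded entry by 4 (sxtb #2) before the indirect branch.  */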
5446 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5447 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5448 operator. */
5451 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5453 if (shift >= 0 && shift <= 3)
5455 int size;
5456 for (size = 8; size <= 32; size *= 2)
5458 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5459 if (mask == bits << shift)
5460 return size;
5463 return 0;
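/* Worked examples (illustrative): aarch64_uxt_size (2, 0x3fc) returns 8,
   since 0xff << 2 == 0x3fc (a UXTB operand shifted left by 2);
   aarch64_uxt_size (0, 0xffffffff) returns 32 (a plain UXTW); and
   aarch64_uxt_size (1, 0xff) returns 0 because 0xff is not 0xff << 1.  */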
5466 /* Constant pools are per-function only when PC-relative
5467 literal loads are enabled or we are using the large memory
5468 model. */
5470 static inline bool
5471 aarch64_can_use_per_function_literal_pools_p (void)
5473 return (aarch64_pcrelative_literal_loads
5474 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5477 static bool
5478 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5480 /* FIXME: In an ideal world this would work similarly
5481 to the logic in aarch64_select_rtx_section, but that
5482 breaks bootstrap in gccgo. For now we work around
5483 this by returning false here. */
5484 return false;
5487 /* Select appropriate section for constants depending
5488 on where we place literal pools. */
5490 static section *
5491 aarch64_select_rtx_section (machine_mode mode,
5492 rtx x,
5493 unsigned HOST_WIDE_INT align)
5495 if (aarch64_can_use_per_function_literal_pools_p ())
5496 return function_section (current_function_decl);
5498 return default_elf_select_rtx_section (mode, x, align);
5501 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5502 void
5503 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5504 HOST_WIDE_INT offset)
5506 /* When using per-function literal pools, we must ensure that any code
5507 section is aligned to the minimal instruction length, lest we get
5508 errors from the assembler about "unaligned instructions". */
5509 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5510 ASM_OUTPUT_ALIGN (f, 2);
5513 /* Costs. */
5515 /* Helper function for rtx cost calculation. Strip a shift expression
5516 from X. Returns the inner operand if successful, or the original
5517 expression on failure. */
5518 static rtx
5519 aarch64_strip_shift (rtx x)
5521 rtx op = x;
5523 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5524 we can convert both to ROR during final output. */
5525 if ((GET_CODE (op) == ASHIFT
5526 || GET_CODE (op) == ASHIFTRT
5527 || GET_CODE (op) == LSHIFTRT
5528 || GET_CODE (op) == ROTATERT
5529 || GET_CODE (op) == ROTATE)
5530 && CONST_INT_P (XEXP (op, 1)))
5531 return XEXP (op, 0);
5533 if (GET_CODE (op) == MULT
5534 && CONST_INT_P (XEXP (op, 1))
5535 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5536 return XEXP (op, 0);
5538 return x;
5541 /* Helper function for rtx cost calculation. Strip an extend
5542 expression from X. Returns the inner operand if successful, or the
5543 original expression on failure. We deal with a number of possible
5544 canonicalization variations here. */
5545 static rtx
5546 aarch64_strip_extend (rtx x)
5548 rtx op = x;
5550 /* Zero and sign extraction of a widened value. */
5551 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5552 && XEXP (op, 2) == const0_rtx
5553 && GET_CODE (XEXP (op, 0)) == MULT
5554 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5555 XEXP (op, 1)))
5556 return XEXP (XEXP (op, 0), 0);
5558 /* It can also be represented (for zero-extend) as an AND with an
5559 immediate. */
5560 if (GET_CODE (op) == AND
5561 && GET_CODE (XEXP (op, 0)) == MULT
5562 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5563 && CONST_INT_P (XEXP (op, 1))
5564 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5565 INTVAL (XEXP (op, 1))) != 0)
5566 return XEXP (XEXP (op, 0), 0);
5568 /* Now handle extended register, as this may also have an optional
5569 left shift by 1..4. */
5570 if (GET_CODE (op) == ASHIFT
5571 && CONST_INT_P (XEXP (op, 1))
5572 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5573 op = XEXP (op, 0);
5575 if (GET_CODE (op) == ZERO_EXTEND
5576 || GET_CODE (op) == SIGN_EXTEND)
5577 op = XEXP (op, 0);
5579 if (op != x)
5580 return op;
5582 return x;
5585 /* Return true iff CODE is a shift supported in combination
5586 with arithmetic instructions. */
5588 static bool
5589 aarch64_shift_p (enum rtx_code code)
5591 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5594 /* Helper function for rtx cost calculation. Calculate the cost of
5595 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5596 Return the calculated cost of the expression, recursing manually in to
5597 operands where needed. */
5599 static int
5600 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5602 rtx op0, op1;
5603 const struct cpu_cost_table *extra_cost
5604 = aarch64_tune_params.insn_extra_cost;
5605 int cost = 0;
5606 bool compound_p = (outer == PLUS || outer == MINUS);
5607 machine_mode mode = GET_MODE (x);
5609 gcc_checking_assert (code == MULT);
5611 op0 = XEXP (x, 0);
5612 op1 = XEXP (x, 1);
5614 if (VECTOR_MODE_P (mode))
5615 mode = GET_MODE_INNER (mode);
5617 /* Integer multiply/fma. */
5618 if (GET_MODE_CLASS (mode) == MODE_INT)
5620 /* The multiply will be canonicalized as a shift, cost it as such. */
5621 if (aarch64_shift_p (GET_CODE (x))
5622 || (CONST_INT_P (op1)
5623 && exact_log2 (INTVAL (op1)) > 0))
5625 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5626 || GET_CODE (op0) == SIGN_EXTEND;
5627 if (speed)
5629 if (compound_p)
5631 if (REG_P (op1))
5632 /* ARITH + shift-by-register. */
5633 cost += extra_cost->alu.arith_shift_reg;
5634 else if (is_extend)
5635 /* ARITH + extended register. We don't have a cost field
5636 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5637 cost += extra_cost->alu.extend_arith;
5638 else
5639 /* ARITH + shift-by-immediate. */
5640 cost += extra_cost->alu.arith_shift;
5642 else
5643 /* LSL (immediate). */
5644 cost += extra_cost->alu.shift;
5647 /* Strip extends as we will have costed them in the case above. */
5648 if (is_extend)
5649 op0 = aarch64_strip_extend (op0);
5651 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5653 return cost;
5656 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5657 compound and let the below cases handle it. After all, MNEG is a
5658 special-case alias of MSUB. */
5659 if (GET_CODE (op0) == NEG)
5661 op0 = XEXP (op0, 0);
5662 compound_p = true;
5665 /* Integer multiplies or FMAs have zero/sign extending variants. */
5666 if ((GET_CODE (op0) == ZERO_EXTEND
5667 && GET_CODE (op1) == ZERO_EXTEND)
5668 || (GET_CODE (op0) == SIGN_EXTEND
5669 && GET_CODE (op1) == SIGN_EXTEND))
5671 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5672 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5674 if (speed)
5676 if (compound_p)
5677 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5678 cost += extra_cost->mult[0].extend_add;
5679 else
5680 /* MUL/SMULL/UMULL. */
5681 cost += extra_cost->mult[0].extend;
5684 return cost;
5687 /* This is either an integer multiply or a MADD. In both cases
5688 we want to recurse and cost the operands. */
5689 cost += rtx_cost (op0, mode, MULT, 0, speed);
5690 cost += rtx_cost (op1, mode, MULT, 1, speed);
5692 if (speed)
5694 if (compound_p)
5695 /* MADD/MSUB. */
5696 cost += extra_cost->mult[mode == DImode].add;
5697 else
5698 /* MUL. */
5699 cost += extra_cost->mult[mode == DImode].simple;
5702 return cost;
5704 else
5706 if (speed)
5708 /* Floating-point FMA/FMUL can also support negations of the
5709 operands, unless the rounding mode is upward or downward in
5710 which case FNMUL is different than FMUL with operand negation. */
5711 bool neg0 = GET_CODE (op0) == NEG;
5712 bool neg1 = GET_CODE (op1) == NEG;
5713 if (compound_p || !flag_rounding_math || (neg0 && neg1))
5715 if (neg0)
5716 op0 = XEXP (op0, 0);
5717 if (neg1)
5718 op1 = XEXP (op1, 0);
5721 if (compound_p)
5722 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5723 cost += extra_cost->fp[mode == DFmode].fma;
5724 else
5725 /* FMUL/FNMUL. */
5726 cost += extra_cost->fp[mode == DFmode].mult;
5729 cost += rtx_cost (op0, mode, MULT, 0, speed);
5730 cost += rtx_cost (op1, mode, MULT, 1, speed);
5731 return cost;
5735 static int
5736 aarch64_address_cost (rtx x,
5737 machine_mode mode,
5738 addr_space_t as ATTRIBUTE_UNUSED,
5739 bool speed)
5741 enum rtx_code c = GET_CODE (x);
5742 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5743 struct aarch64_address_info info;
5744 int cost = 0;
5745 info.shift = 0;
5747 if (!aarch64_classify_address (&info, x, mode, c, false))
5749 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5751 /* This is a CONST or SYMBOL ref which will be split
5752 in a different way depending on the code model in use.
5753 Cost it through the generic infrastructure. */
5754 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5755 /* Divide through by the cost of one instruction to
5756 bring it to the same units as the address costs. */
5757 cost_symbol_ref /= COSTS_N_INSNS (1);
5758 /* The cost is then the cost of preparing the address,
5759 followed by an immediate (possibly 0) offset. */
5760 return cost_symbol_ref + addr_cost->imm_offset;
5762 else
5764 /* This is most likely a jump table from a case
5765 statement. */
5766 return addr_cost->register_offset;
5770 switch (info.type)
5772 case ADDRESS_LO_SUM:
5773 case ADDRESS_SYMBOLIC:
5774 case ADDRESS_REG_IMM:
5775 cost += addr_cost->imm_offset;
5776 break;
5778 case ADDRESS_REG_WB:
5779 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5780 cost += addr_cost->pre_modify;
5781 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5782 cost += addr_cost->post_modify;
5783 else
5784 gcc_unreachable ();
5786 break;
5788 case ADDRESS_REG_REG:
5789 cost += addr_cost->register_offset;
5790 break;
5792 case ADDRESS_REG_SXTW:
5793 cost += addr_cost->register_sextend;
5794 break;
5796 case ADDRESS_REG_UXTW:
5797 cost += addr_cost->register_zextend;
5798 break;
5800 default:
5801 gcc_unreachable ();
5805 if (info.shift > 0)
5807 /* For the sake of calculating the cost of the shifted register
5808 component, we can treat same sized modes in the same way. */
5809 switch (GET_MODE_BITSIZE (mode))
5811 case 16:
5812 cost += addr_cost->addr_scale_costs.hi;
5813 break;
5815 case 32:
5816 cost += addr_cost->addr_scale_costs.si;
5817 break;
5819 case 64:
5820 cost += addr_cost->addr_scale_costs.di;
5821 break;
5823 /* We can't tell, or this is a 128-bit vector. */
5824 default:
5825 cost += addr_cost->addr_scale_costs.ti;
5826 break;
5830 return cost;
5833 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5834 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5835 to be taken. */
5838 aarch64_branch_cost (bool speed_p, bool predictable_p)
5840 /* When optimizing for speed, use the cost of unpredictable branches. */
5841 const struct cpu_branch_cost *branch_costs =
5842 aarch64_tune_params.branch_costs;
5844 if (!speed_p || predictable_p)
5845 return branch_costs->predictable;
5846 else
5847 return branch_costs->unpredictable;
5850 /* Return true if the RTX X in mode MODE is a zero or sign extract
5851 usable in an ADD or SUB (extended register) instruction. */
5852 static bool
5853 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5855 /* Catch add with a sign extract.
5856 This is add_<optab><mode>_multp2. */
5857 if (GET_CODE (x) == SIGN_EXTRACT
5858 || GET_CODE (x) == ZERO_EXTRACT)
5860 rtx op0 = XEXP (x, 0);
5861 rtx op1 = XEXP (x, 1);
5862 rtx op2 = XEXP (x, 2);
5864 if (GET_CODE (op0) == MULT
5865 && CONST_INT_P (op1)
5866 && op2 == const0_rtx
5867 && CONST_INT_P (XEXP (op0, 1))
5868 && aarch64_is_extend_from_extract (mode,
5869 XEXP (op0, 1),
5870 op1))
5872 return true;
5875 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5876 No shift. */
5877 else if (GET_CODE (x) == SIGN_EXTEND
5878 || GET_CODE (x) == ZERO_EXTEND)
5879 return REG_P (XEXP (x, 0));
5881 return false;
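/* Illustrative matches: (sign_extract:DI (mult:DI (reg:DI) (const_int 4))
   (const_int 34) (const_int 0)) is the canonical form of a sign-extended
   word shifted left by 2, as used by ADD Xd, Xn, Wm, SXTW #2, while
   (zero_extend:DI (reg:SI)) corresponds to the unshifted
   ADD Xd, Xn, Wm, UXTW form.  */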
5884 static bool
5885 aarch64_frint_unspec_p (unsigned int u)
5887 switch (u)
5889 case UNSPEC_FRINTZ:
5890 case UNSPEC_FRINTP:
5891 case UNSPEC_FRINTM:
5892 case UNSPEC_FRINTA:
5893 case UNSPEC_FRINTN:
5894 case UNSPEC_FRINTX:
5895 case UNSPEC_FRINTI:
5896 return true;
5898 default:
5899 return false;
5903 /* Return true iff X is an rtx that will match an extr instruction
5904 i.e. as described in the *extr<mode>5_insn family of patterns.
5905 OP0 and OP1 will be set to the operands of the shifts involved
5906 on success and will be NULL_RTX otherwise. */
5908 static bool
5909 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5911 rtx op0, op1;
5912 machine_mode mode = GET_MODE (x);
5914 *res_op0 = NULL_RTX;
5915 *res_op1 = NULL_RTX;
5917 if (GET_CODE (x) != IOR)
5918 return false;
5920 op0 = XEXP (x, 0);
5921 op1 = XEXP (x, 1);
5923 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5924 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5926 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5927 if (GET_CODE (op1) == ASHIFT)
5928 std::swap (op0, op1);
5930 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5931 return false;
5933 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5934 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5936 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5937 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5939 *res_op0 = XEXP (op0, 0);
5940 *res_op1 = XEXP (op1, 0);
5941 return true;
5945 return false;
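/* Illustrative match: in DImode, (ior:DI (ashift:DI (reg x1) (const_int 16))
   (lshiftrt:DI (reg x2) (const_int 48))) satisfies 16 + 48 == 64, so the two
   inner registers are returned; the operation can be carried out as
   EXTR Xd, X1, X2, #48, i.e. an extract starting at bit 48 of the X1:X2
   concatenation.  */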
5948 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5949 storing it in *COST. Result is true if the total cost of the operation
5950 has now been calculated. */
5951 static bool
5952 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5954 rtx inner;
5955 rtx comparator;
5956 enum rtx_code cmpcode;
5958 if (COMPARISON_P (op0))
5960 inner = XEXP (op0, 0);
5961 comparator = XEXP (op0, 1);
5962 cmpcode = GET_CODE (op0);
5964 else
5966 inner = op0;
5967 comparator = const0_rtx;
5968 cmpcode = NE;
5971 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5973 /* Conditional branch. */
5974 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5975 return true;
5976 else
5978 if (cmpcode == NE || cmpcode == EQ)
5980 if (comparator == const0_rtx)
5982 /* TBZ/TBNZ/CBZ/CBNZ. */
5983 if (GET_CODE (inner) == ZERO_EXTRACT)
5984 /* TBZ/TBNZ. */
5985 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
5986 ZERO_EXTRACT, 0, speed);
5987 else
5988 /* CBZ/CBNZ. */
5989 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
5991 return true;
5994 else if (cmpcode == LT || cmpcode == GE)
5996 /* TBZ/TBNZ. */
5997 if (comparator == const0_rtx)
5998 return true;
6002 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6004 /* CCMP. */
6005 if (GET_CODE (op1) == COMPARE)
6007 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6008 if (XEXP (op1, 1) == const0_rtx)
6009 *cost += 1;
6010 if (speed)
6012 machine_mode mode = GET_MODE (XEXP (op1, 0));
6013 const struct cpu_cost_table *extra_cost
6014 = aarch64_tune_params.insn_extra_cost;
6016 if (GET_MODE_CLASS (mode) == MODE_INT)
6017 *cost += extra_cost->alu.arith;
6018 else
6019 *cost += extra_cost->fp[mode == DFmode].compare;
6021 return true;
6024 /* It's a conditional operation based on the status flags,
6025 so it must be some flavor of CSEL. */
6027 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6028 if (GET_CODE (op1) == NEG
6029 || GET_CODE (op1) == NOT
6030 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6031 op1 = XEXP (op1, 0);
6032 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6034 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6035 op1 = XEXP (op1, 0);
6036 op2 = XEXP (op2, 0);
6039 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6040 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6041 return true;
6044 /* We don't know what this is, cost all operands. */
6045 return false;
6048 /* Check whether X is a bitfield operation of the form shift + extend that
6049 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6050 operand to which the bitfield operation is applied. Otherwise return
6051 NULL_RTX. */
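/* For example (roughly, with illustrative operands),
   (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3))) maps to a UBFX
   and (sign_extend:SI (ashift:QI (reg:QI x) (const_int 2))) maps to an
   SBFIZ; in both cases the inner register x is returned.  */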
6053 static rtx
6054 aarch64_extend_bitfield_pattern_p (rtx x)
6056 rtx_code outer_code = GET_CODE (x);
6057 machine_mode outer_mode = GET_MODE (x);
6059 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6060 && outer_mode != SImode && outer_mode != DImode)
6061 return NULL_RTX;
6063 rtx inner = XEXP (x, 0);
6064 rtx_code inner_code = GET_CODE (inner);
6065 machine_mode inner_mode = GET_MODE (inner);
6066 rtx op = NULL_RTX;
6068 switch (inner_code)
6070 case ASHIFT:
6071 if (CONST_INT_P (XEXP (inner, 1))
6072 && (inner_mode == QImode || inner_mode == HImode))
6073 op = XEXP (inner, 0);
6074 break;
6075 case LSHIFTRT:
6076 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6077 && (inner_mode == QImode || inner_mode == HImode))
6078 op = XEXP (inner, 0);
6079 break;
6080 case ASHIFTRT:
6081 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6082 && (inner_mode == QImode || inner_mode == HImode))
6083 op = XEXP (inner, 0);
6084 break;
6085 default:
6086 break;
6089 return op;
6092 /* Return true if the mask and a shift amount from an RTX of the form
6093 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6094 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
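/* For example, in SImode MASK = 0x00ffff00 with SHFT_AMNT = 8 is accepted:
   (MASK >> 8) + 1 == 0x10000 is a power of two and no mask bits lie below
   the shift amount, so (x << 8) & 0x00ffff00 can become
   "ubfiz w0, w1, #8, #16".  A mask like 0x00ffff01 is rejected because it
   has bits set below the shift.  */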
6096 bool
6097 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6099 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6100 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6101 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6102 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6105 /* Calculate the cost of calculating X, storing it in *COST. Result
6106 is true if the total cost of the operation has now been calculated. */
6107 static bool
6108 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6109 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6111 rtx op0, op1, op2;
6112 const struct cpu_cost_table *extra_cost
6113 = aarch64_tune_params.insn_extra_cost;
6114 int code = GET_CODE (x);
6116 /* By default, assume that everything has equivalent cost to the
6117 cheapest instruction. Any additional costs are applied as a delta
6118 above this default. */
6119 *cost = COSTS_N_INSNS (1);
6121 switch (code)
6123 case SET:
6124 /* The cost depends entirely on the operands to SET. */
6125 *cost = 0;
6126 op0 = SET_DEST (x);
6127 op1 = SET_SRC (x);
6129 switch (GET_CODE (op0))
6131 case MEM:
6132 if (speed)
6134 rtx address = XEXP (op0, 0);
6135 if (VECTOR_MODE_P (mode))
6136 *cost += extra_cost->ldst.storev;
6137 else if (GET_MODE_CLASS (mode) == MODE_INT)
6138 *cost += extra_cost->ldst.store;
6139 else if (mode == SFmode)
6140 *cost += extra_cost->ldst.storef;
6141 else if (mode == DFmode)
6142 *cost += extra_cost->ldst.stored;
6144 *cost +=
6145 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6146 0, speed));
6149 *cost += rtx_cost (op1, mode, SET, 1, speed);
6150 return true;
6152 case SUBREG:
6153 if (! REG_P (SUBREG_REG (op0)))
6154 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6156 /* Fall through. */
6157 case REG:
6158 /* The cost is one per vector-register copied. */
6159 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6161 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6162 / GET_MODE_SIZE (V4SImode);
6163 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6165 /* const0_rtx is in general free, but we will use an
6166 instruction to set a register to 0. */
6167 else if (REG_P (op1) || op1 == const0_rtx)
6169 /* The cost is 1 per register copied. */
6170 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6171 / UNITS_PER_WORD;
6172 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6174 else
6175 /* Cost is just the cost of the RHS of the set. */
6176 *cost += rtx_cost (op1, mode, SET, 1, speed);
6177 return true;
6179 case ZERO_EXTRACT:
6180 case SIGN_EXTRACT:
6181 /* Bit-field insertion. Strip any redundant widening of
6182 the RHS to meet the width of the target. */
6183 if (GET_CODE (op1) == SUBREG)
6184 op1 = SUBREG_REG (op1);
6185 if ((GET_CODE (op1) == ZERO_EXTEND
6186 || GET_CODE (op1) == SIGN_EXTEND)
6187 && CONST_INT_P (XEXP (op0, 1))
6188 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6189 >= INTVAL (XEXP (op0, 1))))
6190 op1 = XEXP (op1, 0);
6192 if (CONST_INT_P (op1))
6194 /* MOV immediate is assumed to always be cheap. */
6195 *cost = COSTS_N_INSNS (1);
6197 else
6199 /* BFM. */
6200 if (speed)
6201 *cost += extra_cost->alu.bfi;
6202 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6205 return true;
6207 default:
6208 /* We can't make sense of this, assume default cost. */
6209 *cost = COSTS_N_INSNS (1);
6210 return false;
6212 return false;
6214 case CONST_INT:
6215 /* If an instruction can incorporate a constant within the
6216 instruction, the instruction's expression avoids calling
6217 rtx_cost() on the constant. If rtx_cost() is called on a
6218 constant, then it is usually because the constant must be
6219 moved into a register by one or more instructions.
6221 The exception is constant 0, which can be expressed
6222 as XZR/WZR and is therefore free. The exception to this is
6223 if we have (set (reg) (const0_rtx)) in which case we must cost
6224 the move. However, we can catch that when we cost the SET, so
6225 we don't need to consider that here. */
6226 if (x == const0_rtx)
6227 *cost = 0;
6228 else
6230 /* To a first approximation, the cost of building any other
6231 constant is proportional to the number of instructions
6232 required to build that constant. This is true whether we
6233 are compiling for SPEED or otherwise. */
6234 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6235 (NULL_RTX, x, false, mode));
6237 return true;
6239 case CONST_DOUBLE:
6240 if (speed)
6242 /* mov[df,sf]_aarch64. */
6243 if (aarch64_float_const_representable_p (x))
6244 /* FMOV (scalar immediate). */
6245 *cost += extra_cost->fp[mode == DFmode].fpconst;
6246 else if (!aarch64_float_const_zero_rtx_p (x))
6248 /* This will be a load from memory. */
6249 if (mode == DFmode)
6250 *cost += extra_cost->ldst.loadd;
6251 else
6252 *cost += extra_cost->ldst.loadf;
6254 else
6255 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6256 or MOV v0.s[0], wzr - neither of which is modeled by the
6257 cost tables. Just use the default cost. */
6262 return true;
6264 case MEM:
6265 if (speed)
6267 /* For loads we want the base cost of a load, plus an
6268 approximation for the additional cost of the addressing
6269 mode. */
6270 rtx address = XEXP (x, 0);
6271 if (VECTOR_MODE_P (mode))
6272 *cost += extra_cost->ldst.loadv;
6273 else if (GET_MODE_CLASS (mode) == MODE_INT)
6274 *cost += extra_cost->ldst.load;
6275 else if (mode == SFmode)
6276 *cost += extra_cost->ldst.loadf;
6277 else if (mode == DFmode)
6278 *cost += extra_cost->ldst.loadd;
6280 *cost +=
6281 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6282 0, speed));
6285 return true;
6287 case NEG:
6288 op0 = XEXP (x, 0);
6290 if (VECTOR_MODE_P (mode))
6292 if (speed)
6294 /* FNEG. */
6295 *cost += extra_cost->vect.alu;
6297 return false;
6300 if (GET_MODE_CLASS (mode) == MODE_INT)
6302 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6303 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6305 /* CSETM. */
6306 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6307 return true;
6310 /* Cost this as SUB wzr, X. */
6311 op0 = CONST0_RTX (mode);
6312 op1 = XEXP (x, 0);
6313 goto cost_minus;
6316 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6318 /* Support (neg(fma...)) as a single instruction only if
6319 sign of zeros is unimportant. This matches the decision
6320 making in aarch64.md. */
6321 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6323 /* FNMADD. */
6324 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6325 return true;
6327 if (GET_CODE (op0) == MULT)
6329 /* FNMUL. */
6330 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6331 return true;
6333 if (speed)
6334 /* FNEG. */
6335 *cost += extra_cost->fp[mode == DFmode].neg;
6336 return false;
6339 return false;
6341 case CLRSB:
6342 case CLZ:
6343 if (speed)
6345 if (VECTOR_MODE_P (mode))
6346 *cost += extra_cost->vect.alu;
6347 else
6348 *cost += extra_cost->alu.clz;
6351 return false;
6353 case COMPARE:
6354 op0 = XEXP (x, 0);
6355 op1 = XEXP (x, 1);
6357 if (op1 == const0_rtx
6358 && GET_CODE (op0) == AND)
6360 x = op0;
6361 mode = GET_MODE (op0);
6362 goto cost_logic;
6365 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6367 /* TODO: A write to the CC flags possibly costs extra, this
6368 needs encoding in the cost tables. */
6370 mode = GET_MODE (op0);
6371 /* ANDS. */
6372 if (GET_CODE (op0) == AND)
6374 x = op0;
6375 goto cost_logic;
6378 if (GET_CODE (op0) == PLUS)
6380 /* ADDS (and CMN alias). */
6381 x = op0;
6382 goto cost_plus;
6385 if (GET_CODE (op0) == MINUS)
6387 /* SUBS. */
6388 x = op0;
6389 goto cost_minus;
6392 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6393 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6394 && CONST_INT_P (XEXP (op0, 2)))
6396 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6397 Handle it here directly rather than going to cost_logic
6398 since we know the immediate generated for the TST is valid
6399 so we can avoid creating an intermediate rtx for it only
6400 for costing purposes. */
6401 if (speed)
6402 *cost += extra_cost->alu.logical;
6404 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6405 ZERO_EXTRACT, 0, speed);
6406 return true;
6409 if (GET_CODE (op1) == NEG)
6411 /* CMN. */
6412 if (speed)
6413 *cost += extra_cost->alu.arith;
6415 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6416 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6417 return true;
6420 /* CMP.
6422 Compare can freely swap the order of operands, and
6423 canonicalization puts the more complex operation first.
6424 But the integer MINUS logic expects the shift/extend
6425 operation in op1. */
6426 if (! (REG_P (op0)
6427 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6429 op0 = XEXP (x, 1);
6430 op1 = XEXP (x, 0);
6432 goto cost_minus;
6435 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6437 /* FCMP. */
6438 if (speed)
6439 *cost += extra_cost->fp[mode == DFmode].compare;
6441 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6443 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6444 /* FCMP supports constant 0.0 for no extra cost. */
6445 return true;
6447 return false;
6450 if (VECTOR_MODE_P (mode))
6452 /* Vector compare. */
6453 if (speed)
6454 *cost += extra_cost->vect.alu;
6456 if (aarch64_float_const_zero_rtx_p (op1))
6458 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6459 cost. */
6460 return true;
6462 return false;
6464 return false;
6466 case MINUS:
6468 op0 = XEXP (x, 0);
6469 op1 = XEXP (x, 1);
6471 cost_minus:
6472 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6474 /* Detect valid immediates. */
6475 if ((GET_MODE_CLASS (mode) == MODE_INT
6476 || (GET_MODE_CLASS (mode) == MODE_CC
6477 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6478 && CONST_INT_P (op1)
6479 && aarch64_uimm12_shift (INTVAL (op1)))
6481 if (speed)
6482 /* SUB(S) (immediate). */
6483 *cost += extra_cost->alu.arith;
6484 return true;
6487 /* Look for SUB (extended register). */
6488 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6490 if (speed)
6491 *cost += extra_cost->alu.extend_arith;
6493 op1 = aarch64_strip_extend (op1);
6494 *cost += rtx_cost (op1, VOIDmode,
6495 (enum rtx_code) GET_CODE (op1), 0, speed);
6496 return true;
6499 rtx new_op1 = aarch64_strip_extend (op1);
6501 /* Cost this as an FMA-alike operation. */
6502 if ((GET_CODE (new_op1) == MULT
6503 || aarch64_shift_p (GET_CODE (new_op1)))
6504 && code != COMPARE)
6506 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6507 (enum rtx_code) code,
6508 speed);
6509 return true;
6512 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6514 if (speed)
6516 if (VECTOR_MODE_P (mode))
6518 /* Vector SUB. */
6519 *cost += extra_cost->vect.alu;
6521 else if (GET_MODE_CLASS (mode) == MODE_INT)
6523 /* SUB(S). */
6524 *cost += extra_cost->alu.arith;
6526 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6528 /* FSUB. */
6529 *cost += extra_cost->fp[mode == DFmode].addsub;
6532 return true;
6535 case PLUS:
6537 rtx new_op0;
6539 op0 = XEXP (x, 0);
6540 op1 = XEXP (x, 1);
6542 cost_plus:
6543 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6544 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6546 /* CSINC. */
6547 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6548 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6549 return true;
6552 if (GET_MODE_CLASS (mode) == MODE_INT
6553 && CONST_INT_P (op1)
6554 && aarch64_uimm12_shift (INTVAL (op1)))
6556 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6558 if (speed)
6559 /* ADD (immediate). */
6560 *cost += extra_cost->alu.arith;
6561 return true;
6564 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6566 /* Look for ADD (extended register). */
6567 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6569 if (speed)
6570 *cost += extra_cost->alu.extend_arith;
6572 op0 = aarch64_strip_extend (op0);
6573 *cost += rtx_cost (op0, VOIDmode,
6574 (enum rtx_code) GET_CODE (op0), 0, speed);
6575 return true;
6578 /* Strip any extend, leave shifts behind as we will
6579 cost them through mult_cost. */
6580 new_op0 = aarch64_strip_extend (op0);
6582 if (GET_CODE (new_op0) == MULT
6583 || aarch64_shift_p (GET_CODE (new_op0)))
6585 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6586 speed);
6587 return true;
6590 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6592 if (speed)
6594 if (VECTOR_MODE_P (mode))
6596 /* Vector ADD. */
6597 *cost += extra_cost->vect.alu;
6599 else if (GET_MODE_CLASS (mode) == MODE_INT)
6601 /* ADD. */
6602 *cost += extra_cost->alu.arith;
6604 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6606 /* FADD. */
6607 *cost += extra_cost->fp[mode == DFmode].addsub;
6610 return true;
6613 case BSWAP:
6614 *cost = COSTS_N_INSNS (1);
6616 if (speed)
6618 if (VECTOR_MODE_P (mode))
6619 *cost += extra_cost->vect.alu;
6620 else
6621 *cost += extra_cost->alu.rev;
6623 return false;
6625 case IOR:
6626 if (aarch_rev16_p (x))
6628 *cost = COSTS_N_INSNS (1);
6630 if (speed)
6632 if (VECTOR_MODE_P (mode))
6633 *cost += extra_cost->vect.alu;
6634 else
6635 *cost += extra_cost->alu.rev;
6637 return true;
6640 if (aarch64_extr_rtx_p (x, &op0, &op1))
6642 *cost += rtx_cost (op0, mode, IOR, 0, speed);
6643 *cost += rtx_cost (op1, mode, IOR, 1, speed);
6644 if (speed)
6645 *cost += extra_cost->alu.shift;
6647 return true;
6649 /* Fall through. */
6650 case XOR:
6651 case AND:
6652 cost_logic:
6653 op0 = XEXP (x, 0);
6654 op1 = XEXP (x, 1);
6656 if (VECTOR_MODE_P (mode))
6658 if (speed)
6659 *cost += extra_cost->vect.alu;
6660 return true;
6663 if (code == AND
6664 && GET_CODE (op0) == MULT
6665 && CONST_INT_P (XEXP (op0, 1))
6666 && CONST_INT_P (op1)
6667 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6668 INTVAL (op1)) != 0)
6670 /* This is a UBFM/SBFM. */
6671 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6672 if (speed)
6673 *cost += extra_cost->alu.bfx;
6674 return true;
6677 if (GET_MODE_CLASS (mode) == MODE_INT)
6679 if (CONST_INT_P (op1))
6681 /* We have a mask + shift version of a UBFIZ
6682 i.e. the *andim_ashift<mode>_bfiz pattern. */
6683 if (GET_CODE (op0) == ASHIFT
6684 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
6685 XEXP (op0, 1)))
6687 *cost += rtx_cost (XEXP (op0, 0), mode,
6688 (enum rtx_code) code, 0, speed);
6689 if (speed)
6690 *cost += extra_cost->alu.bfx;
6692 return true;
6694 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
6696 /* We possibly get the immediate for free, this is not
6697 modelled. */
6698 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6699 if (speed)
6700 *cost += extra_cost->alu.logical;
6702 return true;
6705 else
6707 rtx new_op0 = op0;
6709 /* Handle ORN, EON, or BIC. */
6710 if (GET_CODE (op0) == NOT)
6711 op0 = XEXP (op0, 0);
6713 new_op0 = aarch64_strip_shift (op0);
6715 /* If we had a shift on op0 then this is a logical-shift-
6716 by-register/immediate operation. Otherwise, this is just
6717 a logical operation. */
6718 if (speed)
6720 if (new_op0 != op0)
6722 /* Shift by immediate. */
6723 if (CONST_INT_P (XEXP (op0, 1)))
6724 *cost += extra_cost->alu.log_shift;
6725 else
6726 *cost += extra_cost->alu.log_shift_reg;
6728 else
6729 *cost += extra_cost->alu.logical;
6732 /* In both cases we want to cost both operands. */
6733 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6734 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6736 return true;
6739 return false;
6741 case NOT:
6742 x = XEXP (x, 0);
6743 op0 = aarch64_strip_shift (x);
6745 if (VECTOR_MODE_P (mode))
6747 /* Vector NOT. */
6748 *cost += extra_cost->vect.alu;
6749 return false;
6752 /* MVN-shifted-reg. */
6753 if (op0 != x)
6755 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6757 if (speed)
6758 *cost += extra_cost->alu.log_shift;
6760 return true;
6762 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6763 Handle the second form here taking care that 'a' in the above can
6764 be a shift. */
6765 else if (GET_CODE (op0) == XOR)
6767 rtx newop0 = XEXP (op0, 0);
6768 rtx newop1 = XEXP (op0, 1);
6769 rtx op0_stripped = aarch64_strip_shift (newop0);
6771 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6772 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6774 if (speed)
6776 if (op0_stripped != newop0)
6777 *cost += extra_cost->alu.log_shift;
6778 else
6779 *cost += extra_cost->alu.logical;
6782 return true;
6784 /* MVN. */
6785 if (speed)
6786 *cost += extra_cost->alu.logical;
6788 return false;
6790 case ZERO_EXTEND:
6792 op0 = XEXP (x, 0);
6793 /* If a value is written in SI mode, then zero extended to DI
6794 mode, the operation will in general be free as a write to
6795 a 'w' register implicitly zeroes the upper bits of an 'x'
6796 register. However, if this is
6798 (set (reg) (zero_extend (reg)))
6800 we must cost the explicit register move. */
6801 if (mode == DImode
6802 && GET_MODE (op0) == SImode
6803 && outer == SET)
6805 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6807 /* If OP_COST is non-zero, then the cost of the zero extend
6808 is effectively the cost of the inner operation. Otherwise
6809 we have a MOV instruction and we take the cost from the MOV
6810 itself. This is true independently of whether we are
6811 optimizing for space or time. */
6812 if (op_cost)
6813 *cost = op_cost;
6815 return true;
6817 else if (MEM_P (op0))
6819 /* All loads can zero extend to any size for free. */
6820 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6821 return true;
6824 op0 = aarch64_extend_bitfield_pattern_p (x);
6825 if (op0)
6827 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
6828 if (speed)
6829 *cost += extra_cost->alu.bfx;
6830 return true;
6833 if (speed)
6835 if (VECTOR_MODE_P (mode))
6837 /* UMOV. */
6838 *cost += extra_cost->vect.alu;
6840 else
6842 /* We generate an AND instead of UXTB/UXTH. */
6843 *cost += extra_cost->alu.logical;
6846 return false;
6848 case SIGN_EXTEND:
6849 if (MEM_P (XEXP (x, 0)))
6851 /* LDRSH. */
6852 if (speed)
6854 rtx address = XEXP (XEXP (x, 0), 0);
6855 *cost += extra_cost->ldst.load_sign_extend;
6857 *cost +=
6858 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6859 0, speed));
6861 return true;
6864 op0 = aarch64_extend_bitfield_pattern_p (x);
6865 if (op0)
6867 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
6868 if (speed)
6869 *cost += extra_cost->alu.bfx;
6870 return true;
6873 if (speed)
6875 if (VECTOR_MODE_P (mode))
6876 *cost += extra_cost->vect.alu;
6877 else
6878 *cost += extra_cost->alu.extend;
6880 return false;
6882 case ASHIFT:
6883 op0 = XEXP (x, 0);
6884 op1 = XEXP (x, 1);
6886 if (CONST_INT_P (op1))
6888 if (speed)
6890 if (VECTOR_MODE_P (mode))
6892 /* Vector shift (immediate). */
6893 *cost += extra_cost->vect.alu;
6895 else
6897 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6898 aliases. */
6899 *cost += extra_cost->alu.shift;
6903 /* We can incorporate zero/sign extend for free. */
6904 if (GET_CODE (op0) == ZERO_EXTEND
6905 || GET_CODE (op0) == SIGN_EXTEND)
6906 op0 = XEXP (op0, 0);
6908 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
6909 return true;
6911 else
6913 if (speed)
6915 if (VECTOR_MODE_P (mode))
6917 /* Vector shift (register). */
6918 *cost += extra_cost->vect.alu;
6920 else
6922 /* LSLV. */
6923 *cost += extra_cost->alu.shift_reg;
6926 return false; /* All arguments need to be in registers. */
6929 case ROTATE:
6930 case ROTATERT:
6931 case LSHIFTRT:
6932 case ASHIFTRT:
6933 op0 = XEXP (x, 0);
6934 op1 = XEXP (x, 1);
6936 if (CONST_INT_P (op1))
6938 /* ASR (immediate) and friends. */
6939 if (speed)
6941 if (VECTOR_MODE_P (mode))
6942 *cost += extra_cost->vect.alu;
6943 else
6944 *cost += extra_cost->alu.shift;
6947 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6948 return true;
6950 else
6953 /* ASR (register) and friends. */
6954 if (speed)
6956 if (VECTOR_MODE_P (mode))
6957 *cost += extra_cost->vect.alu;
6958 else
6959 *cost += extra_cost->alu.shift_reg;
6961 return false; /* All arguments need to be in registers. */
6964 case SYMBOL_REF:
6966 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
6967 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
6969 /* LDR. */
6970 if (speed)
6971 *cost += extra_cost->ldst.load;
6973 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6974 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6976 /* ADRP, followed by ADD. */
6977 *cost += COSTS_N_INSNS (1);
6978 if (speed)
6979 *cost += 2 * extra_cost->alu.arith;
6981 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6982 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6984 /* ADR. */
6985 if (speed)
6986 *cost += extra_cost->alu.arith;
6989 if (flag_pic)
6991 /* One extra load instruction, after accessing the GOT. */
6992 *cost += COSTS_N_INSNS (1);
6993 if (speed)
6994 *cost += extra_cost->ldst.load;
6996 return true;
6998 case HIGH:
6999 case LO_SUM:
7000 /* ADRP/ADD (immediate). */
7001 if (speed)
7002 *cost += extra_cost->alu.arith;
7003 return true;
7005 case ZERO_EXTRACT:
7006 case SIGN_EXTRACT:
7007 /* UBFX/SBFX. */
7008 if (speed)
7010 if (VECTOR_MODE_P (mode))
7011 *cost += extra_cost->vect.alu;
7012 else
7013 *cost += extra_cost->alu.bfx;
7016 /* We can trust that the immediates used will be correct (there
7017 are no by-register forms), so we need only cost op0. */
7018 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7019 return true;
7021 case MULT:
7022 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7023 /* aarch64_rtx_mult_cost always handles recursion to its
7024 operands. */
7025 return true;
7027 case MOD:
7028 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7029 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7030 an unconditional negate. This case should only ever be reached through
7031 the set_smod_pow2_cheap check in expmed.c. */
7032 if (CONST_INT_P (XEXP (x, 1))
7033 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7034 && (mode == SImode || mode == DImode))
7036 /* We expand to 4 instructions. Reset the baseline. */
7037 *cost = COSTS_N_INSNS (4);
7039 if (speed)
7040 *cost += 2 * extra_cost->alu.logical
7041 + 2 * extra_cost->alu.arith;
7043 return true;
7046 /* Fall-through. */
7047 case UMOD:
7048 if (speed)
7050 if (VECTOR_MODE_P (mode))
7051 *cost += extra_cost->vect.alu;
7052 else if (GET_MODE_CLASS (mode) == MODE_INT)
7053 *cost += (extra_cost->mult[mode == DImode].add
7054 + extra_cost->mult[mode == DImode].idiv);
7055 else if (mode == DFmode)
7056 *cost += (extra_cost->fp[1].mult
7057 + extra_cost->fp[1].div);
7058 else if (mode == SFmode)
7059 *cost += (extra_cost->fp[0].mult
7060 + extra_cost->fp[0].div);
7062 return false; /* All arguments need to be in registers. */
7064 case DIV:
7065 case UDIV:
7066 case SQRT:
7067 if (speed)
7069 if (VECTOR_MODE_P (mode))
7070 *cost += extra_cost->vect.alu;
7071 else if (GET_MODE_CLASS (mode) == MODE_INT)
7072 /* There is no integer SQRT, so only DIV and UDIV can get
7073 here. */
7074 *cost += extra_cost->mult[mode == DImode].idiv;
7075 else
7076 *cost += extra_cost->fp[mode == DFmode].div;
7078 return false; /* All arguments need to be in registers. */
7080 case IF_THEN_ELSE:
7081 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7082 XEXP (x, 2), cost, speed);
7084 case EQ:
7085 case NE:
7086 case GT:
7087 case GTU:
7088 case LT:
7089 case LTU:
7090 case GE:
7091 case GEU:
7092 case LE:
7093 case LEU:
7095 return false; /* All arguments must be in registers. */
7097 case FMA:
7098 op0 = XEXP (x, 0);
7099 op1 = XEXP (x, 1);
7100 op2 = XEXP (x, 2);
7102 if (speed)
7104 if (VECTOR_MODE_P (mode))
7105 *cost += extra_cost->vect.alu;
7106 else
7107 *cost += extra_cost->fp[mode == DFmode].fma;
7110 /* FMSUB, FNMADD, and FNMSUB are free. */
7111 if (GET_CODE (op0) == NEG)
7112 op0 = XEXP (op0, 0);
7114 if (GET_CODE (op2) == NEG)
7115 op2 = XEXP (op2, 0);
7117 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7118 and the by-element operand as operand 0. */
7119 if (GET_CODE (op1) == NEG)
7120 op1 = XEXP (op1, 0);
7122 /* Catch vector-by-element operations. The by-element operand can
7123 either be (vec_duplicate (vec_select (x))) or just
7124 (vec_select (x)), depending on whether we are multiplying by
7125 a vector or a scalar.
7127 Canonicalization is not very good in these cases: FMA4 will put the
7128 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7129 if (GET_CODE (op0) == VEC_DUPLICATE)
7130 op0 = XEXP (op0, 0);
7131 else if (GET_CODE (op1) == VEC_DUPLICATE)
7132 op1 = XEXP (op1, 0);
7134 if (GET_CODE (op0) == VEC_SELECT)
7135 op0 = XEXP (op0, 0);
7136 else if (GET_CODE (op1) == VEC_SELECT)
7137 op1 = XEXP (op1, 0);
7139 /* If the remaining parameters are not registers,
7140 get the cost to put them into registers. */
7141 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7142 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7143 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7144 return true;
7146 case FLOAT:
7147 case UNSIGNED_FLOAT:
7148 if (speed)
7149 *cost += extra_cost->fp[mode == DFmode].fromint;
7150 return false;
7152 case FLOAT_EXTEND:
7153 if (speed)
7155 if (VECTOR_MODE_P (mode))
7157 /* Vector extend. */
7158 *cost += extra_cost->vect.alu;
7160 else
7161 *cost += extra_cost->fp[mode == DFmode].widen;
7163 return false;
7165 case FLOAT_TRUNCATE:
7166 if (speed)
7168 if (VECTOR_MODE_P (mode))
7170 /* Vector truncate. */
7171 *cost += extra_cost->vect.alu;
7173 else
7174 *cost += extra_cost->fp[mode == DFmode].narrow;
7176 return false;
7178 case FIX:
7179 case UNSIGNED_FIX:
7180 x = XEXP (x, 0);
7181 /* Strip the rounding part. They will all be implemented
7182 by the fcvt* family of instructions anyway. */
7183 if (GET_CODE (x) == UNSPEC)
7185 unsigned int uns_code = XINT (x, 1);
7187 if (uns_code == UNSPEC_FRINTA
7188 || uns_code == UNSPEC_FRINTM
7189 || uns_code == UNSPEC_FRINTN
7190 || uns_code == UNSPEC_FRINTP
7191 || uns_code == UNSPEC_FRINTZ)
7192 x = XVECEXP (x, 0, 0);
7195 if (speed)
7197 if (VECTOR_MODE_P (mode))
7198 *cost += extra_cost->vect.alu;
7199 else
7200 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7203 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7204 fixed-point fcvt. */
7205 if (GET_CODE (x) == MULT
7206 && ((VECTOR_MODE_P (mode)
7207 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7208 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7210 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7211 0, speed);
7212 return true;
7215 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7216 return true;
7218 case ABS:
7219 if (VECTOR_MODE_P (mode))
7221 /* ABS (vector). */
7222 if (speed)
7223 *cost += extra_cost->vect.alu;
7225 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7227 op0 = XEXP (x, 0);
7229 /* FABD, which is analogous to FADD. */
7230 if (GET_CODE (op0) == MINUS)
7232 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7233 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7234 if (speed)
7235 *cost += extra_cost->fp[mode == DFmode].addsub;
7237 return true;
7239 /* Simple FABS is analogous to FNEG. */
7240 if (speed)
7241 *cost += extra_cost->fp[mode == DFmode].neg;
7243 else
7245 /* Integer ABS will either be split to
7246 two arithmetic instructions, or will be an ABS
7247 (scalar), which we don't model. */
7248 *cost = COSTS_N_INSNS (2);
7249 if (speed)
7250 *cost += 2 * extra_cost->alu.arith;
7252 return false;
7254 case SMAX:
7255 case SMIN:
7256 if (speed)
7258 if (VECTOR_MODE_P (mode))
7259 *cost += extra_cost->vect.alu;
7260 else
7262 /* FMAXNM/FMINNM/FMAX/FMIN.
7263 TODO: This may not be accurate for all implementations, but
7264 we do not model this in the cost tables. */
7265 *cost += extra_cost->fp[mode == DFmode].addsub;
7268 return false;
7270 case UNSPEC:
7271 /* The floating point round to integer frint* instructions. */
7272 if (aarch64_frint_unspec_p (XINT (x, 1)))
7274 if (speed)
7275 *cost += extra_cost->fp[mode == DFmode].roundint;
7277 return false;
7280 if (XINT (x, 1) == UNSPEC_RBIT)
7282 if (speed)
7283 *cost += extra_cost->alu.rev;
7285 return false;
7287 break;
7289 case TRUNCATE:
7291 /* Decompose <su>muldi3_highpart. */
7292 if (/* (truncate:DI */
7293 mode == DImode
7294 /* (lshiftrt:TI */
7295 && GET_MODE (XEXP (x, 0)) == TImode
7296 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7297 /* (mult:TI */
7298 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7299 /* (ANY_EXTEND:TI (reg:DI))
7300 (ANY_EXTEND:TI (reg:DI))) */
7301 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7302 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7303 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7304 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7305 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7306 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7307 /* (const_int 64) */
7308 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7309 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7311 /* UMULH/SMULH. */
7312 if (speed)
7313 *cost += extra_cost->mult[mode == DImode].extend;
7314 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7315 mode, MULT, 0, speed);
7316 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7317 mode, MULT, 1, speed);
7318 return true;
7321 /* Fall through. */
7322 default:
7323 break;
7326 if (dump_file && (dump_flags & TDF_DETAILS))
7327 fprintf (dump_file,
7328 "\nFailed to cost RTX. Assuming default cost.\n");
7330 return true;
7333 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
7334 calculated for X. This cost is stored in *COST. Returns true
7335 if the total cost of X was calculated. */
7336 static bool
7337 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7338 int param, int *cost, bool speed)
7340 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7342 if (dump_file && (dump_flags & TDF_DETAILS))
7344 print_rtl_single (dump_file, x);
7345 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7346 speed ? "Hot" : "Cold",
7347 *cost, result ? "final" : "partial");
7350 return result;
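/* A worked example of the register-move costs computed below: moving a
   128-bit value between two GENERAL_REGS is costed as 2 * GP2GP, since it
   needs two X-register moves, while without TARGET_SIMD a 128-bit
   FP-to-FP move is costed as GP2FP + FP2GP + FP2FP to account for the
   secondary-reload sequence through a general register.  */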
7353 static int
7354 aarch64_register_move_cost (machine_mode mode,
7355 reg_class_t from_i, reg_class_t to_i)
7357 enum reg_class from = (enum reg_class) from_i;
7358 enum reg_class to = (enum reg_class) to_i;
7359 const struct cpu_regmove_cost *regmove_cost
7360 = aarch64_tune_params.regmove_cost;
7362 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7363 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7364 to = GENERAL_REGS;
7366 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7367 from = GENERAL_REGS;
7369 /* Moving between GPR and stack cost is the same as GP2GP. */
7370 if ((from == GENERAL_REGS && to == STACK_REG)
7371 || (to == GENERAL_REGS && from == STACK_REG))
7372 return regmove_cost->GP2GP;
7374 /* To/From the stack register, we move via the gprs. */
7375 if (to == STACK_REG || from == STACK_REG)
7376 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7377 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7379 if (GET_MODE_SIZE (mode) == 16)
7381 /* 128-bit operations on general registers require 2 instructions. */
7382 if (from == GENERAL_REGS && to == GENERAL_REGS)
7383 return regmove_cost->GP2GP * 2;
7384 else if (from == GENERAL_REGS)
7385 return regmove_cost->GP2FP * 2;
7386 else if (to == GENERAL_REGS)
7387 return regmove_cost->FP2GP * 2;
7389 /* When AdvSIMD instructions are disabled it is not possible to move
7390 a 128-bit value directly between Q registers. This is handled in
7391 secondary reload. A general register is used as a scratch to move
7392 the upper DI value and the lower DI value is moved directly,
7393 hence the cost is the sum of three moves. */
7394 if (! TARGET_SIMD)
7395 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7397 return regmove_cost->FP2FP;
7400 if (from == GENERAL_REGS && to == GENERAL_REGS)
7401 return regmove_cost->GP2GP;
7402 else if (from == GENERAL_REGS)
7403 return regmove_cost->GP2FP;
7404 else if (to == GENERAL_REGS)
7405 return regmove_cost->FP2GP;
7407 return regmove_cost->FP2FP;
7410 static int
7411 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7412 reg_class_t rclass ATTRIBUTE_UNUSED,
7413 bool in ATTRIBUTE_UNUSED)
7415 return aarch64_tune_params.memmov_cost;
7418 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7419 to optimize 1.0/sqrt. */
7421 static bool
7422 use_rsqrt_p (machine_mode mode)
7424 return (!flag_trapping_math
7425 && flag_unsafe_math_optimizations
7426 && ((aarch64_tune_params.approx_modes->recip_sqrt
7427 & AARCH64_APPROX_MODE (mode))
7428 || flag_mrecip_low_precision_sqrt));
7431 /* Function to decide when to use the approximate reciprocal square root
7432 builtin. */
7434 static tree
7435 aarch64_builtin_reciprocal (tree fndecl)
7437 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7439 if (!use_rsqrt_p (mode))
7440 return NULL_TREE;
7441 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7444 typedef rtx (*rsqrte_type) (rtx, rtx);
7446 /* Select reciprocal square root initial estimate insn depending on machine
7447 mode. */
7449 static rsqrte_type
7450 get_rsqrte_type (machine_mode mode)
7452 switch (mode)
7454 case DFmode: return gen_aarch64_rsqrtedf;
7455 case SFmode: return gen_aarch64_rsqrtesf;
7456 case V2DFmode: return gen_aarch64_rsqrtev2df;
7457 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7458 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7459 default: gcc_unreachable ();
7463 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7465 /* Select reciprocal square root series step insn depending on machine mode. */
7467 static rsqrts_type
7468 get_rsqrts_type (machine_mode mode)
7470 switch (mode)
7472 case DFmode: return gen_aarch64_rsqrtsdf;
7473 case SFmode: return gen_aarch64_rsqrtssf;
7474 case V2DFmode: return gen_aarch64_rsqrtsv2df;
7475 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7476 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7477 default: gcc_unreachable ();
7481 /* Emit instruction sequence to compute either the approximate square root
7482 or its approximate reciprocal, depending on the flag RECP, and return
7483 whether the sequence was emitted or not. */
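/* A sketch of the scheme used below, assuming the usual Newton-Raphson
   formulation: FRSQRTE gives an initial estimate x0 of 1/sqrt(S) and each
   FRSQRTS step refines it as

     x_{n+1} = x_n * (3 - S * x_n * x_n) / 2

   with two steps for SFmode and three for DFmode.  For the non-reciprocal
   case the result is finally multiplied by S to give sqrt(S), and a mask
   squashes the answer to 0.0 when S is 0.0.  */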
7485 bool
7486 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7488 machine_mode mode = GET_MODE (dst);
7490 if (GET_MODE_INNER (mode) == HFmode)
7491 return false;
7493 machine_mode mmsk = mode_for_vector
7494 (int_mode_for_mode (GET_MODE_INNER (mode)),
7495 GET_MODE_NUNITS (mode));
7496 bool use_approx_sqrt_p = (!recp
7497 && (flag_mlow_precision_sqrt
7498 || (aarch64_tune_params.approx_modes->sqrt
7499 & AARCH64_APPROX_MODE (mode))));
7500 bool use_approx_rsqrt_p = (recp
7501 && (flag_mrecip_low_precision_sqrt
7502 || (aarch64_tune_params.approx_modes->recip_sqrt
7503 & AARCH64_APPROX_MODE (mode))));
7505 if (!flag_finite_math_only
7506 || flag_trapping_math
7507 || !flag_unsafe_math_optimizations
7508 || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7509 || optimize_function_for_size_p (cfun))
7510 return false;
7512 rtx xmsk = gen_reg_rtx (mmsk);
7513 if (!recp)
7514 /* When calculating the approximate square root, compare the argument with
7515 0.0 and create a mask. */
7516 emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7517 CONST0_RTX (mode)))));
7519 /* Estimate the approximate reciprocal square root. */
7520 rtx xdst = gen_reg_rtx (mode);
7521 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7523 /* Iterate over the series twice for SF and thrice for DF. */
7524 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7526 /* Optionally iterate over the series once less for faster performance
7527 while sacrificing some accuracy.
7528 if ((recp && flag_mrecip_low_precision_sqrt)
7529 || (!recp && flag_mlow_precision_sqrt))
7530 iterations--;
7532 /* Iterate over the series to calculate the approximate reciprocal square
7533 root. */
7534 rtx x1 = gen_reg_rtx (mode);
7535 while (iterations--)
7537 rtx x2 = gen_reg_rtx (mode);
7538 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7540 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7542 if (iterations > 0)
7543 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7546 if (!recp)
7548 /* Qualify the approximate reciprocal square root when the argument is
7549 0.0 by squashing the intermediary result to 0.0. */
7550 rtx xtmp = gen_reg_rtx (mmsk);
7551 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7552 gen_rtx_SUBREG (mmsk, xdst, 0)));
7553 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7555 /* Calculate the approximate square root. */
7556 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7559 /* Finalize the approximation. */
7560 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
7562 return true;
7565 typedef rtx (*recpe_type) (rtx, rtx);
7567 /* Select reciprocal initial estimate insn depending on machine mode. */
7569 static recpe_type
7570 get_recpe_type (machine_mode mode)
7572 switch (mode)
7574 case SFmode: return (gen_aarch64_frecpesf);
7575 case V2SFmode: return (gen_aarch64_frecpev2sf);
7576 case V4SFmode: return (gen_aarch64_frecpev4sf);
7577 case DFmode: return (gen_aarch64_frecpedf);
7578 case V2DFmode: return (gen_aarch64_frecpev2df);
7579 default: gcc_unreachable ();
7583 typedef rtx (*recps_type) (rtx, rtx, rtx);
7585 /* Select reciprocal series step insn depending on machine mode. */
7587 static recps_type
7588 get_recps_type (machine_mode mode)
7590 switch (mode)
7592 case SFmode: return (gen_aarch64_frecpssf);
7593 case V2SFmode: return (gen_aarch64_frecpsv2sf);
7594 case V4SFmode: return (gen_aarch64_frecpsv4sf);
7595 case DFmode: return (gen_aarch64_frecpsdf);
7596 case V2DFmode: return (gen_aarch64_frecpsv2df);
7597 default: gcc_unreachable ();
7601 /* Emit the instruction sequence to compute the approximation for the division
7602 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
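/* A sketch of the scheme used below: FRECPE gives an initial estimate x0
   of 1/DEN and each FRECPS step refines it as

     x_{n+1} = x_n * (2 - DEN * x_n)

   with two steps for SFmode and three for DFmode; the result is then
   multiplied by NUM (unless NUM is 1.0) to form the quotient.  */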
7604 bool
7605 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
7607 machine_mode mode = GET_MODE (quo);
7609 if (GET_MODE_INNER (mode) == HFmode)
7610 return false;
7612 bool use_approx_division_p = (flag_mlow_precision_div
7613 || (aarch64_tune_params.approx_modes->division
7614 & AARCH64_APPROX_MODE (mode)));
7616 if (!flag_finite_math_only
7617 || flag_trapping_math
7618 || !flag_unsafe_math_optimizations
7619 || optimize_function_for_size_p (cfun)
7620 || !use_approx_division_p)
7621 return false;
7623 /* Estimate the approximate reciprocal. */
7624 rtx xrcp = gen_reg_rtx (mode);
7625 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
7627 /* Iterate over the series twice for SF and thrice for DF. */
7628 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7630 /* Optionally iterate over the series once less for faster performance,
7631 while sacrificing some accuracy. */
7632 if (flag_mlow_precision_div)
7633 iterations--;
7635 /* Iterate over the series to calculate the approximate reciprocal. */
7636 rtx xtmp = gen_reg_rtx (mode);
7637 while (iterations--)
7639 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
7641 if (iterations > 0)
7642 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
7645 if (num != CONST1_RTX (mode))
7647 /* As the approximate reciprocal of DEN is already calculated, only
7648 calculate the approximate division when NUM is not 1.0. */
7649 rtx xnum = force_reg (mode, num);
7650 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
7653 /* Finalize the approximation. */
7654 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
7655 return true;
7658 /* Return the number of instructions that can be issued per cycle. */
7659 static int
7660 aarch64_sched_issue_rate (void)
7662 return aarch64_tune_params.issue_rate;
7665 static int
7666 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7668 int issue_rate = aarch64_sched_issue_rate ();
7670 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7674 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7675 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
7676 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
7678 static int
7679 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7680 int ready_index)
7682 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7686 /* Vectorizer cost model target hooks. */
7688 /* Implement targetm.vectorize.builtin_vectorization_cost. */
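/* For instance, a vec_construct of a four-element vector is costed below
   as 4 / 2 + 1 = 3 units, while most other cost kinds simply return the
   corresponding field of the tuning structure's vec_costs table.  */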
7689 static int
7690 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7691 tree vectype,
7692 int misalign ATTRIBUTE_UNUSED)
7694 unsigned elements;
7696 switch (type_of_cost)
7698 case scalar_stmt:
7699 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7701 case scalar_load:
7702 return aarch64_tune_params.vec_costs->scalar_load_cost;
7704 case scalar_store:
7705 return aarch64_tune_params.vec_costs->scalar_store_cost;
7707 case vector_stmt:
7708 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7710 case vector_load:
7711 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7713 case vector_store:
7714 return aarch64_tune_params.vec_costs->vec_store_cost;
7716 case vec_to_scalar:
7717 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7719 case scalar_to_vec:
7720 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7722 case unaligned_load:
7723 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7725 case unaligned_store:
7726 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7728 case cond_branch_taken:
7729 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7731 case cond_branch_not_taken:
7732 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7734 case vec_perm:
7735 return aarch64_tune_params.vec_costs->vec_permute_cost;
7737 case vec_promote_demote:
7738 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7740 case vec_construct:
7741 elements = TYPE_VECTOR_SUBPARTS (vectype);
7742 return elements / 2 + 1;
7744 default:
7745 gcc_unreachable ();
7749 /* Implement targetm.vectorize.add_stmt_cost. */
7750 static unsigned
7751 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7752 struct _stmt_vec_info *stmt_info, int misalign,
7753 enum vect_cost_model_location where)
7755 unsigned *cost = (unsigned *) data;
7756 unsigned retval = 0;
7758 if (flag_vect_cost_model)
7760 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7761 int stmt_cost =
7762 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7764 /* Statements in an inner loop relative to the loop being
7765 vectorized are weighted more heavily. The value here is
7766 arbitrary and could potentially be improved with analysis. */
7767 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7768 count *= 50; /* FIXME */
7770 retval = (unsigned) (count * stmt_cost);
7771 cost[where] += retval;
7774 return retval;
7777 static void initialize_aarch64_code_model (struct gcc_options *);
7779 /* Parse the TO_PARSE string and put the architecture struct that it
7780 selects into RES and the architectural features into ISA_FLAGS.
7781 Return an aarch64_parse_opt_result describing the parse result.
7782 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
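/* For example, given -march=armv8-a+crc the string is split at the first
   '+': "armv8-a" is looked up in all_architectures and the remainder
   "+crc" is handed to aarch64_parse_extension to adjust ISA_FLAGS.  */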
7784 static enum aarch64_parse_opt_result
7785 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7786 unsigned long *isa_flags)
7788 char *ext;
7789 const struct processor *arch;
7790 char *str = (char *) alloca (strlen (to_parse) + 1);
7791 size_t len;
7793 strcpy (str, to_parse);
7795 ext = strchr (str, '+');
7797 if (ext != NULL)
7798 len = ext - str;
7799 else
7800 len = strlen (str);
7802 if (len == 0)
7803 return AARCH64_PARSE_MISSING_ARG;
7806 /* Loop through the list of supported ARCHes to find a match. */
7807 for (arch = all_architectures; arch->name != NULL; arch++)
7809 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7811 unsigned long isa_temp = arch->flags;
7813 if (ext != NULL)
7815 /* TO_PARSE string contains at least one extension. */
7816 enum aarch64_parse_opt_result ext_res
7817 = aarch64_parse_extension (ext, &isa_temp);
7819 if (ext_res != AARCH64_PARSE_OK)
7820 return ext_res;
7822 /* Extension parsing was successful. Confirm the result
7823 arch and ISA flags. */
7824 *res = arch;
7825 *isa_flags = isa_temp;
7826 return AARCH64_PARSE_OK;
7830 /* ARCH name not found in list. */
7831 return AARCH64_PARSE_INVALID_ARG;
7834 /* Parse the TO_PARSE string and put the result tuning in RES and the
7835 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
7836 describing the parse result. If there is an error parsing, RES and
7837 ISA_FLAGS are left unchanged. */
7839 static enum aarch64_parse_opt_result
7840 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7841 unsigned long *isa_flags)
7843 char *ext;
7844 const struct processor *cpu;
7845 char *str = (char *) alloca (strlen (to_parse) + 1);
7846 size_t len;
7848 strcpy (str, to_parse);
7850 ext = strchr (str, '+');
7852 if (ext != NULL)
7853 len = ext - str;
7854 else
7855 len = strlen (str);
7857 if (len == 0)
7858 return AARCH64_PARSE_MISSING_ARG;
7861 /* Loop through the list of supported CPUs to find a match. */
7862 for (cpu = all_cores; cpu->name != NULL; cpu++)
7864 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7866 unsigned long isa_temp = cpu->flags;
7869 if (ext != NULL)
7871 /* TO_PARSE string contains at least one extension. */
7872 enum aarch64_parse_opt_result ext_res
7873 = aarch64_parse_extension (ext, &isa_temp);
7875 if (ext_res != AARCH64_PARSE_OK)
7876 return ext_res;
7878 /* Extension parsing was successful. Confirm the result
7879 cpu and ISA flags. */
7880 *res = cpu;
7881 *isa_flags = isa_temp;
7882 return AARCH64_PARSE_OK;
7886 /* CPU name not found in list. */
7887 return AARCH64_PARSE_INVALID_ARG;
7890 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7891 Return an aarch64_parse_opt_result describing the parse result.
7892 If the parsing fails, RES does not change. */
7894 static enum aarch64_parse_opt_result
7895 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7897 const struct processor *cpu;
7898 char *str = (char *) alloca (strlen (to_parse) + 1);
7900 strcpy (str, to_parse);
7902 /* Loop through the list of supported CPUs to find a match. */
7903 for (cpu = all_cores; cpu->name != NULL; cpu++)
7905 if (strcmp (cpu->name, str) == 0)
7907 *res = cpu;
7908 return AARCH64_PARSE_OK;
7912 /* CPU name not found in list. */
7913 return AARCH64_PARSE_INVALID_ARG;
7916 /* Parse TOKEN, which has length LENGTH, to see if it is an option
7917 described in FLAG. If it is, return the index bit for that fusion type.
7918 If not, error (printing OPTION_NAME) and return zero. */
7920 static unsigned int
7921 aarch64_parse_one_option_token (const char *token,
7922 size_t length,
7923 const struct aarch64_flag_desc *flag,
7924 const char *option_name)
7926 for (; flag->name != NULL; flag++)
7928 if (length == strlen (flag->name)
7929 && !strncmp (flag->name, token, length))
7930 return flag->flag;
7933 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7934 return 0;
7937 /* Parse OPTION which is a comma-separated list of flags to enable.
7938 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7939 default state we inherit from the CPU tuning structures. OPTION_NAME
7940 gives the top-level option we are parsing in the -moverride string,
7941 for use in error messages. */
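/* For example, with OPTION_NAME "fuse=" an OPTION string of
   "adrp+add.cmp+branch" enables both fusion types on top of
   INITIAL_STATE, while a "none" token anywhere in the string clears
   everything accumulated up to that point (see the comment in the loop
   below).  */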
7943 static unsigned int
7944 aarch64_parse_boolean_options (const char *option,
7945 const struct aarch64_flag_desc *flags,
7946 unsigned int initial_state,
7947 const char *option_name)
7949 const char separator = '.';
7950 const char* specs = option;
7951 const char* ntoken = option;
7952 unsigned int found_flags = initial_state;
7954 while ((ntoken = strchr (specs, separator)))
7956 size_t token_length = ntoken - specs;
7957 unsigned token_ops = aarch64_parse_one_option_token (specs,
7958 token_length,
7959 flags,
7960 option_name);
7961 /* If we find "none" (or, for simplicity's sake, an error) anywhere
7962 in the token stream, reset the supported operations. So:
7964 adrp+add.cmp+branch.none.adrp+add
7966 would have the result of turning on only adrp+add fusion. */
7967 if (!token_ops)
7968 found_flags = 0;
7970 found_flags |= token_ops;
7971 specs = ++ntoken;
7974 /* We ended with a separator; report the ill-formed string. */
7975 if (!(*specs))
7977 error ("%s string ill-formed\n", option_name);
7978 return 0;
7981 /* We still have one more token to parse. */
7982 size_t token_length = strlen (specs);
7983 unsigned token_ops = aarch64_parse_one_option_token (specs,
7984 token_length,
7985 flags,
7986 option_name);
7987 if (!token_ops)
7988 found_flags = 0;
7990 found_flags |= token_ops;
7991 return found_flags;
7994 /* Support for overriding instruction fusion. */
7996 static void
7997 aarch64_parse_fuse_string (const char *fuse_string,
7998 struct tune_params *tune)
8000 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8001 aarch64_fusible_pairs,
8002 tune->fusible_ops,
8003 "fuse=");
8006 /* Support for overriding other tuning flags. */
8008 static void
8009 aarch64_parse_tune_string (const char *tune_string,
8010 struct tune_params *tune)
8012 tune->extra_tuning_flags
8013 = aarch64_parse_boolean_options (tune_string,
8014 aarch64_tuning_flags,
8015 tune->extra_tuning_flags,
8016 "tune=");
8019 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8020 we understand. If it is, extract the option string and hand it off to
8021 the appropriate function. */
8023 void
8024 aarch64_parse_one_override_token (const char* token,
8025 size_t length,
8026 struct tune_params *tune)
8028 const struct aarch64_tuning_override_function *fn
8029 = aarch64_tuning_override_functions;
8031 const char *option_part = strchr (token, '=');
8032 if (!option_part)
8034 error ("tuning string missing in option (%s)", token);
8035 return;
8038 /* Get the length of the option name. */
8039 length = option_part - token;
8040 /* Skip the '=' to get to the option string. */
8041 option_part++;
8043 for (; fn->name != NULL; fn++)
8045 if (!strncmp (fn->name, token, length))
8047 fn->parse_override (option_part, tune);
8048 return;
8052 error ("unknown tuning option (%s)",token);
8053 return;
8056 /* Check and clamp the requested TLS size against the limits of the code model. */
8058 static void
8059 initialize_aarch64_tls_size (struct gcc_options *opts)
8061 if (aarch64_tls_size == 0)
8062 aarch64_tls_size = 24;
8064 switch (opts->x_aarch64_cmodel_var)
8066 case AARCH64_CMODEL_TINY:
8067 /* Both the default and maximum TLS size allowed under tiny are 1M, which
8068 needs two instructions to address, so we clamp the size to 24. */
8069 if (aarch64_tls_size > 24)
8070 aarch64_tls_size = 24;
8071 break;
8072 case AARCH64_CMODEL_SMALL:
8073 /* The maximum TLS size allowed under small is 4G. */
8074 if (aarch64_tls_size > 32)
8075 aarch64_tls_size = 32;
8076 break;
8077 case AARCH64_CMODEL_LARGE:
8078 /* The maximum TLS size allowed under large is 16E.
8079 FIXME: 16E should be 64bit, we only support 48bit offset now. */
8080 if (aarch64_tls_size > 48)
8081 aarch64_tls_size = 48;
8082 break;
8083 default:
8084 gcc_unreachable ();
8087 return;
8090 /* Parse STRING looking for options in the format:
8091 string :: option:string
8092 option :: name=substring
8093 name :: {a-z}
8094 substring :: defined by option. */
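/* For example, -moverride=fuse=adrp+add:tune=<flag-list> (where
   <flag-list> is a placeholder) is split at each ':' and each resulting
   "name=substring" piece is handed to
   aarch64_parse_one_override_token.  */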
8096 static void
8097 aarch64_parse_override_string (const char* input_string,
8098 struct tune_params* tune)
8100 const char separator = ':';
8101 size_t string_length = strlen (input_string) + 1;
8102 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8103 char *string = string_root;
8104 strncpy (string, input_string, string_length);
8105 string[string_length - 1] = '\0';
8107 char* ntoken = string;
8109 while ((ntoken = strchr (string, separator)))
8111 size_t token_length = ntoken - string;
8112 /* Make this substring look like a string. */
8113 *ntoken = '\0';
8114 aarch64_parse_one_override_token (string, token_length, tune);
8115 string = ++ntoken;
8118 /* One last option to parse. */
8119 aarch64_parse_one_override_token (string, strlen (string), tune);
8120 free (string_root);
8124 static void
8125 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8127 /* The logic here is that if we are disabling all frame pointer generation
8128 then we do not need to disable leaf frame pointer generation as a
8129 separate operation. But if we are *only* disabling leaf frame pointer
8130 generation then we set flag_omit_frame_pointer to true, but in
8131 aarch64_frame_pointer_required we return false only for leaf functions.
8133 PR 70044: We have to be careful about being called multiple times for the
8134 same function. Once we have decided to set flag_omit_frame_pointer just
8135 so that we can omit leaf frame pointers, we must then not interpret a
8136 second call as meaning that all frame pointer generation should be
8137 omitted. We do this by setting flag_omit_frame_pointer to a special,
8138 non-zero value. */
8139 if (opts->x_flag_omit_frame_pointer == 2)
8140 opts->x_flag_omit_frame_pointer = 0;
8142 if (opts->x_flag_omit_frame_pointer)
8143 opts->x_flag_omit_leaf_frame_pointer = false;
8144 else if (opts->x_flag_omit_leaf_frame_pointer)
8145 opts->x_flag_omit_frame_pointer = 2;
8147 /* If not optimizing for size, set the default
8148 alignment to what the target wants. */
8149 if (!opts->x_optimize_size)
8151 if (opts->x_align_loops <= 0)
8152 opts->x_align_loops = aarch64_tune_params.loop_align;
8153 if (opts->x_align_jumps <= 0)
8154 opts->x_align_jumps = aarch64_tune_params.jump_align;
8155 if (opts->x_align_functions <= 0)
8156 opts->x_align_functions = aarch64_tune_params.function_align;
8159 /* We default to no pc-relative literal loads. */
8161 aarch64_pcrelative_literal_loads = false;
8163 /* If -mpc-relative-literal-loads is set on the command line, this
8164 implies that the user asked for PC relative literal loads. */
8165 if (opts->x_pcrelative_literal_loads == 1)
8166 aarch64_pcrelative_literal_loads = true;
8168 /* This is PR70113. When building the Linux kernel with
8169 CONFIG_ARM64_ERRATUM_843419, support for relocations
8170 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8171 removed from the kernel to avoid loading objects with possibly
8172 offending sequences. Without -mpc-relative-literal-loads we would
8173 generate such relocations, preventing the kernel build from
8174 succeeding. */
8175 if (opts->x_pcrelative_literal_loads == 2
8176 && TARGET_FIX_ERR_A53_843419)
8177 aarch64_pcrelative_literal_loads = true;
8179 /* In the tiny memory model it makes no sense to disallow PC relative
8180 literal pool loads. */
8181 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8182 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8183 aarch64_pcrelative_literal_loads = true;
8185 /* When enabling the lower precision Newton series for the square root, also
8186 enable it for the reciprocal square root, since the latter is an
8187 intermediary step for the former. */
8188 if (flag_mlow_precision_sqrt)
8189 flag_mrecip_low_precision_sqrt = true;
8192 /* 'Unpack' the internal tuning structs and update the options
8193 in OPTS. The caller must have set up selected_tune and selected_arch
8194 as all the other target-specific codegen decisions are
8195 derived from them. */
8197 void
8198 aarch64_override_options_internal (struct gcc_options *opts)
8200 aarch64_tune_flags = selected_tune->flags;
8201 aarch64_tune = selected_tune->sched_core;
8202 /* Make a copy of the tuning parameters attached to the core, which
8203 we may later overwrite. */
8204 aarch64_tune_params = *(selected_tune->tune);
8205 aarch64_architecture_version = selected_arch->architecture_version;
8207 if (opts->x_aarch64_override_tune_string)
8208 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8209 &aarch64_tune_params);
8211 /* This target defaults to strict volatile bitfields. */
8212 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8213 opts->x_flag_strict_volatile_bitfields = 1;
8215 initialize_aarch64_code_model (opts);
8216 initialize_aarch64_tls_size (opts);
8218 int queue_depth = 0;
8219 switch (aarch64_tune_params.autoprefetcher_model)
8221 case tune_params::AUTOPREFETCHER_OFF:
8222 queue_depth = -1;
8223 break;
8224 case tune_params::AUTOPREFETCHER_WEAK:
8225 queue_depth = 0;
8226 break;
8227 case tune_params::AUTOPREFETCHER_STRONG:
8228 queue_depth = max_insn_queue_index + 1;
8229 break;
8230 default:
8231 gcc_unreachable ();
8234 /* We don't mind passing in global_options_set here as we don't use
8235 the *options_set structs anyway. */
8236 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8237 queue_depth,
8238 opts->x_param_values,
8239 global_options_set.x_param_values);
8241 /* Set the L1 cache line size. */
8242 if (selected_cpu->tune->cache_line_size != 0)
8243 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8244 selected_cpu->tune->cache_line_size,
8245 opts->x_param_values,
8246 global_options_set.x_param_values);
8248 aarch64_override_options_after_change_1 (opts);
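/* The two maybe_set_param_value calls above are roughly equivalent to the
   user passing (param spellings assumed, not taken from this file)
     --param sched-autopref-queue-depth=<queue_depth>
     --param l1-cache-line-size=<cache line size>
   except that an explicit --param on the command line always wins, since
   maybe_set_param_value only overrides values the user did not set.  */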
8251 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8252 specified in STR and throw errors if appropriate. Put the results, if
8253 they are valid, in RES and ISA_FLAGS. Return whether the option is
8254 valid. */
8256 static bool
8257 aarch64_validate_mcpu (const char *str, const struct processor **res,
8258 unsigned long *isa_flags)
8260 enum aarch64_parse_opt_result parse_res
8261 = aarch64_parse_cpu (str, res, isa_flags);
8263 if (parse_res == AARCH64_PARSE_OK)
8264 return true;
8266 switch (parse_res)
8268 case AARCH64_PARSE_MISSING_ARG:
8269 error ("missing cpu name in -mcpu=%qs", str);
8270 break;
8271 case AARCH64_PARSE_INVALID_ARG:
8272 error ("unknown value %qs for -mcpu", str);
8273 break;
8274 case AARCH64_PARSE_INVALID_FEATURE:
8275 error ("invalid feature modifier in -mcpu=%qs", str);
8276 break;
8277 default:
8278 gcc_unreachable ();
8281 return false;
8284 /* Validate a command-line -march option. Parse the arch and extensions
8285 (if any) specified in STR and throw errors if appropriate. Put the
8286 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8287 option is valid. */
8289 static bool
8290 aarch64_validate_march (const char *str, const struct processor **res,
8291 unsigned long *isa_flags)
8293 enum aarch64_parse_opt_result parse_res
8294 = aarch64_parse_arch (str, res, isa_flags);
8296 if (parse_res == AARCH64_PARSE_OK)
8297 return true;
8299 switch (parse_res)
8301 case AARCH64_PARSE_MISSING_ARG:
8302 error ("missing arch name in -march=%qs", str);
8303 break;
8304 case AARCH64_PARSE_INVALID_ARG:
8305 error ("unknown value %qs for -march", str);
8306 break;
8307 case AARCH64_PARSE_INVALID_FEATURE:
8308 error ("invalid feature modifier in -march=%qs", str);
8309 break;
8310 default:
8311 gcc_unreachable ();
8314 return false;
8317 /* Validate a command-line -mtune option. Parse the cpu
8318 specified in STR and throw errors if appropriate. Put the
8319 result, if it is valid, in RES. Return whether the option is
8320 valid. */
8322 static bool
8323 aarch64_validate_mtune (const char *str, const struct processor **res)
8325 enum aarch64_parse_opt_result parse_res
8326 = aarch64_parse_tune (str, res);
8328 if (parse_res == AARCH64_PARSE_OK)
8329 return true;
8331 switch (parse_res)
8333 case AARCH64_PARSE_MISSING_ARG:
8334 error ("missing cpu name in -mtune=%qs", str);
8335 break;
8336 case AARCH64_PARSE_INVALID_ARG:
8337 error ("unknown value %qs for -mtune", str);
8338 break;
8339 default:
8340 gcc_unreachable ();
8342 return false;
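/* Illustrative invocations of the three validators above (the cpu and
   architecture names are assumed, not taken from this file):
     -mcpu=cortex-a57+crypto   -> aarch64_validate_mcpu
     -march=armv8-a+crc        -> aarch64_validate_march
     -mtune=cortex-a72         -> aarch64_validate_mtune (no "+feature"
                                  modifiers are accepted here)
   Each reports a missing name, an unknown name or a bad feature modifier
   using the errors above.  */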
8345 /* Return the CPU corresponding to the enum CPU.
8346 If it doesn't specify a cpu, return the default. */
8348 static const struct processor *
8349 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8351 if (cpu != aarch64_none)
8352 return &all_cores[cpu];
8354 /* The & 0x3f is to extract the bottom 6 bits that encode the
8355 default cpu as selected by the --with-cpu GCC configure option
8356 in config.gcc.
8357 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8358 flags mechanism should be reworked to make it more sane. */
8359 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8362 /* Return the architecture corresponding to the enum ARCH.
8363 If it doesn't specify a valid architecture, return the default. */
8365 static const struct processor *
8366 aarch64_get_arch (enum aarch64_arch arch)
8368 if (arch != aarch64_no_arch)
8369 return &all_architectures[arch];
8371 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8373 return &all_architectures[cpu->arch];
8376 /* Implement TARGET_OPTION_OVERRIDE. This is called once, early in option
8377 processing, and is used to parse the -m{cpu,tune,arch} strings and set up the initial
8378 tuning structs. In particular it must set selected_tune and
8379 aarch64_isa_flags that define the available ISA features and tuning
8380 decisions. It must also set selected_arch as this will be used to
8381 output the .arch asm tags for each function. */
8383 static void
8384 aarch64_override_options (void)
8386 unsigned long cpu_isa = 0;
8387 unsigned long arch_isa = 0;
8388 aarch64_isa_flags = 0;
8390 bool valid_cpu = true;
8391 bool valid_tune = true;
8392 bool valid_arch = true;
8394 selected_cpu = NULL;
8395 selected_arch = NULL;
8396 selected_tune = NULL;
8398 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8399 If either of -march or -mtune is given, they override their
8400 respective component of -mcpu. */
8401 if (aarch64_cpu_string)
8402 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8403 &cpu_isa);
8405 if (aarch64_arch_string)
8406 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8407 &arch_isa);
8409 if (aarch64_tune_string)
8410 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8412 /* If the user did not specify a processor, choose the default
8413 one for them. This will be the CPU set during configuration using
8414 --with-cpu, otherwise it is "generic". */
8415 if (!selected_cpu)
8417 if (selected_arch)
8419 selected_cpu = &all_cores[selected_arch->ident];
8420 aarch64_isa_flags = arch_isa;
8421 explicit_arch = selected_arch->arch;
8423 else
8425 /* Get default configure-time CPU. */
8426 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8427 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8430 if (selected_tune)
8431 explicit_tune_core = selected_tune->ident;
8433 /* If both -mcpu and -march are specified check that they are architecturally
8434 compatible, warn if they're not and prefer the -march ISA flags. */
8435 else if (selected_arch)
8437 if (selected_arch->arch != selected_cpu->arch)
8439 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8440 all_architectures[selected_cpu->arch].name,
8441 selected_arch->name);
8443 aarch64_isa_flags = arch_isa;
8444 explicit_arch = selected_arch->arch;
8445 explicit_tune_core = selected_tune ? selected_tune->ident
8446 : selected_cpu->ident;
8448 else
8450 /* -mcpu but no -march. */
8451 aarch64_isa_flags = cpu_isa;
8452 explicit_tune_core = selected_tune ? selected_tune->ident
8453 : selected_cpu->ident;
8454 gcc_assert (selected_cpu);
8455 selected_arch = &all_architectures[selected_cpu->arch];
8456 explicit_arch = selected_arch->arch;
8459 /* Set the arch as well, as we will need it when outputting
8460 the .arch directive in assembly. */
8461 if (!selected_arch)
8463 gcc_assert (selected_cpu);
8464 selected_arch = &all_architectures[selected_cpu->arch];
8467 if (!selected_tune)
8468 selected_tune = selected_cpu;
8470 #ifndef HAVE_AS_MABI_OPTION
8471 /* The compiler may have been configured with 2.23.* binutils, which does
8472 not have support for ILP32. */
8473 if (TARGET_ILP32)
8474 error ("Assembler does not support -mabi=ilp32");
8475 #endif
8477 /* Make sure we properly set up the explicit options. */
8478 if ((aarch64_cpu_string && valid_cpu)
8479 || (aarch64_tune_string && valid_tune))
8480 gcc_assert (explicit_tune_core != aarch64_none);
8482 if ((aarch64_cpu_string && valid_cpu)
8483 || (aarch64_arch_string && valid_arch))
8484 gcc_assert (explicit_arch != aarch64_no_arch);
8486 aarch64_override_options_internal (&global_options);
8488 /* Save these options as the default ones in case we push and pop them later
8489 while processing functions with potential target attributes. */
8490 target_option_default_node = target_option_current_node
8491 = build_target_option_node (&global_options);
8493 aarch64_register_fma_steering ();
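/* A worked example of the precedence rules implemented above (cpu and
   architecture names assumed):
     -mcpu=cortex-a57 -march=armv8-a+crc -mtune=cortex-a53
   takes the ISA flags from -march (warning if the -mcpu core does not
   match that architecture) and the tuning tables from -mtune, whereas a
   lone -mcpu=cortex-a57 supplies both the ISA flags and the tuning.  */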
8497 /* Implement targetm.override_options_after_change. */
8499 static void
8500 aarch64_override_options_after_change (void)
8502 aarch64_override_options_after_change_1 (&global_options);
8505 static struct machine_function *
8506 aarch64_init_machine_status (void)
8508 struct machine_function *machine;
8509 machine = ggc_cleared_alloc<machine_function> ();
8510 return machine;
8513 void
8514 aarch64_init_expanders (void)
8516 init_machine_status = aarch64_init_machine_status;
8519 /* A checking mechanism for the implementation of the various code models. */
8520 static void
8521 initialize_aarch64_code_model (struct gcc_options *opts)
8523 if (opts->x_flag_pic)
8525 switch (opts->x_aarch64_cmodel_var)
8527 case AARCH64_CMODEL_TINY:
8528 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8529 break;
8530 case AARCH64_CMODEL_SMALL:
8531 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8532 aarch64_cmodel = (flag_pic == 2
8533 ? AARCH64_CMODEL_SMALL_PIC
8534 : AARCH64_CMODEL_SMALL_SPIC);
8535 #else
8536 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8537 #endif
8538 break;
8539 case AARCH64_CMODEL_LARGE:
8540 sorry ("code model %qs with -f%s", "large",
8541 opts->x_flag_pic > 1 ? "PIC" : "pic");
8542 break;
8543 default:
8544 gcc_unreachable ();
8547 else
8548 aarch64_cmodel = opts->x_aarch64_cmodel_var;
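/* A sketch of the mapping implemented above:
     -mcmodel=small -fpic  -> AARCH64_CMODEL_SMALL_SPIC when the assembler
                              has the small PIC relocs, else SMALL_PIC
     -mcmodel=small -fPIC  -> AARCH64_CMODEL_SMALL_PIC
     -mcmodel=tiny  -fpic  -> AARCH64_CMODEL_TINY_PIC
     -mcmodel=large -fpic  -> rejected with sorry () above
   and without -fpic/-fPIC the user's -mcmodel value is used directly.  */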
8551 /* Implement TARGET_OPTION_SAVE. */
8553 static void
8554 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8556 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8559 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8560 using the information saved in PTR. */
8562 static void
8563 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8565 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8566 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8567 opts->x_explicit_arch = ptr->x_explicit_arch;
8568 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8569 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8571 aarch64_override_options_internal (opts);
8574 /* Implement TARGET_OPTION_PRINT. */
8576 static void
8577 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8579 const struct processor *cpu
8580 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8581 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8582 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8583 std::string extension
8584 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
8586 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8587 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8588 arch->name, extension.c_str ());
8591 static GTY(()) tree aarch64_previous_fndecl;
8593 void
8594 aarch64_reset_previous_fndecl (void)
8596 aarch64_previous_fndecl = NULL;
8599 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
8600 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
8601 make sure optab availability predicates are recomputed when necessary. */
8603 void
8604 aarch64_save_restore_target_globals (tree new_tree)
8606 if (TREE_TARGET_GLOBALS (new_tree))
8607 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8608 else if (new_tree == target_option_default_node)
8609 restore_target_globals (&default_target_globals);
8610 else
8611 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
8614 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
8615 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8616 of the function, if such exists. This function may be called multiple
8617 times on a single function so use aarch64_previous_fndecl to avoid
8618 setting up identical state. */
8620 static void
8621 aarch64_set_current_function (tree fndecl)
8623 if (!fndecl || fndecl == aarch64_previous_fndecl)
8624 return;
8626 tree old_tree = (aarch64_previous_fndecl
8627 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8628 : NULL_TREE);
8630 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
8632 /* If the current function has no attributes but the previous one did,
8633 use the default node. */
8634 if (!new_tree && old_tree)
8635 new_tree = target_option_default_node;
8637 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
8638 the default have been handled by aarch64_save_restore_target_globals from
8639 aarch64_pragma_target_parse. */
8640 if (old_tree == new_tree)
8641 return;
8643 aarch64_previous_fndecl = fndecl;
8645 /* First set the target options. */
8646 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
8648 aarch64_save_restore_target_globals (new_tree);
8651 /* Enum describing the various ways we can handle attributes.
8652 In many cases we can reuse the generic option handling machinery. */
8654 enum aarch64_attr_opt_type
8656 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
8657 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
8658 aarch64_attr_enum, /* Attribute sets an enum variable. */
8659 aarch64_attr_custom /* Attribute requires a custom handling function. */
8662 /* All the information needed to handle a target attribute.
8663 NAME is the name of the attribute.
8664 ATTR_TYPE specifies the type of behavior of the attribute as described
8665 in the definition of enum aarch64_attr_opt_type.
8666 ALLOW_NEG is true if the attribute supports a "no-" form.
8667 HANDLER is the function that takes the attribute string and whether
8668 it is a pragma or attribute and handles the option. It is needed only
8669 when the ATTR_TYPE is aarch64_attr_custom.
8670 OPT_NUM is the enum specifying the option that the attribute modifies.
8671 This is needed for attributes that mirror the behavior of a command-line
8672 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
8673 aarch64_attr_enum. */
8675 struct aarch64_attribute_info
8677 const char *name;
8678 enum aarch64_attr_opt_type attr_type;
8679 bool allow_neg;
8680 bool (*handler) (const char *, const char *);
8681 enum opt_code opt_num;
8684 /* Handle the ARCH_STR argument to the arch= target attribute.
8685 PRAGMA_OR_ATTR is used in potential error messages. */
8687 static bool
8688 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8690 const struct processor *tmp_arch = NULL;
8691 enum aarch64_parse_opt_result parse_res
8692 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8694 if (parse_res == AARCH64_PARSE_OK)
8696 gcc_assert (tmp_arch);
8697 selected_arch = tmp_arch;
8698 explicit_arch = selected_arch->arch;
8699 return true;
8702 switch (parse_res)
8704 case AARCH64_PARSE_MISSING_ARG:
8705 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8706 break;
8707 case AARCH64_PARSE_INVALID_ARG:
8708 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8709 break;
8710 case AARCH64_PARSE_INVALID_FEATURE:
8711 error ("invalid feature modifier %qs for 'arch' target %s",
8712 str, pragma_or_attr);
8713 break;
8714 default:
8715 gcc_unreachable ();
8718 return false;
8721 /* Handle the argument CPU_STR to the cpu= target attribute.
8722 PRAGMA_OR_ATTR is used in potential error messages. */
8724 static bool
8725 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8727 const struct processor *tmp_cpu = NULL;
8728 enum aarch64_parse_opt_result parse_res
8729 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8731 if (parse_res == AARCH64_PARSE_OK)
8733 gcc_assert (tmp_cpu);
8734 selected_tune = tmp_cpu;
8735 explicit_tune_core = selected_tune->ident;
8737 selected_arch = &all_architectures[tmp_cpu->arch];
8738 explicit_arch = selected_arch->arch;
8739 return true;
8742 switch (parse_res)
8744 case AARCH64_PARSE_MISSING_ARG:
8745 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8746 break;
8747 case AARCH64_PARSE_INVALID_ARG:
8748 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8749 break;
8750 case AARCH64_PARSE_INVALID_FEATURE:
8751 error ("invalid feature modifier %qs for 'cpu' target %s",
8752 str, pragma_or_attr);
8753 break;
8754 default:
8755 gcc_unreachable ();
8758 return false;
8761 /* Handle the argument STR to the tune= target attribute.
8762 PRAGMA_OR_ATTR is used in potential error messages. */
8764 static bool
8765 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8767 const struct processor *tmp_tune = NULL;
8768 enum aarch64_parse_opt_result parse_res
8769 = aarch64_parse_tune (str, &tmp_tune);
8771 if (parse_res == AARCH64_PARSE_OK)
8773 gcc_assert (tmp_tune);
8774 selected_tune = tmp_tune;
8775 explicit_tune_core = selected_tune->ident;
8776 return true;
8779 switch (parse_res)
8781 case AARCH64_PARSE_INVALID_ARG:
8782 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8783 break;
8784 default:
8785 gcc_unreachable ();
8788 return false;
8791 /* Parse an architecture extensions target attribute string specified in STR.
8792 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8793 if successful. Update aarch64_isa_flags to reflect the ISA features
8794 modified.
8795 PRAGMA_OR_ATTR is used in potential error messages. */
8797 static bool
8798 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8800 enum aarch64_parse_opt_result parse_res;
8801 unsigned long isa_flags = aarch64_isa_flags;
8803 /* We allow "+nothing" at the beginning to clear out all architectural
8804 features if the user wants to handpick specific features. */
8805 if (strncmp ("+nothing", str, 8) == 0)
8807 isa_flags = 0;
8808 str += 8;
8811 parse_res = aarch64_parse_extension (str, &isa_flags);
8813 if (parse_res == AARCH64_PARSE_OK)
8815 aarch64_isa_flags = isa_flags;
8816 return true;
8819 switch (parse_res)
8821 case AARCH64_PARSE_MISSING_ARG:
8822 error ("missing feature modifier in target %s %qs",
8823 pragma_or_attr, str);
8824 break;
8826 case AARCH64_PARSE_INVALID_FEATURE:
8827 error ("invalid feature modifier in target %s %qs",
8828 pragma_or_attr, str);
8829 break;
8831 default:
8832 gcc_unreachable ();
8835 return false;
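/* For instance (feature names assumed), "+nothing+fp" first clears every
   ISA flag and then enables only the FP extension, while "+nofp" merely
   subtracts FP (and anything that depends on it) from the currently
   selected set.  */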
8838 /* The target attributes that we support. On top of these we also support just
8839 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8840 handled explicitly in aarch64_process_one_target_attr. */
8842 static const struct aarch64_attribute_info aarch64_attributes[] =
8844 { "general-regs-only", aarch64_attr_mask, false, NULL,
8845 OPT_mgeneral_regs_only },
8846 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8847 OPT_mfix_cortex_a53_835769 },
8848 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
8849 OPT_mfix_cortex_a53_843419 },
8850 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8851 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8852 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8853 OPT_momit_leaf_frame_pointer },
8854 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8855 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8856 OPT_march_ },
8857 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8858 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8859 OPT_mtune_ },
8860 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
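/* Example uses of the table above (the architecture name is assumed, not
   taken from this file):
     __attribute__ ((target ("arch=armv8-a+crc")))           custom handler
     __attribute__ ((target ("no-omit-leaf-frame-pointer"))) negatable bool
     __attribute__ ((target ("cmodel=tiny")))                enum option
     __attribute__ ((target ("+crc")))                       bare ISA flags
   the last of which is handled separately in
   aarch64_process_one_target_attr below.  */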
8863 /* Parse ARG_STR which contains the definition of one target attribute.
8864 Show appropriate errors if any or return true if the attribute is valid.
8865 PRAGMA_OR_ATTR holds the string to use in error messages about whether
8866 we're processing a target attribute or pragma. */
8868 static bool
8869 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8871 bool invert = false;
8873 size_t len = strlen (arg_str);
8875 if (len == 0)
8877 error ("malformed target %s", pragma_or_attr);
8878 return false;
8881 char *str_to_check = (char *) alloca (len + 1);
8882 strcpy (str_to_check, arg_str);
8884 /* Skip leading whitespace. */
8885 while (*str_to_check == ' ' || *str_to_check == '\t')
8886 str_to_check++;
8888 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8889 It is easier to detect and handle it explicitly here rather than going
8890 through the machinery for the rest of the target attributes in this
8891 function. */
8892 if (*str_to_check == '+')
8893 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8895 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
8897 invert = true;
8898 str_to_check += 3;
8900 char *arg = strchr (str_to_check, '=');
8902 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
8903 and point ARG to "foo". */
8904 if (arg)
8906 *arg = '\0';
8907 arg++;
8909 const struct aarch64_attribute_info *p_attr;
8910 bool found = false;
8911 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
8913 /* If the names don't match up, or the user has given an argument
8914 to an attribute that doesn't accept one, or didn't give an argument
8915 to an attribute that expects one, fail to match. */
8916 if (strcmp (str_to_check, p_attr->name) != 0)
8917 continue;
8919 found = true;
8920 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
8921 || p_attr->attr_type == aarch64_attr_enum;
8923 if (attr_need_arg_p ^ (arg != NULL))
8925 error ("target %s %qs does not accept an argument",
8926 pragma_or_attr, str_to_check);
8927 return false;
8930 /* If the name matches but the attribute does not allow "no-" versions
8931 then we can't match. */
8932 if (invert && !p_attr->allow_neg)
8934 error ("target %s %qs does not allow a negated form",
8935 pragma_or_attr, str_to_check);
8936 return false;
8939 switch (p_attr->attr_type)
8941 /* Has a custom handler registered.
8942 For example, cpu=, arch=, tune=. */
8943 case aarch64_attr_custom:
8944 gcc_assert (p_attr->handler);
8945 if (!p_attr->handler (arg, pragma_or_attr))
8946 return false;
8947 break;
8949 /* Either set or unset a boolean option. */
8950 case aarch64_attr_bool:
8952 struct cl_decoded_option decoded;
8954 generate_option (p_attr->opt_num, NULL, !invert,
8955 CL_TARGET, &decoded);
8956 aarch64_handle_option (&global_options, &global_options_set,
8957 &decoded, input_location);
8958 break;
8960 /* Set or unset a bit in the target_flags. aarch64_handle_option
8961 should know what mask to apply given the option number. */
8962 case aarch64_attr_mask:
8964 struct cl_decoded_option decoded;
8965 /* We only need to specify the option number.
8966 aarch64_handle_option will know which mask to apply. */
8967 decoded.opt_index = p_attr->opt_num;
8968 decoded.value = !invert;
8969 aarch64_handle_option (&global_options, &global_options_set,
8970 &decoded, input_location);
8971 break;
8973 /* Use the option setting machinery to set an option to an enum. */
8974 case aarch64_attr_enum:
8976 gcc_assert (arg);
8977 bool valid;
8978 int value;
8979 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
8980 &value, CL_TARGET);
8981 if (valid)
8983 set_option (&global_options, NULL, p_attr->opt_num, value,
8984 NULL, DK_UNSPECIFIED, input_location,
8985 global_dc);
8987 else
8989 error ("target %s %s=%s is not valid",
8990 pragma_or_attr, str_to_check, arg);
8992 break;
8994 default:
8995 gcc_unreachable ();
8999 /* If we reached here we either have found an attribute and validated
9000 it or didn't match any. If we matched an attribute but its arguments
9001 were malformed we will have returned false already. */
9002 return found;
9005 /* Count how many times the character C appears in
9006 NULL-terminated string STR. */
9008 static unsigned int
9009 num_occurences_in_str (char c, char *str)
9011 unsigned int res = 0;
9012 while (*str != '\0')
9014 if (*str == c)
9015 res++;
9017 str++;
9020 return res;
9023 /* Parse the tree in ARGS that contains the target attribute information
9024 and update the global target options space. PRAGMA_OR_ATTR is a string
9025 to be used in error messages, specifying whether this is processing
9026 a target attribute or a target pragma. */
9028 bool
9029 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9031 if (TREE_CODE (args) == TREE_LIST)
9035 tree head = TREE_VALUE (args);
9036 if (head)
9038 if (!aarch64_process_target_attr (head, pragma_or_attr))
9039 return false;
9041 args = TREE_CHAIN (args);
9042 } while (args);
9044 return true;
9046 /* We expect to find a string to parse. */
9047 gcc_assert (TREE_CODE (args) == STRING_CST);
9049 size_t len = strlen (TREE_STRING_POINTER (args));
9050 char *str_to_check = (char *) alloca (len + 1);
9051 strcpy (str_to_check, TREE_STRING_POINTER (args));
9053 if (len == 0)
9055 error ("malformed target %s value", pragma_or_attr);
9056 return false;
9059 /* Used to catch empty entries between commas, i.e.
9060 attribute ((target ("attr1,,attr2"))). */
9061 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9063 /* Handle multiple target attributes separated by ','. */
9064 char *token = strtok (str_to_check, ",");
9066 unsigned int num_attrs = 0;
9067 while (token)
9069 num_attrs++;
9070 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9072 error ("target %s %qs is invalid", pragma_or_attr, token);
9073 return false;
9076 token = strtok (NULL, ",");
9079 if (num_attrs != num_commas + 1)
9081 error ("malformed target %s list %qs",
9082 pragma_or_attr, TREE_STRING_POINTER (args));
9083 return false;
9086 return true;
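/* As an example (attribute values assumed), the string
   "arch=armv8-a+crc,no-fix-cortex-a53-835769" is split on ',' into two
   tokens that are handed to aarch64_process_one_target_attr in turn,
   whereas "arch=armv8-a,,tune=cortex-a53" produces two tokens but three
   expected attributes (num_commas + 1), so it is rejected as a malformed
   list.  */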
9089 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9090 process attribute ((target ("..."))). */
9092 static bool
9093 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9095 struct cl_target_option cur_target;
9096 bool ret;
9097 tree old_optimize;
9098 tree new_target, new_optimize;
9099 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9101 /* If what we're processing is the current pragma string then the
9102 target option node is already stored in target_option_current_node
9103 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9104 having to re-parse the string. This is especially useful to keep
9105 arm_neon.h compile times down since that header contains a lot
9106 of intrinsics enclosed in pragmas. */
9107 if (!existing_target && args == current_target_pragma)
9109 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9110 return true;
9112 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9114 old_optimize = build_optimization_node (&global_options);
9115 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9117 /* If the function changed the optimization levels as well as setting
9118 target options, start with the optimizations specified. */
9119 if (func_optimize && func_optimize != old_optimize)
9120 cl_optimization_restore (&global_options,
9121 TREE_OPTIMIZATION (func_optimize));
9123 /* Save the current target options to restore at the end. */
9124 cl_target_option_save (&cur_target, &global_options);
9126 /* If fndecl already has some target attributes applied to it, unpack
9127 them so that we add this attribute on top of them, rather than
9128 overwriting them. */
9129 if (existing_target)
9131 struct cl_target_option *existing_options
9132 = TREE_TARGET_OPTION (existing_target);
9134 if (existing_options)
9135 cl_target_option_restore (&global_options, existing_options);
9137 else
9138 cl_target_option_restore (&global_options,
9139 TREE_TARGET_OPTION (target_option_current_node));
9142 ret = aarch64_process_target_attr (args, "attribute");
9144 /* Set up any additional state. */
9145 if (ret)
9147 aarch64_override_options_internal (&global_options);
9148 /* Initialize SIMD builtins if we haven't already.
9149 Set current_target_pragma to NULL for the duration so that
9150 the builtin initialization code doesn't try to tag the functions
9151 being built with the attributes specified by any current pragma, thus
9152 going into an infinite recursion. */
9153 if (TARGET_SIMD)
9155 tree saved_current_target_pragma = current_target_pragma;
9156 current_target_pragma = NULL;
9157 aarch64_init_simd_builtins ();
9158 current_target_pragma = saved_current_target_pragma;
9160 new_target = build_target_option_node (&global_options);
9162 else
9163 new_target = NULL;
9165 new_optimize = build_optimization_node (&global_options);
9167 if (fndecl && ret)
9169 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9171 if (old_optimize != new_optimize)
9172 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9175 cl_target_option_restore (&global_options, &cur_target);
9177 if (old_optimize != new_optimize)
9178 cl_optimization_restore (&global_options,
9179 TREE_OPTIMIZATION (old_optimize));
9180 return ret;
9183 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9184 tri-bool options (yes, no, don't care) and the default value is
9185 DEF, determine whether to reject inlining. */
9187 static bool
9188 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9189 int dont_care, int def)
9191 /* If the callee doesn't care, always allow inlining. */
9192 if (callee == dont_care)
9193 return true;
9195 /* If the caller doesn't care, always allow inlining. */
9196 if (caller == dont_care)
9197 return true;
9199 /* Otherwise, allow inlining if either the callee and caller values
9200 agree, or if the callee is using the default value. */
9201 return (callee == caller || callee == def);
9204 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9205 to inline CALLEE into CALLER based on target-specific info.
9206 Make sure that the caller and callee have compatible architectural
9207 features. Then go through the other possible target attributes
9208 and see if they can block inlining. Try not to reject always_inline
9209 callees unless they are incompatible architecturally. */
9211 static bool
9212 aarch64_can_inline_p (tree caller, tree callee)
9214 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9215 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9217 /* If callee has no option attributes, then it is ok to inline. */
9218 if (!callee_tree)
9219 return true;
9221 struct cl_target_option *caller_opts
9222 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9223 : target_option_default_node);
9225 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9228 /* Callee's ISA flags should be a subset of the caller's. */
9229 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9230 != callee_opts->x_aarch64_isa_flags)
9231 return false;
9233 /* Allow non-strict-aligned functions to be inlined into
9234 strict-aligned ones. */
9235 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9236 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9237 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9238 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9239 return false;
9241 bool always_inline = lookup_attribute ("always_inline",
9242 DECL_ATTRIBUTES (callee));
9244 /* If the architectural features match up and the callee is always_inline
9245 then the other attributes don't matter. */
9246 if (always_inline)
9247 return true;
9249 if (caller_opts->x_aarch64_cmodel_var
9250 != callee_opts->x_aarch64_cmodel_var)
9251 return false;
9253 if (caller_opts->x_aarch64_tls_dialect
9254 != callee_opts->x_aarch64_tls_dialect)
9255 return false;
9257 /* Honour explicit requests to work around errata. */
9258 if (!aarch64_tribools_ok_for_inlining_p (
9259 caller_opts->x_aarch64_fix_a53_err835769,
9260 callee_opts->x_aarch64_fix_a53_err835769,
9261 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9262 return false;
9264 if (!aarch64_tribools_ok_for_inlining_p (
9265 caller_opts->x_aarch64_fix_a53_err843419,
9266 callee_opts->x_aarch64_fix_a53_err843419,
9267 2, TARGET_FIX_ERR_A53_843419))
9268 return false;
9270 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9271 caller and callee and they don't match up, reject inlining. */
9272 if (!aarch64_tribools_ok_for_inlining_p (
9273 caller_opts->x_flag_omit_leaf_frame_pointer,
9274 callee_opts->x_flag_omit_leaf_frame_pointer,
9275 2, 1))
9276 return false;
9278 /* If the callee has specific tuning overrides, respect them. */
9279 if (callee_opts->x_aarch64_override_tune_string != NULL
9280 && caller_opts->x_aarch64_override_tune_string == NULL)
9281 return false;
9283 /* If the user specified tuning override strings for the
9284 caller and callee and they don't match up, reject inlining.
9285 We just do a string compare here, we don't analyze the meaning
9286 of the string, as it would be too costly for little gain. */
9287 if (callee_opts->x_aarch64_override_tune_string
9288 && caller_opts->x_aarch64_override_tune_string
9289 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9290 caller_opts->x_aarch64_override_tune_string) != 0))
9291 return false;
9293 return true;
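/* For example (feature name assumed), a callee declared with
   __attribute__ ((target ("+crc"))) is not inlined into a caller built
   without +crc, since the callee's ISA flags would not be a subset of the
   caller's; this holds even for always_inline callees, which only bypass
   the later, non-architectural checks.  */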
9296 /* Return true if SYMBOL_REF X binds locally. */
9298 static bool
9299 aarch64_symbol_binds_local_p (const_rtx x)
9301 return (SYMBOL_REF_DECL (x)
9302 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9303 : SYMBOL_REF_LOCAL_P (x));
9306 /* Return true if SYMBOL_REF X is thread local */
9307 static bool
9308 aarch64_tls_symbol_p (rtx x)
9310 if (! TARGET_HAVE_TLS)
9311 return false;
9313 if (GET_CODE (x) != SYMBOL_REF)
9314 return false;
9316 return SYMBOL_REF_TLS_MODEL (x) != 0;
9319 /* Classify a TLS symbol into one of the TLS kinds. */
9320 enum aarch64_symbol_type
9321 aarch64_classify_tls_symbol (rtx x)
9323 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9325 switch (tls_kind)
9327 case TLS_MODEL_GLOBAL_DYNAMIC:
9328 case TLS_MODEL_LOCAL_DYNAMIC:
9329 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9331 case TLS_MODEL_INITIAL_EXEC:
9332 switch (aarch64_cmodel)
9334 case AARCH64_CMODEL_TINY:
9335 case AARCH64_CMODEL_TINY_PIC:
9336 return SYMBOL_TINY_TLSIE;
9337 default:
9338 return SYMBOL_SMALL_TLSIE;
9341 case TLS_MODEL_LOCAL_EXEC:
9342 if (aarch64_tls_size == 12)
9343 return SYMBOL_TLSLE12;
9344 else if (aarch64_tls_size == 24)
9345 return SYMBOL_TLSLE24;
9346 else if (aarch64_tls_size == 32)
9347 return SYMBOL_TLSLE32;
9348 else if (aarch64_tls_size == 48)
9349 return SYMBOL_TLSLE48;
9350 else
9351 gcc_unreachable ();
9353 case TLS_MODEL_EMULATED:
9354 case TLS_MODEL_NONE:
9355 return SYMBOL_FORCE_TO_MEM;
9357 default:
9358 gcc_unreachable ();
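/* For example, with -mtls-size=24 a local-exec symbol is classified as
   SYMBOL_TLSLE24, while under the tiny code model an initial-exec symbol
   becomes SYMBOL_TINY_TLSIE.  The clamping of aarch64_tls_size per code
   model is done in initialize_aarch64_tls_size above.  */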
9362 /* Return the method that should be used to access SYMBOL_REF or
9363 LABEL_REF X. */
9365 enum aarch64_symbol_type
9366 aarch64_classify_symbol (rtx x, rtx offset)
9368 if (GET_CODE (x) == LABEL_REF)
9370 switch (aarch64_cmodel)
9372 case AARCH64_CMODEL_LARGE:
9373 return SYMBOL_FORCE_TO_MEM;
9375 case AARCH64_CMODEL_TINY_PIC:
9376 case AARCH64_CMODEL_TINY:
9377 return SYMBOL_TINY_ABSOLUTE;
9379 case AARCH64_CMODEL_SMALL_SPIC:
9380 case AARCH64_CMODEL_SMALL_PIC:
9381 case AARCH64_CMODEL_SMALL:
9382 return SYMBOL_SMALL_ABSOLUTE;
9384 default:
9385 gcc_unreachable ();
9389 if (GET_CODE (x) == SYMBOL_REF)
9391 if (aarch64_tls_symbol_p (x))
9392 return aarch64_classify_tls_symbol (x);
9394 switch (aarch64_cmodel)
9396 case AARCH64_CMODEL_TINY:
9397 /* When we retrieve symbol + offset address, we have to make sure
9398 the offset does not cause overflow of the final address. But
9399 we have no way of knowing the address of symbol at compile time
9400 so we can't accurately say if the distance between the PC and
9401 symbol + offset is outside the addressable range of +/-1M in the
9402 TINY code model. So we rely on images not being greater than
9403 1M, cap the offset at 1M, and anything beyond 1M will have to
9404 be loaded using an alternative mechanism. Furthermore, if the
9405 symbol is a weak reference to something that isn't known to
9406 resolve to a symbol in this module, then force to memory. */
9407 if ((SYMBOL_REF_WEAK (x)
9408 && !aarch64_symbol_binds_local_p (x))
9409 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9410 return SYMBOL_FORCE_TO_MEM;
9411 return SYMBOL_TINY_ABSOLUTE;
9413 case AARCH64_CMODEL_SMALL:
9414 /* Same reasoning as the tiny code model, but the offset cap here is
9415 4G. */
9416 if ((SYMBOL_REF_WEAK (x)
9417 && !aarch64_symbol_binds_local_p (x))
9418 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9419 HOST_WIDE_INT_C (4294967264)))
9420 return SYMBOL_FORCE_TO_MEM;
9421 return SYMBOL_SMALL_ABSOLUTE;
9423 case AARCH64_CMODEL_TINY_PIC:
9424 if (!aarch64_symbol_binds_local_p (x))
9425 return SYMBOL_TINY_GOT;
9426 return SYMBOL_TINY_ABSOLUTE;
9428 case AARCH64_CMODEL_SMALL_SPIC:
9429 case AARCH64_CMODEL_SMALL_PIC:
9430 if (!aarch64_symbol_binds_local_p (x))
9431 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9432 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9433 return SYMBOL_SMALL_ABSOLUTE;
9435 case AARCH64_CMODEL_LARGE:
9436 /* This is alright even in PIC code as the constant
9437 pool reference is always PC relative and within
9438 the same translation unit. */
9439 if (CONSTANT_POOL_ADDRESS_P (x))
9440 return SYMBOL_SMALL_ABSOLUTE;
9441 else
9442 return SYMBOL_FORCE_TO_MEM;
9444 default:
9445 gcc_unreachable ();
9449 /* By default push everything into the constant pool. */
9450 return SYMBOL_FORCE_TO_MEM;
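/* For example, under the small PIC code models a symbol that does not
   bind locally is accessed through the GOT (SYMBOL_SMALL_GOT_4G for -fPIC,
   SYMBOL_SMALL_GOT_28K for -fpic), while under the non-PIC small model a
   weak reference that may resolve outside this module, or an offset
   outside roughly +/-4G, is spilled to the literal pool
   (SYMBOL_FORCE_TO_MEM).  */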
9453 bool
9454 aarch64_constant_address_p (rtx x)
9456 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9459 bool
9460 aarch64_legitimate_pic_operand_p (rtx x)
9462 if (GET_CODE (x) == SYMBOL_REF
9463 || (GET_CODE (x) == CONST
9464 && GET_CODE (XEXP (x, 0)) == PLUS
9465 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9466 return false;
9468 return true;
9471 /* Return true if X holds either a valid 'quarter-precision' floating-point
9472 immediate or the floating-point constant +0.0. */
9473 static bool
9474 aarch64_valid_floating_const (machine_mode mode, rtx x)
9476 if (!CONST_DOUBLE_P (x))
9477 return false;
9479 if (aarch64_float_const_zero_rtx_p (x))
9480 return true;
9482 /* Only 0.0 (accepted above) is handled for TFmode; other constants must be SFmode or DFmode. */
9483 if (!(mode == SFmode || mode == DFmode))
9484 return false;
9486 return aarch64_float_const_representable_p (x);
9489 static bool
9490 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9492 /* Do not allow vector struct mode constants. We could support
9493 0 and -1 easily, but they need support in aarch64-simd.md. */
9494 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9495 return false;
9497 /* This could probably go away because
9498 we now decompose CONST_INTs according to expand_mov_immediate. */
9499 if ((GET_CODE (x) == CONST_VECTOR
9500 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9501 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9502 return !targetm.cannot_force_const_mem (mode, x);
9504 if (GET_CODE (x) == HIGH
9505 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9506 return true;
9508 return aarch64_constant_address_p (x);
9511 rtx
9512 aarch64_load_tp (rtx target)
9514 if (!target
9515 || GET_MODE (target) != Pmode
9516 || !register_operand (target, Pmode))
9517 target = gen_reg_rtx (Pmode);
9519 /* Can return in any reg. */
9520 emit_insn (gen_aarch64_load_tp_hard (target));
9521 return target;
9524 /* On AAPCS systems, this is the "struct __va_list". */
9525 static GTY(()) tree va_list_type;
9527 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9528 Return the type to use as __builtin_va_list.
9530 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9532 struct __va_list
9534 void *__stack;
9535 void *__gr_top;
9536 void *__vr_top;
9537 int __gr_offs;
9538 int __vr_offs;
9539 }; */
9541 static tree
9542 aarch64_build_builtin_va_list (void)
9544 tree va_list_name;
9545 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9547 /* Create the type. */
9548 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9549 /* Give it the required name. */
9550 va_list_name = build_decl (BUILTINS_LOCATION,
9551 TYPE_DECL,
9552 get_identifier ("__va_list"),
9553 va_list_type);
9554 DECL_ARTIFICIAL (va_list_name) = 1;
9555 TYPE_NAME (va_list_type) = va_list_name;
9556 TYPE_STUB_DECL (va_list_type) = va_list_name;
9558 /* Create the fields. */
9559 f_stack = build_decl (BUILTINS_LOCATION,
9560 FIELD_DECL, get_identifier ("__stack"),
9561 ptr_type_node);
9562 f_grtop = build_decl (BUILTINS_LOCATION,
9563 FIELD_DECL, get_identifier ("__gr_top"),
9564 ptr_type_node);
9565 f_vrtop = build_decl (BUILTINS_LOCATION,
9566 FIELD_DECL, get_identifier ("__vr_top"),
9567 ptr_type_node);
9568 f_groff = build_decl (BUILTINS_LOCATION,
9569 FIELD_DECL, get_identifier ("__gr_offs"),
9570 integer_type_node);
9571 f_vroff = build_decl (BUILTINS_LOCATION,
9572 FIELD_DECL, get_identifier ("__vr_offs"),
9573 integer_type_node);
9575 /* Tell tree-stdarg pass about our internal offset fields.
9576 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
9577 purposes, to identify whether the code is updating va_list internal
9578 offset fields in an irregular way. */
9579 va_list_gpr_counter_field = f_groff;
9580 va_list_fpr_counter_field = f_vroff;
9582 DECL_ARTIFICIAL (f_stack) = 1;
9583 DECL_ARTIFICIAL (f_grtop) = 1;
9584 DECL_ARTIFICIAL (f_vrtop) = 1;
9585 DECL_ARTIFICIAL (f_groff) = 1;
9586 DECL_ARTIFICIAL (f_vroff) = 1;
9588 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9589 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9590 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9591 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9592 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9594 TYPE_FIELDS (va_list_type) = f_stack;
9595 DECL_CHAIN (f_stack) = f_grtop;
9596 DECL_CHAIN (f_grtop) = f_vrtop;
9597 DECL_CHAIN (f_vrtop) = f_groff;
9598 DECL_CHAIN (f_groff) = f_vroff;
9600 /* Compute its layout. */
9601 layout_type (va_list_type);
9603 return va_list_type;
9606 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
9607 static void
9608 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9610 const CUMULATIVE_ARGS *cum;
9611 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9612 tree stack, grtop, vrtop, groff, vroff;
9613 tree t;
9614 int gr_save_area_size = cfun->va_list_gpr_size;
9615 int vr_save_area_size = cfun->va_list_fpr_size;
9616 int vr_offset;
9618 cum = &crtl->args.info;
9619 if (cfun->va_list_gpr_size)
9620 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
9621 cfun->va_list_gpr_size);
9622 if (cfun->va_list_fpr_size)
9623 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
9624 * UNITS_PER_VREG, cfun->va_list_fpr_size);
9626 if (!TARGET_FLOAT)
9628 gcc_assert (cum->aapcs_nvrn == 0);
9629 vr_save_area_size = 0;
9632 f_stack = TYPE_FIELDS (va_list_type_node);
9633 f_grtop = DECL_CHAIN (f_stack);
9634 f_vrtop = DECL_CHAIN (f_grtop);
9635 f_groff = DECL_CHAIN (f_vrtop);
9636 f_vroff = DECL_CHAIN (f_groff);
9638 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9639 NULL_TREE);
9640 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9641 NULL_TREE);
9642 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9643 NULL_TREE);
9644 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9645 NULL_TREE);
9646 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9647 NULL_TREE);
9649 /* Emit code to initialize STACK, which points to the next varargs stack
9650 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
9651 by named arguments. STACK is 8-byte aligned. */
9652 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9653 if (cum->aapcs_stack_size > 0)
9654 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9655 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9656 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9658 /* Emit code to initialize GRTOP, the top of the GR save area.
9659 virtual_incoming_args_rtx should have been 16 byte aligned. */
9660 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9661 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9662 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9664 /* Emit code to initialize VRTOP, the top of the VR save area.
9665 This address is gr_save_area_bytes below GRTOP, rounded
9666 down to the next 16-byte boundary. */
9667 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9668 vr_offset = ROUND_UP (gr_save_area_size,
9669 STACK_BOUNDARY / BITS_PER_UNIT);
9671 if (vr_offset)
9672 t = fold_build_pointer_plus_hwi (t, -vr_offset);
9673 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9674 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9676 /* Emit code to initialize GROFF, the offset from GRTOP of the
9677 next GPR argument. */
9678 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9679 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9680 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9682 /* Likewise emit code to initialize VROFF, the offset from VRTOP
9683 of the next VR argument. */
9684 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9685 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9686 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
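/* A worked example of the values set up above (register counts and sizes
   assumed to be the usual 8 x 8-byte X registers and 8 x 16-byte V
   registers): if the named arguments used one general register and no
   vector registers, then gr_save_area_size = 7 * 8 = 56 and
   vr_save_area_size = 8 * 16 = 128, giving
     __stack   = address of the first anonymous stack argument
     __gr_top  = top of the x1..x7 save area
     __vr_top  = __gr_top - ROUND_UP (56, 16) = __gr_top - 64
     __gr_offs = -56
     __vr_offs = -128
   va_arg then walks each offset towards zero and falls back to __stack
   once the relevant offset becomes non-negative (see
   aarch64_gimplify_va_arg_expr below).  */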
9689 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
9691 static tree
9692 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9693 gimple_seq *post_p ATTRIBUTE_UNUSED)
9695 tree addr;
9696 bool indirect_p;
9697 bool is_ha; /* is HFA or HVA. */
9698 bool dw_align; /* double-word align. */
9699 machine_mode ag_mode = VOIDmode;
9700 int nregs;
9701 machine_mode mode;
9703 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9704 tree stack, f_top, f_off, off, arg, roundup, on_stack;
9705 HOST_WIDE_INT size, rsize, adjust, align;
9706 tree t, u, cond1, cond2;
9708 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9709 if (indirect_p)
9710 type = build_pointer_type (type);
9712 mode = TYPE_MODE (type);
9714 f_stack = TYPE_FIELDS (va_list_type_node);
9715 f_grtop = DECL_CHAIN (f_stack);
9716 f_vrtop = DECL_CHAIN (f_grtop);
9717 f_groff = DECL_CHAIN (f_vrtop);
9718 f_vroff = DECL_CHAIN (f_groff);
9720 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9721 f_stack, NULL_TREE);
9722 size = int_size_in_bytes (type);
9723 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9725 dw_align = false;
9726 adjust = 0;
9727 if (aarch64_vfp_is_call_or_return_candidate (mode,
9728 type,
9729 &ag_mode,
9730 &nregs,
9731 &is_ha))
9733 /* TYPE passed in fp/simd registers. */
9734 if (!TARGET_FLOAT)
9735 aarch64_err_no_fpadvsimd (mode, "varargs");
9737 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9738 unshare_expr (valist), f_vrtop, NULL_TREE);
9739 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9740 unshare_expr (valist), f_vroff, NULL_TREE);
9742 rsize = nregs * UNITS_PER_VREG;
9744 if (is_ha)
9746 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9747 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9749 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9750 && size < UNITS_PER_VREG)
9752 adjust = UNITS_PER_VREG - size;
9755 else
9757 /* TYPE passed in general registers. */
9758 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9759 unshare_expr (valist), f_grtop, NULL_TREE);
9760 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9761 unshare_expr (valist), f_groff, NULL_TREE);
9762 rsize = ROUND_UP (size, UNITS_PER_WORD);
9763 nregs = rsize / UNITS_PER_WORD;
9765 if (align > 8)
9766 dw_align = true;
9768 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9769 && size < UNITS_PER_WORD)
9771 adjust = UNITS_PER_WORD - size;
9775 /* Get a local temporary for the field value. */
9776 off = get_initialized_tmp_var (f_off, pre_p, NULL);
9778 /* Emit code to branch if off >= 0. */
9779 t = build2 (GE_EXPR, boolean_type_node, off,
9780 build_int_cst (TREE_TYPE (off), 0));
9781 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9783 if (dw_align)
9785 /* Emit: offs = (offs + 15) & -16. */
9786 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9787 build_int_cst (TREE_TYPE (off), 15));
9788 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9789 build_int_cst (TREE_TYPE (off), -16));
9790 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9792 else
9793 roundup = NULL;
9795 /* Update ap.__[g|v]r_offs */
9796 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9797 build_int_cst (TREE_TYPE (off), rsize));
9798 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9800 /* String up. */
9801 if (roundup)
9802 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9804 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9805 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9806 build_int_cst (TREE_TYPE (f_off), 0));
9807 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9809 /* String up: make sure the assignment happens before the use. */
9810 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9811 COND_EXPR_ELSE (cond1) = t;
9813 /* Prepare the trees handling the argument that is passed on the stack;
9814 the top-level node will be stored in ON_STACK. */
9815 arg = get_initialized_tmp_var (stack, pre_p, NULL);
9816 if (align > 8)
9818 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9819 t = fold_convert (intDI_type_node, arg);
9820 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9821 build_int_cst (TREE_TYPE (t), 15));
9822 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9823 build_int_cst (TREE_TYPE (t), -16));
9824 t = fold_convert (TREE_TYPE (arg), t);
9825 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9827 else
9828 roundup = NULL;
9829 /* Advance ap.__stack */
9830 t = fold_convert (intDI_type_node, arg);
9831 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9832 build_int_cst (TREE_TYPE (t), size + 7));
9833 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9834 build_int_cst (TREE_TYPE (t), -8));
9835 t = fold_convert (TREE_TYPE (arg), t);
9836 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9837 /* String up roundup and advance. */
9838 if (roundup)
9839 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9840 /* String up with arg */
9841 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9842 /* Big-endianness related address adjustment. */
9843 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9844 && size < UNITS_PER_WORD)
9846 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9847 size_int (UNITS_PER_WORD - size));
9848 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9851 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9852 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9854 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9855 t = off;
9856 if (adjust)
9857 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9858 build_int_cst (TREE_TYPE (off), adjust));
9860 t = fold_convert (sizetype, t);
9861 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9863 if (is_ha)
9865 /* type ha; // treat as "struct {ftype field[n];}"
9866 ... [computing offs]
9867 for (i = 0; i <nregs; ++i, offs += 16)
9868 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9869 return ha; */
9870 int i;
9871 tree tmp_ha, field_t, field_ptr_t;
9873 /* Declare a local variable. */
9874 tmp_ha = create_tmp_var_raw (type, "ha");
9875 gimple_add_tmp_var (tmp_ha);
9877 /* Establish the base type. */
9878 switch (ag_mode)
9880 case SFmode:
9881 field_t = float_type_node;
9882 field_ptr_t = float_ptr_type_node;
9883 break;
9884 case DFmode:
9885 field_t = double_type_node;
9886 field_ptr_t = double_ptr_type_node;
9887 break;
9888 case TFmode:
9889 field_t = long_double_type_node;
9890 field_ptr_t = long_double_ptr_type_node;
9891 break;
9892 case HFmode:
9893 field_t = aarch64_fp16_type_node;
9894 field_ptr_t = aarch64_fp16_ptr_type_node;
9895 break;
9896 case V2SImode:
9897 case V4SImode:
9899 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
9900 field_t = build_vector_type_for_mode (innertype, ag_mode);
9901 field_ptr_t = build_pointer_type (field_t);
9903 break;
9904 default:
9905 gcc_assert (0);
9908 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
9909 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
9910 addr = t;
9911 t = fold_convert (field_ptr_t, addr);
9912 t = build2 (MODIFY_EXPR, field_t,
9913 build1 (INDIRECT_REF, field_t, tmp_ha),
9914 build1 (INDIRECT_REF, field_t, t));
9916 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
9917 for (i = 1; i < nregs; ++i)
9919 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
9920 u = fold_convert (field_ptr_t, addr);
9921 u = build2 (MODIFY_EXPR, field_t,
9922 build2 (MEM_REF, field_t, tmp_ha,
9923 build_int_cst (field_ptr_t,
9924 (i *
9925 int_size_in_bytes (field_t)))),
9926 build1 (INDIRECT_REF, field_t, u));
9927 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
9930 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
9931 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
9934 COND_EXPR_ELSE (cond2) = t;
9935 addr = fold_convert (build_pointer_type (type), cond1);
9936 addr = build_va_arg_indirect_ref (addr);
9938 if (indirect_p)
9939 addr = build_va_arg_indirect_ref (addr);
9941 return addr;
9944 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
9946 static void
9947 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9948 tree type, int *pretend_size ATTRIBUTE_UNUSED,
9949 int no_rtl)
9951 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9952 CUMULATIVE_ARGS local_cum;
9953 int gr_saved = cfun->va_list_gpr_size;
9954 int vr_saved = cfun->va_list_fpr_size;
9956 /* The caller has advanced CUM up to, but not beyond, the last named
9957 argument. Advance a local copy of CUM past the last "real" named
9958 argument, to find out how many registers are left over. */
9959 local_cum = *cum;
9960 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
9962 /* Find out how many registers we need to save.
9963 Honor the tree-stdarg analysis results. */
9964 if (cfun->va_list_gpr_size)
9965 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
9966 cfun->va_list_gpr_size / UNITS_PER_WORD);
9967 if (cfun->va_list_fpr_size)
9968 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
9969 cfun->va_list_fpr_size / UNITS_PER_VREG);
9971 if (!TARGET_FLOAT)
9973 gcc_assert (local_cum.aapcs_nvrn == 0);
9974 vr_saved = 0;
9977 if (!no_rtl)
9979 if (gr_saved > 0)
9981 rtx ptr, mem;
9983 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
9984 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
9985 - gr_saved * UNITS_PER_WORD);
9986 mem = gen_frame_mem (BLKmode, ptr);
9987 set_mem_alias_set (mem, get_varargs_alias_set ());
9989 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
9990 mem, gr_saved);
9992 if (vr_saved > 0)
9994 /* We can't use move_block_from_reg, because it will use
9995 the wrong mode, storing D regs only. */
9996 machine_mode mode = TImode;
9997 int off, i, vr_start;
9999 /* Set OFF to the offset from virtual_incoming_args_rtx of
10000 the first vector register. The VR save area lies below
10001 the GR one, and is aligned to 16 bytes. */
10002 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10003 STACK_BOUNDARY / BITS_PER_UNIT);
10004 off -= vr_saved * UNITS_PER_VREG;
10006 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10007 for (i = 0; i < vr_saved; ++i)
10009 rtx ptr, mem;
10011 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10012 mem = gen_frame_mem (mode, ptr);
10013 set_mem_alias_set (mem, get_varargs_alias_set ());
10014 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10015 off += UNITS_PER_VREG;
10020 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10021 any complication of having crtl->args.pretend_args_size changed. */
10022 cfun->machine->frame.saved_varargs_size
10023 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10024 STACK_BOUNDARY / BITS_PER_UNIT)
10025 + vr_saved * UNITS_PER_VREG);
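/* Worked example: with 3 general registers and 2 vector registers still to
   be dumped (UNITS_PER_WORD == 8, UNITS_PER_VREG == 16,
   STACK_BOUNDARY / BITS_PER_UNIT == 16),

     saved_varargs_size = ROUND_UP (3 * 8, 16) + 2 * 16 = 32 + 32 = 64

   and the first vector register is stored at offset
   -ROUND_UP (3 * 8, 16) - 2 * 16 = -64 from virtual_incoming_args_rtx,
   with the GR words themselves occupying the 24 bytes just below that
   pointer.  */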
10028 static void
10029 aarch64_conditional_register_usage (void)
10031 int i;
10032 if (!TARGET_FLOAT)
10034 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10036 fixed_regs[i] = 1;
10037 call_used_regs[i] = 1;
10042 /* Walk down the type tree of TYPE counting consecutive base elements.
10043 If *MODEP is VOIDmode, then set it to the first valid floating point
10044 type. If a non-floating point type is found, or if a floating point
10045 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10046 otherwise return the count in the sub-tree. */
10047 static int
10048 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10050 machine_mode mode;
10051 HOST_WIDE_INT size;
10053 switch (TREE_CODE (type))
10055 case REAL_TYPE:
10056 mode = TYPE_MODE (type);
10057 if (mode != DFmode && mode != SFmode
10058 && mode != TFmode && mode != HFmode)
10059 return -1;
10061 if (*modep == VOIDmode)
10062 *modep = mode;
10064 if (*modep == mode)
10065 return 1;
10067 break;
10069 case COMPLEX_TYPE:
10070 mode = TYPE_MODE (TREE_TYPE (type));
10071 if (mode != DFmode && mode != SFmode
10072 && mode != TFmode && mode != HFmode)
10073 return -1;
10075 if (*modep == VOIDmode)
10076 *modep = mode;
10078 if (*modep == mode)
10079 return 2;
10081 break;
10083 case VECTOR_TYPE:
10084 /* Use V2SImode and V4SImode as representatives of all 64-bit
10085 and 128-bit vector types. */
10086 size = int_size_in_bytes (type);
10087 switch (size)
10089 case 8:
10090 mode = V2SImode;
10091 break;
10092 case 16:
10093 mode = V4SImode;
10094 break;
10095 default:
10096 return -1;
10099 if (*modep == VOIDmode)
10100 *modep = mode;
10102 /* Vector modes are considered to be opaque: two vectors are
10103 equivalent for the purposes of being homogeneous aggregates
10104 if they are the same size. */
10105 if (*modep == mode)
10106 return 1;
10108 break;
10110 case ARRAY_TYPE:
10112 int count;
10113 tree index = TYPE_DOMAIN (type);
10115 /* Can't handle incomplete types nor sizes that are not
10116 fixed. */
10117 if (!COMPLETE_TYPE_P (type)
10118 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10119 return -1;
10121 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10122 if (count == -1
10123 || !index
10124 || !TYPE_MAX_VALUE (index)
10125 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10126 || !TYPE_MIN_VALUE (index)
10127 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10128 || count < 0)
10129 return -1;
10131 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10132 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10134 /* There must be no padding. */
10135 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10136 return -1;
10138 return count;
10141 case RECORD_TYPE:
10143 int count = 0;
10144 int sub_count;
10145 tree field;
10147 /* Can't handle incomplete types nor sizes that are not
10148 fixed. */
10149 if (!COMPLETE_TYPE_P (type)
10150 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10151 return -1;
10153 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10155 if (TREE_CODE (field) != FIELD_DECL)
10156 continue;
10158 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10159 if (sub_count < 0)
10160 return -1;
10161 count += sub_count;
10164 /* There must be no padding. */
10165 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10166 return -1;
10168 return count;
10171 case UNION_TYPE:
10172 case QUAL_UNION_TYPE:
10174 /* These aren't very interesting except in a degenerate case. */
10175 int count = 0;
10176 int sub_count;
10177 tree field;
10179 /* Can't handle incomplete types nor sizes that are not
10180 fixed. */
10181 if (!COMPLETE_TYPE_P (type)
10182 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10183 return -1;
10185 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10187 if (TREE_CODE (field) != FIELD_DECL)
10188 continue;
10190 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10191 if (sub_count < 0)
10192 return -1;
10193 count = count > sub_count ? count : sub_count;
10196 /* There must be no padding. */
10197 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10198 return -1;
10200 return count;
10203 default:
10204 break;
10207 return -1;
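/* Illustrative examples; the type names are hypothetical.  They show what
   aapcs_vfp_sub_candidate returns for a few C types; the caller only
   accepts counts between 1 and HA_MAX_NUM_FLDS.  */
#if 0
struct hfa3   { float x, y, z; };      /* 3 SFmode elements -> returns 3.  */
struct hda4   { double d[4]; };        /* 4 DFmode elements -> returns 4.  */
struct mixed  { float x; double y; };  /* SFmode vs DFmode mismatch -> -1.  */
struct padded { float x; int pad; };   /* Non-FP member -> -1.  */
struct big    { float f[5]; };         /* Returns 5; rejected by the caller
                                          because 5 > HA_MAX_NUM_FLDS.  */
#endif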
10210 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10211 type as described in AAPCS64 \S 4.1.2.
10213 See the comment above aarch64_composite_type_p for the notes on MODE. */
10215 static bool
10216 aarch64_short_vector_p (const_tree type,
10217 machine_mode mode)
10219 HOST_WIDE_INT size = -1;
10221 if (type && TREE_CODE (type) == VECTOR_TYPE)
10222 size = int_size_in_bytes (type);
10223 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10224 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10225 size = GET_MODE_SIZE (mode);
10227 return (size == 8 || size == 16);
10230 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10231 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10232 array types. The C99 floating-point complex types are also considered
10233 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10234 types, which are GCC extensions and out of the scope of AAPCS64, are
10235 treated as composite types here as well.
10237 Note that MODE itself is not sufficient in determining whether a type
10238 is such a composite type or not. This is because
10239 stor-layout.c:compute_record_mode may have already changed the MODE
10240 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10241 structure with only one field may have its MODE set to the mode of the
10242 field. Also an integer mode whose size matches the size of the
10243 RECORD_TYPE type may be used to substitute the original mode
10244 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10245 solely relied on. */
10247 static bool
10248 aarch64_composite_type_p (const_tree type,
10249 machine_mode mode)
10251 if (aarch64_short_vector_p (type, mode))
10252 return false;
10254 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10255 return true;
10257 if (mode == BLKmode
10258 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10259 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10260 return true;
10262 return false;
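/* Illustrative example; the type name is hypothetical.  It shows why the
   TYPE check above cannot be replaced by a check on MODE alone:
   stor-layout gives this single-field record the mode of its field
   (SFmode), yet it is still a composite type for AAPCS64 purposes and is
   classified here through AGGREGATE_TYPE_P.  */
#if 0
struct wrapped_float { float f; };     /* TYPE_MODE == SFmode, composite.  */
#endif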
10265 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10266 shall be passed or returned in simd/fp register(s) (providing these
10267 parameter passing registers are available).
10269 Upon successful return, *COUNT returns the number of needed registers,
10270 *BASE_MODE returns the mode of the individual register and when IS_HA
10271 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10272 floating-point aggregate or a homogeneous short-vector aggregate. */
10274 static bool
10275 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10276 const_tree type,
10277 machine_mode *base_mode,
10278 int *count,
10279 bool *is_ha)
10281 machine_mode new_mode = VOIDmode;
10282 bool composite_p = aarch64_composite_type_p (type, mode);
10284 if (is_ha != NULL) *is_ha = false;
10286 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10287 || aarch64_short_vector_p (type, mode))
10289 *count = 1;
10290 new_mode = mode;
10292 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10294 if (is_ha != NULL) *is_ha = true;
10295 *count = 2;
10296 new_mode = GET_MODE_INNER (mode);
10298 else if (type && composite_p)
10300 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10302 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10304 if (is_ha != NULL) *is_ha = true;
10305 *count = ag_count;
10307 else
10308 return false;
10310 else
10311 return false;
10313 *base_mode = new_mode;
10314 return true;
10317 /* Implement TARGET_STRUCT_VALUE_RTX. */
10319 static rtx
10320 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10321 int incoming ATTRIBUTE_UNUSED)
10323 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10326 /* Implements target hook vector_mode_supported_p. */
10327 static bool
10328 aarch64_vector_mode_supported_p (machine_mode mode)
10330 if (TARGET_SIMD
10331 && (mode == V4SImode || mode == V8HImode
10332 || mode == V16QImode || mode == V2DImode
10333 || mode == V2SImode || mode == V4HImode
10334 || mode == V8QImode || mode == V2SFmode
10335 || mode == V4SFmode || mode == V2DFmode
10336 || mode == V4HFmode || mode == V8HFmode
10337 || mode == V1DFmode))
10338 return true;
10340 return false;
10343 /* Return appropriate SIMD container
10344 for MODE within a vector of WIDTH bits. */
10345 static machine_mode
10346 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10348 gcc_assert (width == 64 || width == 128);
10349 if (TARGET_SIMD)
10351 if (width == 128)
10352 switch (mode)
10354 case DFmode:
10355 return V2DFmode;
10356 case SFmode:
10357 return V4SFmode;
10358 case SImode:
10359 return V4SImode;
10360 case HImode:
10361 return V8HImode;
10362 case QImode:
10363 return V16QImode;
10364 case DImode:
10365 return V2DImode;
10366 default:
10367 break;
10369 else
10370 switch (mode)
10372 case SFmode:
10373 return V2SFmode;
10374 case SImode:
10375 return V2SImode;
10376 case HImode:
10377 return V4HImode;
10378 case QImode:
10379 return V8QImode;
10380 default:
10381 break;
10384 return word_mode;
10387 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10388 static machine_mode
10389 aarch64_preferred_simd_mode (machine_mode mode)
10391 return aarch64_simd_container_mode (mode, 128);
10394 /* Return the bitmask of possible vector sizes for the vectorizer
10395 to iterate over. */
10396 static unsigned int
10397 aarch64_autovectorize_vector_sizes (void)
10399 return (16 | 8);
10402 /* Implement TARGET_MANGLE_TYPE. */
10404 static const char *
10405 aarch64_mangle_type (const_tree type)
10407 /* The AArch64 ABI documents say that "__va_list" has to be
10408 mangled as if it is in the "std" namespace. */
10409 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10410 return "St9__va_list";
10412 /* Half-precision float. */
10413 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10414 return "Dh";
10416 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10417 builtin types. */
10418 if (TYPE_NAME (type) != NULL)
10419 return aarch64_mangle_builtin_type (type);
10421 /* Use the default mangling. */
10422 return NULL;
10426 /* Return true if the rtx_insn contains a MEM RTX somewhere
10427 in it. */
10429 static bool
10430 has_memory_op (rtx_insn *mem_insn)
10432 subrtx_iterator::array_type array;
10433 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10434 if (MEM_P (*iter))
10435 return true;
10437 return false;
10440 /* Find the first rtx_insn before insn that will generate an assembly
10441 instruction. */
10443 static rtx_insn *
10444 aarch64_prev_real_insn (rtx_insn *insn)
10446 if (!insn)
10447 return NULL;
10451 insn = prev_real_insn (insn);
10453 while (insn && recog_memoized (insn) < 0);
10455 return insn;
10458 static bool
10459 is_madd_op (enum attr_type t1)
10461 unsigned int i;
10462 /* A number of these may be AArch32 only. */
10463 enum attr_type mlatypes[] = {
10464 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10465 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10466 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10469 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10471 if (t1 == mlatypes[i])
10472 return true;
10475 return false;
10478 /* Check if there is a register dependency between a load and the insn
10479 for which we hold recog_data. */
10481 static bool
10482 dep_between_memop_and_curr (rtx memop)
10484 rtx load_reg;
10485 int opno;
10487 gcc_assert (GET_CODE (memop) == SET);
10489 if (!REG_P (SET_DEST (memop)))
10490 return false;
10492 load_reg = SET_DEST (memop);
10493 for (opno = 1; opno < recog_data.n_operands; opno++)
10495 rtx operand = recog_data.operand[opno];
10496 if (REG_P (operand)
10497 && reg_overlap_mentioned_p (load_reg, operand))
10498 return true;
10501 return false;
10505 /* When working around the Cortex-A53 erratum 835769,
10506 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10507 instruction and has a preceding memory instruction such that a NOP
10508 should be inserted between them. */
10510 bool
10511 aarch64_madd_needs_nop (rtx_insn* insn)
10513 enum attr_type attr_type;
10514 rtx_insn *prev;
10515 rtx body;
10517 if (!TARGET_FIX_ERR_A53_835769)
10518 return false;
10520 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10521 return false;
10523 attr_type = get_attr_type (insn);
10524 if (!is_madd_op (attr_type))
10525 return false;
10527 prev = aarch64_prev_real_insn (insn);
10528 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10529 Restore recog state to INSN to avoid state corruption. */
10530 extract_constrain_insn_cached (insn);
10532 if (!prev || !has_memory_op (prev))
10533 return false;
10535 body = single_set (prev);
10537 /* If the previous insn is a memory op and there is no dependency between
10538 it and the DImode madd, emit a NOP between them. If body is NULL then we
10539 have a complex memory operation, probably a load/store pair.
10540 Be conservative for now and emit a NOP. */
10541 if (GET_MODE (recog_data.operand[0]) == DImode
10542 && (!body || !dep_between_memop_and_curr (body)))
10543 return true;
10545 return false;
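/* Worked example; register numbers are arbitrary.  With
   -mfix-cortex-a53-835769, a 64-bit multiply-accumulate that follows a
   memory access with no register dependency between them triggers the
   workaround:

        ldr     x1, [x2]
        madd    x3, x4, x5, x6

   becomes, via FINAL_PRESCAN_INSN below,

        ldr     x1, [x2]
        nop     // between mem op and mult-accumulate
        madd    x3, x4, x5, x6

   whereas a dependent pair (say the madd reading x1) is left alone, since
   dep_between_memop_and_curr detects the overlap.  */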
10550 /* Implement FINAL_PRESCAN_INSN. */
10552 void
10553 aarch64_final_prescan_insn (rtx_insn *insn)
10555 if (aarch64_madd_needs_nop (insn))
10556 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
10560 /* Return the equivalent letter for size. */
10561 static char
10562 sizetochar (int size)
10564 switch (size)
10566 case 64: return 'd';
10567 case 32: return 's';
10568 case 16: return 'h';
10569 case 8 : return 'b';
10570 default: gcc_unreachable ();
10574 /* Return true iff x is a uniform vector of floating-point
10575 constants, and the constant can be represented in
10576 quarter-precision form.  Note that, as aarch64_float_const_representable_p
10577 rejects both +0.0 and -0.0, this function rejects them too.  */
10578 static bool
10579 aarch64_vect_float_const_representable_p (rtx x)
10581 rtx elt;
10582 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10583 && const_vec_duplicate_p (x, &elt)
10584 && aarch64_float_const_representable_p (elt));
10587 /* Return true for valid and false for invalid. */
10588 bool
10589 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10590 struct simd_immediate_info *info)
10592 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10593 matches = 1; \
10594 for (i = 0; i < idx; i += (STRIDE)) \
10595 if (!(TEST)) \
10596 matches = 0; \
10597 if (matches) \
10599 immtype = (CLASS); \
10600 elsize = (ELSIZE); \
10601 eshift = (SHIFT); \
10602 emvn = (NEG); \
10603 break; \
10606 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10607 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10608 unsigned char bytes[16];
10609 int immtype = -1, matches;
10610 unsigned int invmask = inverse ? 0xff : 0;
10611 int eshift, emvn;
10613 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10615 if (! (aarch64_simd_imm_zero_p (op, mode)
10616 || aarch64_vect_float_const_representable_p (op)))
10617 return false;
10619 if (info)
10621 info->value = CONST_VECTOR_ELT (op, 0);
10622 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10623 info->mvn = false;
10624 info->shift = 0;
10627 return true;
10630 /* Splat vector constant out into a byte vector. */
10631 for (i = 0; i < n_elts; i++)
10633 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
10634 it must be laid out in the vector register in reverse order. */
10635 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10636 unsigned HOST_WIDE_INT elpart;
10638 gcc_assert (CONST_INT_P (el));
10639 elpart = INTVAL (el);
10641 for (unsigned int byte = 0; byte < innersize; byte++)
10643 bytes[idx++] = (elpart & 0xff) ^ invmask;
10644 elpart >>= BITS_PER_UNIT;
10649 /* Sanity check. */
10650 gcc_assert (idx == GET_MODE_SIZE (mode));
10654 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10655 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10657 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10658 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10660 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10661 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10663 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10664 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10666 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10668 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10670 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10671 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10673 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10674 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10676 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10677 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10679 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10680 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10682 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10684 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10686 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10687 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10689 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10690 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10692 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10693 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10695 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10696 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10698 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10700 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10701 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10703 while (0);
10705 if (immtype == -1)
10706 return false;
10708 if (info)
10710 info->element_width = elsize;
10711 info->mvn = emvn != 0;
10712 info->shift = eshift;
10714 unsigned HOST_WIDE_INT imm = 0;
10716 if (immtype >= 12 && immtype <= 15)
10717 info->msl = true;
10719 /* Un-invert bytes of recognized vector, if necessary. */
10720 if (invmask != 0)
10721 for (i = 0; i < idx; i++)
10722 bytes[i] ^= invmask;
10724 if (immtype == 17)
10726 /* FIXME: Broken on 32-bit H_W_I hosts. */
10727 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10729 for (i = 0; i < 8; i++)
10730 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10731 << (i * BITS_PER_UNIT);
10734 info->value = GEN_INT (imm);
10736 else
10738 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10739 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10741 /* Construct 'abcdefgh' because the assembler cannot handle
10742 generic constants. */
10743 if (info->mvn)
10744 imm = ~imm;
10745 imm = (imm >> info->shift) & 0xff;
10746 info->value = GEN_INT (imm);
10750 return true;
10751 #undef CHECK
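/* Worked example.  A V4SImode constant with every element equal to
   0x00ff0000 splats to the byte pattern { 00 00 ff 00, 00 00 ff 00, ... },
   which matches the third CHECK above (immtype 2, elsize 32, shift 16,
   no MVN).  The returned info therefore carries the 'abcdefgh' byte 0xff
   with a shift of 16, which aarch64_output_simd_mov_immediate later
   renders as "movi\t%0.4s, 0xff, lsl 16".  A replicated 0x12345678, by
   contrast, matches none of the patterns and the function returns
   false.  */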
10754 /* Check if immediate shift constants are within range. */
10755 bool
10756 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10758 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10759 if (left)
10760 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10761 else
10762 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
10765 /* Return true if X is a uniform vector where all elements
10766 are either the floating-point constant 0.0 or the
10767 integer constant 0. */
10768 bool
10769 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10771 return x == CONST0_RTX (mode);
10775 /* Return the bitmask CONST_INT to select the bits required by a zero extract
10776 operation of width WIDTH at bit position POS. */
10779 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
10781 gcc_assert (CONST_INT_P (width));
10782 gcc_assert (CONST_INT_P (pos));
10784 unsigned HOST_WIDE_INT mask
10785 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
10786 return GEN_INT (mask << UINTVAL (pos));
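/* Worked example: a zero_extract of WIDTH 8 at POS 16 yields
   ((1 << 8) - 1) << 16 == 0x00ff0000, i.e. the CONST_INT selecting
   bits 16..23.  */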
10789 bool
10790 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10792 HOST_WIDE_INT imm = INTVAL (x);
10793 int i;
10795 for (i = 0; i < 8; i++)
10797 unsigned int byte = imm & 0xff;
10798 if (byte != 0xff && byte != 0)
10799 return false;
10800 imm >>= 8;
10803 return true;
10806 bool
10807 aarch64_mov_operand_p (rtx x, machine_mode mode)
10809 if (GET_CODE (x) == HIGH
10810 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10811 return true;
10813 if (CONST_INT_P (x))
10814 return true;
10816 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10817 return true;
10819 return aarch64_classify_symbolic_expression (x)
10820 == SYMBOL_TINY_ABSOLUTE;
10823 /* Return a const_int vector of VAL. */
10825 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10827 int nunits = GET_MODE_NUNITS (mode);
10828 rtvec v = rtvec_alloc (nunits);
10829 int i;
10831 for (i=0; i < nunits; i++)
10832 RTVEC_ELT (v, i) = GEN_INT (val);
10834 return gen_rtx_CONST_VECTOR (mode, v);
10837 /* Check OP is a legal scalar immediate for the MOVI instruction. */
10839 bool
10840 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10842 machine_mode vmode;
10844 gcc_assert (!VECTOR_MODE_P (mode));
10845 vmode = aarch64_preferred_simd_mode (mode);
10846 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10847 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10850 /* Construct and return a PARALLEL RTX vector with elements numbering the
10851 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10852 the vector - from the perspective of the architecture. This does not
10853 line up with GCC's perspective on lane numbers, so we end up with
10854 different masks depending on our target endian-ness. The diagram
10855 below may help. We must draw the distinction when building masks
10856 which select one half of the vector. An instruction selecting
10857 architectural low-lanes for a big-endian target must be described using
10858 a mask selecting GCC high-lanes.
10860 Big-Endian Little-Endian
10862 GCC 0 1 2 3 3 2 1 0
10863 | x | x | x | x | | x | x | x | x |
10864 Architecture 3 2 1 0 3 2 1 0
10866 Low Mask: { 2, 3 } { 0, 1 }
10867 High Mask: { 0, 1 } { 2, 3 }
10871 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10873 int nunits = GET_MODE_NUNITS (mode);
10874 rtvec v = rtvec_alloc (nunits / 2);
10875 int high_base = nunits / 2;
10876 int low_base = 0;
10877 int base;
10878 rtx t1;
10879 int i;
10881 if (BYTES_BIG_ENDIAN)
10882 base = high ? low_base : high_base;
10883 else
10884 base = high ? high_base : low_base;
10886 for (i = 0; i < nunits / 2; i++)
10887 RTVEC_ELT (v, i) = GEN_INT (base + i);
10889 t1 = gen_rtx_PARALLEL (mode, v);
10890 return t1;
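/* Worked example.  For V4SImode (nunits == 4): on little-endian, HIGH
   selects { 2, 3 } and !HIGH selects { 0, 1 }; on big-endian the bases are
   swapped, so HIGH selects { 0, 1 } and !HIGH selects { 2, 3 }, matching
   the Low/High mask table above.  */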
10893 /* Check OP for validity as a PARALLEL RTX vector with elements
10894 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
10895 from the perspective of the architecture. See the diagram above
10896 aarch64_simd_vect_par_cnst_half for more details. */
10898 bool
10899 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
10900 bool high)
10902 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
10903 HOST_WIDE_INT count_op = XVECLEN (op, 0);
10904 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
10905 int i = 0;
10907 if (!VECTOR_MODE_P (mode))
10908 return false;
10910 if (count_op != count_ideal)
10911 return false;
10913 for (i = 0; i < count_ideal; i++)
10915 rtx elt_op = XVECEXP (op, 0, i);
10916 rtx elt_ideal = XVECEXP (ideal, 0, i);
10918 if (!CONST_INT_P (elt_op)
10919 || INTVAL (elt_ideal) != INTVAL (elt_op))
10920 return false;
10922 return true;
10925 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
10926 HIGH (exclusive). */
10927 void
10928 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
10929 const_tree exp)
10931 HOST_WIDE_INT lane;
10932 gcc_assert (CONST_INT_P (operand));
10933 lane = INTVAL (operand);
10935 if (lane < low || lane >= high)
10937 if (exp)
10938 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
10939 else
10940 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
10944 /* Return TRUE if OP is a valid vector addressing mode. */
10945 bool
10946 aarch64_simd_mem_operand_p (rtx op)
10948 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
10949 || REG_P (XEXP (op, 0)));
10952 /* Emit a register copy from operand to operand, taking care not to
10953 early-clobber source registers in the process.
10955 COUNT is the number of components into which the copy needs to be
10956 decomposed. */
10957 void
10958 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
10959 unsigned int count)
10961 unsigned int i;
10962 int rdest = REGNO (operands[0]);
10963 int rsrc = REGNO (operands[1]);
10965 if (!reg_overlap_mentioned_p (operands[0], operands[1])
10966 || rdest < rsrc)
10967 for (i = 0; i < count; i++)
10968 emit_move_insn (gen_rtx_REG (mode, rdest + i),
10969 gen_rtx_REG (mode, rsrc + i));
10970 else
10971 for (i = 0; i < count; i++)
10972 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
10973 gen_rtx_REG (mode, rsrc + count - i - 1));
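/* Worked example: copying an OImode value (COUNT == 2) from the pair
   starting at q1 to the pair starting at q2 overlaps and has
   rdest > rsrc, so the loop above copies backwards, q3 <- q2 then
   q2 <- q1; copying forwards would clobber q2 before it is read.  */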
10976 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
10977 one of VSTRUCT modes: OI, CI, or XI. */
10979 aarch64_simd_attr_length_rglist (enum machine_mode mode)
10981 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
10984 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
10985 alignment of a vector to 128 bits. */
10986 static HOST_WIDE_INT
10987 aarch64_simd_vector_alignment (const_tree type)
10989 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
10990 return MIN (align, 128);
10993 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
10994 static bool
10995 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
10997 if (is_packed)
10998 return false;
11000 /* We guarantee alignment for vectors up to 128-bits. */
11001 if (tree_int_cst_compare (TYPE_SIZE (type),
11002 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11003 return false;
11005 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11006 return true;
11009 /* If VALS is a vector constant that can be loaded into a register
11010 using DUP, generate instructions to do so and return an RTX to
11011 assign to the register. Otherwise return NULL_RTX. */
11012 static rtx
11013 aarch64_simd_dup_constant (rtx vals)
11015 machine_mode mode = GET_MODE (vals);
11016 machine_mode inner_mode = GET_MODE_INNER (mode);
11017 rtx x;
11019 if (!const_vec_duplicate_p (vals, &x))
11020 return NULL_RTX;
11022 /* We can load this constant by using DUP and a constant in a
11023 single ARM register. This will be cheaper than a vector
11024 load. */
11025 x = copy_to_mode_reg (inner_mode, x);
11026 return gen_rtx_VEC_DUPLICATE (mode, x);
11030 /* Generate code to load VALS, which is a PARALLEL containing only
11031 constants (for vec_init) or CONST_VECTOR, efficiently into a
11032 register. Returns an RTX to copy into the register, or NULL_RTX
11033 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11034 static rtx
11035 aarch64_simd_make_constant (rtx vals)
11037 machine_mode mode = GET_MODE (vals);
11038 rtx const_dup;
11039 rtx const_vec = NULL_RTX;
11040 int n_elts = GET_MODE_NUNITS (mode);
11041 int n_const = 0;
11042 int i;
11044 if (GET_CODE (vals) == CONST_VECTOR)
11045 const_vec = vals;
11046 else if (GET_CODE (vals) == PARALLEL)
11048 /* A CONST_VECTOR must contain only CONST_INTs and
11049 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11050 Only store valid constants in a CONST_VECTOR. */
11051 for (i = 0; i < n_elts; ++i)
11053 rtx x = XVECEXP (vals, 0, i);
11054 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11055 n_const++;
11057 if (n_const == n_elts)
11058 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11060 else
11061 gcc_unreachable ();
11063 if (const_vec != NULL_RTX
11064 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11065 /* Load using MOVI/MVNI. */
11066 return const_vec;
11067 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11068 /* Loaded using DUP. */
11069 return const_dup;
11070 else if (const_vec != NULL_RTX)
11071 /* Load from constant pool. We can not take advantage of single-cycle
11072 LD1 because we need a PC-relative addressing mode. */
11073 return const_vec;
11074 else
11075 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11076 We can not construct an initializer. */
11077 return NULL_RTX;
11080 /* Expand a vector initialisation sequence, such that TARGET is
11081 initialised to contain VALS. */
11083 void
11084 aarch64_expand_vector_init (rtx target, rtx vals)
11086 machine_mode mode = GET_MODE (target);
11087 machine_mode inner_mode = GET_MODE_INNER (mode);
11088 /* The number of vector elements. */
11089 int n_elts = GET_MODE_NUNITS (mode);
11090 /* The number of vector elements which are not constant. */
11091 int n_var = 0;
11092 rtx any_const = NULL_RTX;
11093 /* The first element of vals. */
11094 rtx v0 = XVECEXP (vals, 0, 0);
11095 bool all_same = true;
11097 /* Count the number of variable elements to initialise. */
11098 for (int i = 0; i < n_elts; ++i)
11100 rtx x = XVECEXP (vals, 0, i);
11101 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11102 ++n_var;
11103 else
11104 any_const = x;
11106 all_same &= rtx_equal_p (x, v0);
11109 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11110 how best to handle this. */
11111 if (n_var == 0)
11113 rtx constant = aarch64_simd_make_constant (vals);
11114 if (constant != NULL_RTX)
11116 emit_move_insn (target, constant);
11117 return;
11121 /* Splat a single non-constant element if we can. */
11122 if (all_same)
11124 rtx x = copy_to_mode_reg (inner_mode, v0);
11125 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11126 return;
11129 /* Initialise a vector which is part-variable. We want to first try
11130 to build those lanes which are constant in the most efficient way we
11131 can. */
11132 if (n_var != n_elts)
11134 rtx copy = copy_rtx (vals);
11136 /* Load constant part of vector. We really don't care what goes into the
11137 parts we will overwrite, but we're more likely to be able to load the
11138 constant efficiently if it has fewer, larger, repeating parts
11139 (see aarch64_simd_valid_immediate). */
11140 for (int i = 0; i < n_elts; i++)
11142 rtx x = XVECEXP (vals, 0, i);
11143 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11144 continue;
11145 rtx subst = any_const;
11146 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11148 /* Look in the copied vector, as more elements are const. */
11149 rtx test = XVECEXP (copy, 0, i ^ bit);
11150 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11152 subst = test;
11153 break;
11156 XVECEXP (copy, 0, i) = subst;
11158 aarch64_expand_vector_init (target, copy);
11161 /* Insert the variable lanes directly. */
11163 enum insn_code icode = optab_handler (vec_set_optab, mode);
11164 gcc_assert (icode != CODE_FOR_nothing);
11166 for (int i = 0; i < n_elts; i++)
11168 rtx x = XVECEXP (vals, 0, i);
11169 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11170 continue;
11171 x = copy_to_mode_reg (inner_mode, x);
11172 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
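/* Illustrative example; the typedef and function name are hypothetical.
   For the initializer below, the constant lanes are built first: variable
   lane 0 borrows the constant from lane 0 ^ 2 == 2, so the recursive call
   materialises { 2, 1, 2, 3 }, and X is then inserted into lane 0 through
   the vec_set pattern.  */
#if 0
typedef int v4si __attribute__ ((vector_size (16)));

static v4si
make_v4si (int x)
{
  return (v4si) { x, 1, 2, 3 };
}
#endif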
11176 static unsigned HOST_WIDE_INT
11177 aarch64_shift_truncation_mask (machine_mode mode)
11179 return
11180 (!SHIFT_COUNT_TRUNCATED
11181 || aarch64_vector_mode_supported_p (mode)
11182 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11185 /* Select a format to encode pointers in exception handling data. */
11187 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11189 int type;
11190 switch (aarch64_cmodel)
11192 case AARCH64_CMODEL_TINY:
11193 case AARCH64_CMODEL_TINY_PIC:
11194 case AARCH64_CMODEL_SMALL:
11195 case AARCH64_CMODEL_SMALL_PIC:
11196 case AARCH64_CMODEL_SMALL_SPIC:
11197 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11198 for everything. */
11199 type = DW_EH_PE_sdata4;
11200 break;
11201 default:
11202 /* No assumptions here. 8-byte relocs required. */
11203 type = DW_EH_PE_sdata8;
11204 break;
11206 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11209 /* The last .arch and .tune assembly strings that we printed. */
11210 static std::string aarch64_last_printed_arch_string;
11211 static std::string aarch64_last_printed_tune_string;
11213 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11214 by the function fndecl. */
11216 void
11217 aarch64_declare_function_name (FILE *stream, const char* name,
11218 tree fndecl)
11220 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11222 struct cl_target_option *targ_options;
11223 if (target_parts)
11224 targ_options = TREE_TARGET_OPTION (target_parts);
11225 else
11226 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11227 gcc_assert (targ_options);
11229 const struct processor *this_arch
11230 = aarch64_get_arch (targ_options->x_explicit_arch);
11232 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11233 std::string extension
11234 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11235 this_arch->flags);
11236 /* Only update the assembler .arch string if it is distinct from the last
11237 such string we printed. */
11238 std::string to_print = this_arch->name + extension;
11239 if (to_print != aarch64_last_printed_arch_string)
11241 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11242 aarch64_last_printed_arch_string = to_print;
11245 /* Print the cpu name we're tuning for in the comments; it might be
11246 useful to readers of the generated asm.  Do it only when it changes
11247 from function to function and verbose assembly is requested. */
11248 const struct processor *this_tune
11249 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11251 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11253 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11254 this_tune->name);
11255 aarch64_last_printed_tune_string = this_tune->name;
11258 /* Don't forget the type directive for ELF. */
11259 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11260 ASM_OUTPUT_LABEL (stream, name);
11263 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11265 static void
11266 aarch64_start_file (void)
11268 struct cl_target_option *default_options
11269 = TREE_TARGET_OPTION (target_option_default_node);
11271 const struct processor *default_arch
11272 = aarch64_get_arch (default_options->x_explicit_arch);
11273 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11274 std::string extension
11275 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11276 default_arch->flags);
11278 aarch64_last_printed_arch_string = default_arch->name + extension;
11279 aarch64_last_printed_tune_string = "";
11280 asm_fprintf (asm_out_file, "\t.arch %s\n",
11281 aarch64_last_printed_arch_string.c_str ());
11283 default_file_start ();
11286 /* Emit load exclusive. */
11288 static void
11289 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11290 rtx mem, rtx model_rtx)
11292 rtx (*gen) (rtx, rtx, rtx);
11294 switch (mode)
11296 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11297 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11298 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11299 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11300 default:
11301 gcc_unreachable ();
11304 emit_insn (gen (rval, mem, model_rtx));
11307 /* Emit store exclusive. */
11309 static void
11310 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11311 rtx rval, rtx mem, rtx model_rtx)
11313 rtx (*gen) (rtx, rtx, rtx, rtx);
11315 switch (mode)
11317 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11318 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11319 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11320 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11321 default:
11322 gcc_unreachable ();
11325 emit_insn (gen (bval, rval, mem, model_rtx));
11328 /* Emit INSN as a jump instruction and mark it as unlikely to be taken. */
11330 static void
11331 aarch64_emit_unlikely_jump (rtx insn)
11333 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11335 insn = emit_jump_insn (insn);
11336 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
11339 /* Expand a compare and swap pattern. */
11341 void
11342 aarch64_expand_compare_and_swap (rtx operands[])
11344 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11345 machine_mode mode, cmp_mode;
11346 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11347 int idx;
11348 gen_cas_fn gen;
11349 const gen_cas_fn split_cas[] =
11351 gen_aarch64_compare_and_swapqi,
11352 gen_aarch64_compare_and_swaphi,
11353 gen_aarch64_compare_and_swapsi,
11354 gen_aarch64_compare_and_swapdi
11356 const gen_cas_fn atomic_cas[] =
11358 gen_aarch64_compare_and_swapqi_lse,
11359 gen_aarch64_compare_and_swaphi_lse,
11360 gen_aarch64_compare_and_swapsi_lse,
11361 gen_aarch64_compare_and_swapdi_lse
11364 bval = operands[0];
11365 rval = operands[1];
11366 mem = operands[2];
11367 oldval = operands[3];
11368 newval = operands[4];
11369 is_weak = operands[5];
11370 mod_s = operands[6];
11371 mod_f = operands[7];
11372 mode = GET_MODE (mem);
11373 cmp_mode = mode;
11375 /* Normally the succ memory model must be stronger than fail, but in the
11376 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11377 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11379 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11380 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11381 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11383 switch (mode)
11385 case QImode:
11386 case HImode:
11387 /* For short modes, we're going to perform the comparison in SImode,
11388 so do the zero-extension now. */
11389 cmp_mode = SImode;
11390 rval = gen_reg_rtx (SImode);
11391 oldval = convert_modes (SImode, mode, oldval, true);
11392 /* Fall through. */
11394 case SImode:
11395 case DImode:
11396 /* Force the value into a register if needed. */
11397 if (!aarch64_plus_operand (oldval, mode))
11398 oldval = force_reg (cmp_mode, oldval);
11399 break;
11401 default:
11402 gcc_unreachable ();
11405 switch (mode)
11407 case QImode: idx = 0; break;
11408 case HImode: idx = 1; break;
11409 case SImode: idx = 2; break;
11410 case DImode: idx = 3; break;
11411 default:
11412 gcc_unreachable ();
11414 if (TARGET_LSE)
11415 gen = atomic_cas[idx];
11416 else
11417 gen = split_cas[idx];
11419 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11421 if (mode == QImode || mode == HImode)
11422 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11424 x = gen_rtx_REG (CCmode, CC_REGNUM);
11425 x = gen_rtx_EQ (SImode, x, const0_rtx);
11426 emit_insn (gen_rtx_SET (bval, x));
11429 /* Test whether the target supports using an atomic load-operate instruction.
11430 CODE is the operation and AFTER is TRUE if the data in memory after the
11431 operation should be returned and FALSE if the data before the operation
11432 should be returned. Returns FALSE if the operation isn't supported by the
11433 architecture. */
11435 bool
11436 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11438 if (!TARGET_LSE)
11439 return false;
11441 switch (code)
11443 case SET:
11444 case AND:
11445 case IOR:
11446 case XOR:
11447 case MINUS:
11448 case PLUS:
11449 return true;
11450 default:
11451 return false;
11455 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
11456 sequence implementing an atomic operation. */
11458 static void
11459 aarch64_emit_post_barrier (enum memmodel model)
11461 const enum memmodel base_model = memmodel_base (model);
11463 if (is_mm_sync (model)
11464 && (base_model == MEMMODEL_ACQUIRE
11465 || base_model == MEMMODEL_ACQ_REL
11466 || base_model == MEMMODEL_SEQ_CST))
11468 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11472 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11473 for the data in memory. EXPECTED is the value expected to be in memory.
11474 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11475 is the memory ordering to use. */
11477 void
11478 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11479 rtx expected, rtx desired,
11480 rtx model)
11482 rtx (*gen) (rtx, rtx, rtx, rtx);
11483 machine_mode mode;
11485 mode = GET_MODE (mem);
11487 switch (mode)
11489 case QImode: gen = gen_aarch64_atomic_casqi; break;
11490 case HImode: gen = gen_aarch64_atomic_cashi; break;
11491 case SImode: gen = gen_aarch64_atomic_cassi; break;
11492 case DImode: gen = gen_aarch64_atomic_casdi; break;
11493 default:
11494 gcc_unreachable ();
11497 /* Move the expected value into the CAS destination register. */
11498 emit_insn (gen_rtx_SET (rval, expected));
11500 /* Emit the CAS. */
11501 emit_insn (gen (rval, mem, desired, model));
11503 /* Compare the expected value with the value loaded by the CAS, to establish
11504 whether the swap was made. */
11505 aarch64_gen_compare_reg (EQ, rval, expected);
11508 /* Split a compare and swap pattern. */
11510 void
11511 aarch64_split_compare_and_swap (rtx operands[])
11513 rtx rval, mem, oldval, newval, scratch;
11514 machine_mode mode;
11515 bool is_weak;
11516 rtx_code_label *label1, *label2;
11517 rtx x, cond;
11518 enum memmodel model;
11519 rtx model_rtx;
11521 rval = operands[0];
11522 mem = operands[1];
11523 oldval = operands[2];
11524 newval = operands[3];
11525 is_weak = (operands[4] != const0_rtx);
11526 model_rtx = operands[5];
11527 scratch = operands[7];
11528 mode = GET_MODE (mem);
11529 model = memmodel_from_int (INTVAL (model_rtx));
11531 label1 = NULL;
11532 if (!is_weak)
11534 label1 = gen_label_rtx ();
11535 emit_label (label1);
11537 label2 = gen_label_rtx ();
11539 /* The initial load can be relaxed for a __sync operation since a final
11540 barrier will be emitted to stop code hoisting. */
11541 if (is_mm_sync (model))
11542 aarch64_emit_load_exclusive (mode, rval, mem,
11543 GEN_INT (MEMMODEL_RELAXED));
11544 else
11545 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11547 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11548 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11549 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11550 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11551 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11553 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11555 if (!is_weak)
11557 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11558 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11559 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11560 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11562 else
11564 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11565 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11566 emit_insn (gen_rtx_SET (cond, x));
11569 emit_label (label2);
11571 /* Emit any final barrier needed for a __sync operation. */
11572 if (is_mm_sync (model))
11573 aarch64_emit_post_barrier (model);
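/* Illustrative sketch; register numbers and label names are arbitrary, and
   the exact load/store mnemonics depend on the memory model.  For a strong
   SImode compare-and-swap the split above produces a load/store-exclusive
   retry loop of roughly this shape:

   .Lretry:
        ldaxr   w0, [x1]        // load exclusive into rval
        cmp     w0, w2          // compare with oldval
        bne     .Ldone          // unlikely branch out on mismatch
        stlxr   w3, w4, [x1]    // store exclusive of newval, scratch = w3
        cbnz    w3, .Lretry     // retry if exclusivity was lost
   .Ldone:
                                // CC still reflects the comparison result
   */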
11576 /* Emit a BIC instruction. */
11578 static void
11579 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11581 rtx shift_rtx = GEN_INT (shift);
11582 rtx (*gen) (rtx, rtx, rtx, rtx);
11584 switch (mode)
11586 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11587 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11588 default:
11589 gcc_unreachable ();
11592 emit_insn (gen (dst, s2, shift_rtx, s1));
11595 /* Emit an atomic swap. */
11597 static void
11598 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11599 rtx mem, rtx model)
11601 rtx (*gen) (rtx, rtx, rtx, rtx);
11603 switch (mode)
11605 case QImode: gen = gen_aarch64_atomic_swpqi; break;
11606 case HImode: gen = gen_aarch64_atomic_swphi; break;
11607 case SImode: gen = gen_aarch64_atomic_swpsi; break;
11608 case DImode: gen = gen_aarch64_atomic_swpdi; break;
11609 default:
11610 gcc_unreachable ();
11613 emit_insn (gen (dst, mem, value, model));
11616 /* Operations supported by aarch64_emit_atomic_load_op. */
11618 enum aarch64_atomic_load_op_code
11620 AARCH64_LDOP_PLUS, /* A + B */
11621 AARCH64_LDOP_XOR, /* A ^ B */
11622 AARCH64_LDOP_OR, /* A | B */
11623 AARCH64_LDOP_BIC /* A & ~B */
11626 /* Emit an atomic load-operate. */
11628 static void
11629 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11630 machine_mode mode, rtx dst, rtx src,
11631 rtx mem, rtx model)
11633 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11634 const aarch64_atomic_load_op_fn plus[] =
11636 gen_aarch64_atomic_loadaddqi,
11637 gen_aarch64_atomic_loadaddhi,
11638 gen_aarch64_atomic_loadaddsi,
11639 gen_aarch64_atomic_loadadddi
11641 const aarch64_atomic_load_op_fn eor[] =
11643 gen_aarch64_atomic_loadeorqi,
11644 gen_aarch64_atomic_loadeorhi,
11645 gen_aarch64_atomic_loadeorsi,
11646 gen_aarch64_atomic_loadeordi
11648 const aarch64_atomic_load_op_fn ior[] =
11650 gen_aarch64_atomic_loadsetqi,
11651 gen_aarch64_atomic_loadsethi,
11652 gen_aarch64_atomic_loadsetsi,
11653 gen_aarch64_atomic_loadsetdi
11655 const aarch64_atomic_load_op_fn bic[] =
11657 gen_aarch64_atomic_loadclrqi,
11658 gen_aarch64_atomic_loadclrhi,
11659 gen_aarch64_atomic_loadclrsi,
11660 gen_aarch64_atomic_loadclrdi
11662 aarch64_atomic_load_op_fn gen;
11663 int idx = 0;
11665 switch (mode)
11667 case QImode: idx = 0; break;
11668 case HImode: idx = 1; break;
11669 case SImode: idx = 2; break;
11670 case DImode: idx = 3; break;
11671 default:
11672 gcc_unreachable ();
11675 switch (code)
11677 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11678 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11679 case AARCH64_LDOP_OR: gen = ior[idx]; break;
11680 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11681 default:
11682 gcc_unreachable ();
11685 emit_insn (gen (dst, mem, src, model));
11688 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
11689 location to store the data read from memory. OUT_RESULT is the location to
11690 store the result of the operation. MEM is the memory location to read and
11691 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
11692 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
11693 be NULL. */
11695 void
11696 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11697 rtx mem, rtx value, rtx model_rtx)
11699 machine_mode mode = GET_MODE (mem);
11700 machine_mode wmode = (mode == DImode ? DImode : SImode);
11701 const bool short_mode = (mode < SImode);
11702 aarch64_atomic_load_op_code ldop_code;
11703 rtx src;
11704 rtx x;
11706 if (out_data)
11707 out_data = gen_lowpart (mode, out_data);
11709 if (out_result)
11710 out_result = gen_lowpart (mode, out_result);
11712 /* Make sure the value is in a register, putting it into a destination
11713 register if it needs to be manipulated. */
11714 if (!register_operand (value, mode)
11715 || code == AND || code == MINUS)
11717 src = out_result ? out_result : out_data;
11718 emit_move_insn (src, gen_lowpart (mode, value));
11720 else
11721 src = value;
11722 gcc_assert (register_operand (src, mode));
11724 /* Preprocess the data for the operation as necessary. If the operation is
11725 a SET then emit a swap instruction and finish. */
11726 switch (code)
11728 case SET:
11729 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11730 return;
11732 case MINUS:
11733 /* Negate the value and treat it as a PLUS. */
11735 rtx neg_src;
11737 /* Resize the value if necessary. */
11738 if (short_mode)
11739 src = gen_lowpart (wmode, src);
11741 neg_src = gen_rtx_NEG (wmode, src);
11742 emit_insn (gen_rtx_SET (src, neg_src));
11744 if (short_mode)
11745 src = gen_lowpart (mode, src);
11747 /* Fall-through. */
11748 case PLUS:
11749 ldop_code = AARCH64_LDOP_PLUS;
11750 break;
11752 case IOR:
11753 ldop_code = AARCH64_LDOP_OR;
11754 break;
11756 case XOR:
11757 ldop_code = AARCH64_LDOP_XOR;
11758 break;
11760 case AND:
11762 rtx not_src;
11764 /* Resize the value if necessary. */
11765 if (short_mode)
11766 src = gen_lowpart (wmode, src);
11768 not_src = gen_rtx_NOT (wmode, src);
11769 emit_insn (gen_rtx_SET (src, not_src));
11771 if (short_mode)
11772 src = gen_lowpart (mode, src);
11774 ldop_code = AARCH64_LDOP_BIC;
11775 break;
11777 default:
11778 /* The operation can't be done with atomic instructions. */
11779 gcc_unreachable ();
11782 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11784 /* If necessary, calculate the data in memory after the update by redoing the
11785 operation from values in registers. */
11786 if (!out_result)
11787 return;
11789 if (short_mode)
11791 src = gen_lowpart (wmode, src);
11792 out_data = gen_lowpart (wmode, out_data);
11793 out_result = gen_lowpart (wmode, out_result);
11796 x = NULL_RTX;
11798 switch (code)
11800 case MINUS:
11801 case PLUS:
11802 x = gen_rtx_PLUS (wmode, out_data, src);
11803 break;
11804 case IOR:
11805 x = gen_rtx_IOR (wmode, out_data, src);
11806 break;
11807 case XOR:
11808 x = gen_rtx_XOR (wmode, out_data, src);
11809 break;
11810 case AND:
11811 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11812 return;
11813 default:
11814 gcc_unreachable ();
11817 emit_set_insn (out_result, x);
11819 return;
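/* Worked examples.  A fetch-and-subtract of M is handled by negating M
   into the destination and issuing an atomic load-add, so memory becomes
   old + (-M) == old - M; a fetch-and-and of M complements M and issues an
   atomic load-clear (BIC), giving old & ~(~M) == old & M.  When the
   updated value is also wanted, it is recomputed from the loaded data and
   the (already negated or complemented) source with the PLUS/BIC
   arithmetic just above.  */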
11822 /* Split an atomic operation. */
11824 void
11825 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11826 rtx value, rtx model_rtx, rtx cond)
11828 machine_mode mode = GET_MODE (mem);
11829 machine_mode wmode = (mode == DImode ? DImode : SImode);
11830 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11831 const bool is_sync = is_mm_sync (model);
11832 rtx_code_label *label;
11833 rtx x;
11835 /* Split the atomic operation into a sequence. */
11836 label = gen_label_rtx ();
11837 emit_label (label);
11839 if (new_out)
11840 new_out = gen_lowpart (wmode, new_out);
11841 if (old_out)
11842 old_out = gen_lowpart (wmode, old_out);
11843 else
11844 old_out = new_out;
11845 value = simplify_gen_subreg (wmode, value, mode, 0);
11847 /* The initial load can be relaxed for a __sync operation since a final
11848 barrier will be emitted to stop code hoisting. */
11849 if (is_sync)
11850 aarch64_emit_load_exclusive (mode, old_out, mem,
11851 GEN_INT (MEMMODEL_RELAXED));
11852 else
11853 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
11855 switch (code)
11857 case SET:
11858 new_out = value;
11859 break;
11861 case NOT:
11862 x = gen_rtx_AND (wmode, old_out, value);
11863 emit_insn (gen_rtx_SET (new_out, x));
11864 x = gen_rtx_NOT (wmode, new_out);
11865 emit_insn (gen_rtx_SET (new_out, x));
11866 break;
11868 case MINUS:
11869 if (CONST_INT_P (value))
11871 value = GEN_INT (-INTVAL (value));
11872 code = PLUS;
11874 /* Fall through. */
11876 default:
11877 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
11878 emit_insn (gen_rtx_SET (new_out, x));
11879 break;
11882 aarch64_emit_store_exclusive (mode, cond, mem,
11883 gen_lowpart (mode, new_out), model_rtx);
11885 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11886 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11887 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
11888 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11890 /* Emit any final barrier needed for a __sync operation. */
11891 if (is_sync)
11892 aarch64_emit_post_barrier (model);
11895 static void
11896 aarch64_init_libfuncs (void)
11898 /* Half-precision float operations. The compiler handles all operations
11899 with NULL libfuncs by converting to SFmode. */
11901 /* Conversions. */
11902 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
11903 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
11905 /* Arithmetic. */
11906 set_optab_libfunc (add_optab, HFmode, NULL);
11907 set_optab_libfunc (sdiv_optab, HFmode, NULL);
11908 set_optab_libfunc (smul_optab, HFmode, NULL);
11909 set_optab_libfunc (neg_optab, HFmode, NULL);
11910 set_optab_libfunc (sub_optab, HFmode, NULL);
11912 /* Comparisons. */
11913 set_optab_libfunc (eq_optab, HFmode, NULL);
11914 set_optab_libfunc (ne_optab, HFmode, NULL);
11915 set_optab_libfunc (lt_optab, HFmode, NULL);
11916 set_optab_libfunc (le_optab, HFmode, NULL);
11917 set_optab_libfunc (ge_optab, HFmode, NULL);
11918 set_optab_libfunc (gt_optab, HFmode, NULL);
11919 set_optab_libfunc (unord_optab, HFmode, NULL);
11922 /* Target hook for c_mode_for_suffix. */
11923 static machine_mode
11924 aarch64_c_mode_for_suffix (char suffix)
11926 if (suffix == 'q')
11927 return TFmode;
11929 return VOIDmode;
11932 /* We can only represent floating point constants which will fit in
11933 "quarter-precision" values. These values are characterised by
11934 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
11937 (-1)^s * (n/16) * 2^r
11939 Where:
11940 's' is the sign bit.
11941 'n' is an integer in the range 16 <= n <= 31.
11942 'r' is an integer in the range -3 <= r <= 4. */
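/* Worked examples: 1.0 == (16/16) * 2^0, 0.125 == (16/16) * 2^-3 and
   31.0 == (31/16) * 2^4 are all representable, so the encodable range of
   magnitudes is [0.125, 31.0]; values such as 0.1 or 3.3 have no (n, r)
   of this form, and 0.0 is rejected explicitly below.  */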
11944 /* Return true iff X can be represented by a quarter-precision
11945 floating point immediate operand.  Note that we cannot represent 0.0. */
11946 bool
11947 aarch64_float_const_representable_p (rtx x)
11949 /* This represents our current view of how many bits
11950 make up the mantissa. */
11951 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
11952 int exponent;
11953 unsigned HOST_WIDE_INT mantissa, mask;
11954 REAL_VALUE_TYPE r, m;
11955 bool fail;
11957 if (!CONST_DOUBLE_P (x))
11958 return false;
11960 /* We don't support HFmode constants yet. */
11961 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
11962 return false;
11964 r = *CONST_DOUBLE_REAL_VALUE (x);
11966 /* We cannot represent infinities, NaNs or +/-zero. We won't
11967 know if we have +zero until we analyse the mantissa, but we
11968 can reject the other invalid values. */
11969 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
11970 || REAL_VALUE_MINUS_ZERO (r))
11971 return false;
11973 /* Extract exponent. */
11974 r = real_value_abs (&r);
11975 exponent = REAL_EXP (&r);
11977 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
11978 highest (sign) bit, with a fixed binary point at bit point_pos.
11979 m1 holds the low part of the mantissa, m2 the high part.
11980 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
11981 bits for the mantissa, this can fail (low bits will be lost). */
11982 real_ldexp (&m, &r, point_pos - exponent);
11983 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
11985 /* If the low part of the mantissa has bits set we cannot represent
11986 the value. */
11987 if (w.elt (0) != 0)
11988 return false;
11989 /* We have rejected the lower HOST_WIDE_INT, so update our
11990 understanding of how many bits lie in the mantissa and
11991 look only at the high HOST_WIDE_INT. */
11992 mantissa = w.elt (1);
11993 point_pos -= HOST_BITS_PER_WIDE_INT;
11995 /* We can only represent values with a mantissa of the form 1.xxxx. */
11996 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
11997 if ((mantissa & mask) != 0)
11998 return false;
12000 /* Having filtered unrepresentable values, we may now remove all
12001 but the highest 5 bits. */
12002 mantissa >>= point_pos - 5;
12004 /* We cannot represent the value 0.0, so reject it. This is handled
12005 elsewhere. */
12006 if (mantissa == 0)
12007 return false;
12009 /* Then, as bit 4 is always set, we can mask it off, leaving
12010 the mantissa in the range [0, 15]. */
12011 mantissa &= ~(1 << 4);
12012 gcc_assert (mantissa <= 15);
12014 /* GCC internally does not use IEEE754-like encoding (where normalized
12015      significands are in the range [1, 2)).  GCC uses [0.5, 1) (see real.c).
12016 Our mantissa values are shifted 4 places to the left relative to
12017 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12018 by 5 places to correct for GCC's representation. */
12019 exponent = 5 - exponent;
12021 return (exponent >= 0 && exponent <= 7);
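/* Illustrative, standalone sketch (hypothetical helper, not part of this
   file, kept under #if 0 so it takes no part in any build): the same
   "quarter-precision" test written against the formula above using only
   the C library, i.e. whether D equals (-1)^s * (n/16) * 2^r with
   16 <= n <= 31 and -3 <= r <= 4.  */
#if 0
#include <math.h>
#include <stdbool.h>

static bool
quarter_precision_representable (double d)
{
  if (d == 0.0 || isnan (d) || isinf (d))
    return false;

  int e;
  double f = frexp (fabs (d), &e);  /* fabs (d) == f * 2^e, 0.5 <= f < 1.  */
  double n = f * 32.0;              /* n/16 == 2*f, hence n == 32*f.  */
  int r = e - 1;

  /* n must be an integer in [16, 31]; r must lie in [-3, 4].  */
  return n == floor (n) && n >= 16.0 && n <= 31.0 && r >= -3 && r <= 4;
}

/* Examples: 1.0 -> n = 16, r = 0 (accepted); 31.0 -> n = 31, r = 4
   (accepted); 0.1 is rejected because it is not of that form.  */
#endif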
12024 char*
12025 aarch64_output_simd_mov_immediate (rtx const_vector,
12026 machine_mode mode,
12027 unsigned width)
12029 bool is_valid;
12030 static char templ[40];
12031 const char *mnemonic;
12032 const char *shift_op;
12033 unsigned int lane_count = 0;
12034 char element_char;
12036 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12038   /* This will return true to show that const_vector is legal for use as
12039      an AdvSIMD MOVI (or, implicitly, MVNI) immediate.  It will
12040 also update INFO to show how the immediate should be generated. */
12041 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12042 gcc_assert (is_valid);
12044 element_char = sizetochar (info.element_width);
12045 lane_count = width / info.element_width;
12047 mode = GET_MODE_INNER (mode);
12048 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12050 gcc_assert (info.shift == 0 && ! info.mvn);
12051 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12052 move immediate path. */
12053 if (aarch64_float_const_zero_rtx_p (info.value))
12054 info.value = GEN_INT (0);
12055 else
12057 const unsigned int buf_size = 20;
12058 char float_buf[buf_size] = {'\0'};
12059 real_to_decimal_for_mode (float_buf,
12060 CONST_DOUBLE_REAL_VALUE (info.value),
12061 buf_size, buf_size, 1, mode);
12063 if (lane_count == 1)
12064 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12065 else
12066 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12067 lane_count, element_char, float_buf);
12068 return templ;
12072 mnemonic = info.mvn ? "mvni" : "movi";
12073 shift_op = info.msl ? "msl" : "lsl";
12075 gcc_assert (CONST_INT_P (info.value));
12076 if (lane_count == 1)
12077 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12078 mnemonic, UINTVAL (info.value));
12079 else if (info.shift)
12080 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12081 ", %s %d", mnemonic, lane_count, element_char,
12082 UINTVAL (info.value), shift_op, info.shift);
12083 else
12084 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12085 mnemonic, lane_count, element_char, UINTVAL (info.value));
12086 return templ;
12089 char*
12090 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12091 machine_mode mode)
12093 machine_mode vmode;
12095 gcc_assert (!VECTOR_MODE_P (mode));
12096 vmode = aarch64_simd_container_mode (mode, 64);
12097 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12098 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12101 /* Split operands into moves from op[1] + op[2] into op[0]. */
12103 void
12104 aarch64_split_combinev16qi (rtx operands[3])
12106 unsigned int dest = REGNO (operands[0]);
12107 unsigned int src1 = REGNO (operands[1]);
12108 unsigned int src2 = REGNO (operands[2]);
12109 machine_mode halfmode = GET_MODE (operands[1]);
12110 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12111 rtx destlo, desthi;
12113 gcc_assert (halfmode == V16QImode);
12115 if (src1 == dest && src2 == dest + halfregs)
12117 /* No-op move. Can't split to nothing; emit something. */
12118 emit_note (NOTE_INSN_DELETED);
12119 return;
12122 /* Preserve register attributes for variable tracking. */
12123 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12124 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12125 GET_MODE_SIZE (halfmode));
12127 /* Special case of reversed high/low parts. */
12128 if (reg_overlap_mentioned_p (operands[2], destlo)
12129 && reg_overlap_mentioned_p (operands[1], desthi))
12131 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12132 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12133 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12135 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12137 /* Try to avoid unnecessary moves if part of the result
12138 is in the right place already. */
12139 if (src1 != dest)
12140 emit_move_insn (destlo, operands[1]);
12141 if (src2 != dest + halfregs)
12142 emit_move_insn (desthi, operands[2]);
12144 else
12146 if (src2 != dest + halfregs)
12147 emit_move_insn (desthi, operands[2]);
12148 if (src1 != dest)
12149 emit_move_insn (destlo, operands[1]);
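/* A minimal scalar sketch of the no-scratch swap used above for the
   reversed-overlap case: three XORs exchange two values in place, which
   is exactly what the three gen_xorv16qi3 insns do for the two register
   halves.  (Hypothetical helper, not part of this file.)  */
#if 0
static void
xor_swap (unsigned long long *a, unsigned long long *b)
{
  *a ^= *b;   /* a = a ^ b             */
  *b ^= *a;   /* b = b ^ (a ^ b) == a  */
  *a ^= *b;   /* a = (a ^ b) ^ a == b  */
}
#endif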
12153 /* vec_perm support. */
12155 #define MAX_VECT_LEN 16
12157 struct expand_vec_perm_d
12159 rtx target, op0, op1;
12160 unsigned char perm[MAX_VECT_LEN];
12161 machine_mode vmode;
12162 unsigned char nelt;
12163 bool one_vector_p;
12164 bool testing_p;
12167 /* Generate a variable permutation. */
12169 static void
12170 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12172 machine_mode vmode = GET_MODE (target);
12173 bool one_vector_p = rtx_equal_p (op0, op1);
12175 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12176 gcc_checking_assert (GET_MODE (op0) == vmode);
12177 gcc_checking_assert (GET_MODE (op1) == vmode);
12178 gcc_checking_assert (GET_MODE (sel) == vmode);
12179 gcc_checking_assert (TARGET_SIMD);
12181 if (one_vector_p)
12183 if (vmode == V8QImode)
12185 /* Expand the argument to a V16QI mode by duplicating it. */
12186 rtx pair = gen_reg_rtx (V16QImode);
12187 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12188 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12190 else
12192 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12195 else
12197 rtx pair;
12199 if (vmode == V8QImode)
12201 pair = gen_reg_rtx (V16QImode);
12202 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12203 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12205 else
12207 pair = gen_reg_rtx (OImode);
12208 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12209 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12214 void
12215 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12217 machine_mode vmode = GET_MODE (target);
12218 unsigned int nelt = GET_MODE_NUNITS (vmode);
12219 bool one_vector_p = rtx_equal_p (op0, op1);
12220 rtx mask;
12222 /* The TBL instruction does not use a modulo index, so we must take care
12223 of that ourselves. */
12224 mask = aarch64_simd_gen_const_vector_dup (vmode,
12225 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12226 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12228 /* For big-endian, we also need to reverse the index within the vector
12229 (but not which vector). */
12230 if (BYTES_BIG_ENDIAN)
12232 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12233 if (!one_vector_p)
12234 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12235 sel = expand_simple_binop (vmode, XOR, sel, mask,
12236 NULL, 0, OPTAB_LIB_WIDEN);
12238 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
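/* A scalar model (hypothetical, host-only sketch) of the selector
   normalisation performed above: out-of-range TBL indices are wrapped
   with an AND, and on big-endian targets the index is reversed within
   its source vector without changing which vector it selects.  */
#if 0
#include <stdbool.h>

static unsigned int
tbl_effective_index (unsigned int idx, unsigned int nelt,
		     bool one_vector_p, bool big_endian)
{
  idx &= one_vector_p ? nelt - 1 : 2 * nelt - 1;  /* Modulo wrap.  */
  if (big_endian)
    idx ^= nelt - 1;  /* Flip the low bits only; the "which vector" bit
			 is left untouched.  */
  return idx;
}
#endif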
12241 /* Recognize patterns suitable for the TRN instructions. */
12242 static bool
12243 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12245 unsigned int i, odd, mask, nelt = d->nelt;
12246 rtx out, in0, in1, x;
12247 rtx (*gen) (rtx, rtx, rtx);
12248 machine_mode vmode = d->vmode;
12250 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12251 return false;
12253 /* Note that these are little-endian tests.
12254 We correct for big-endian later. */
12255 if (d->perm[0] == 0)
12256 odd = 0;
12257 else if (d->perm[0] == 1)
12258 odd = 1;
12259 else
12260 return false;
12261 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12263 for (i = 0; i < nelt; i += 2)
12265 if (d->perm[i] != i + odd)
12266 return false;
12267 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12268 return false;
12271 /* Success! */
12272 if (d->testing_p)
12273 return true;
12275 in0 = d->op0;
12276 in1 = d->op1;
12277 if (BYTES_BIG_ENDIAN)
12279 x = in0, in0 = in1, in1 = x;
12280 odd = !odd;
12282 out = d->target;
12284 if (odd)
12286 switch (vmode)
12288 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12289 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12290 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12291 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12292 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12293 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12294 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12295 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12296 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12297 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12298 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12299 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12300 default:
12301 return false;
12304 else
12306 switch (vmode)
12308 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12309 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12310 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12311 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12312 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12313 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12314 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12315 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12316 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12317 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12318 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12319 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12320 default:
12321 return false;
12325 emit_insn (gen (out, in0, in1));
12326 return true;
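/* Worked example for the TRN recognizer, little-endian V4SImode, with
   inputs A = {a0, a1, a2, a3} and B = {b0, b1, b2, b3}:
     trn1:  perm = {0, 4, 2, 6}  ->  {a0, b0, a2, b2}
     trn2:  perm = {1, 5, 3, 7}  ->  {a1, b1, a3, b3}  */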
12329 /* Recognize patterns suitable for the UZP instructions. */
12330 static bool
12331 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12333 unsigned int i, odd, mask, nelt = d->nelt;
12334 rtx out, in0, in1, x;
12335 rtx (*gen) (rtx, rtx, rtx);
12336 machine_mode vmode = d->vmode;
12338 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12339 return false;
12341 /* Note that these are little-endian tests.
12342 We correct for big-endian later. */
12343 if (d->perm[0] == 0)
12344 odd = 0;
12345 else if (d->perm[0] == 1)
12346 odd = 1;
12347 else
12348 return false;
12349 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12351 for (i = 0; i < nelt; i++)
12353 unsigned elt = (i * 2 + odd) & mask;
12354 if (d->perm[i] != elt)
12355 return false;
12358 /* Success! */
12359 if (d->testing_p)
12360 return true;
12362 in0 = d->op0;
12363 in1 = d->op1;
12364 if (BYTES_BIG_ENDIAN)
12366 x = in0, in0 = in1, in1 = x;
12367 odd = !odd;
12369 out = d->target;
12371 if (odd)
12373 switch (vmode)
12375 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12376 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12377 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12378 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12379 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12380 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12381 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12382 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12383 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12384 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12385 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12386 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12387 default:
12388 return false;
12391 else
12393 switch (vmode)
12395 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12396 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12397 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12398 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12399 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12400 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12401 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12402 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12403 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12404 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12405 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12406 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12407 default:
12408 return false;
12412 emit_insn (gen (out, in0, in1));
12413 return true;
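/* Worked example for the UZP recognizer, same V4SImode inputs as above:
     uzp1:  perm = {0, 2, 4, 6}  ->  {a0, a2, b0, b2}
     uzp2:  perm = {1, 3, 5, 7}  ->  {a1, a3, b1, b3}  */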
12416 /* Recognize patterns suitable for the ZIP instructions. */
12417 static bool
12418 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12420 unsigned int i, high, mask, nelt = d->nelt;
12421 rtx out, in0, in1, x;
12422 rtx (*gen) (rtx, rtx, rtx);
12423 machine_mode vmode = d->vmode;
12425 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12426 return false;
12428 /* Note that these are little-endian tests.
12429 We correct for big-endian later. */
12430 high = nelt / 2;
12431 if (d->perm[0] == high)
12432 /* Do Nothing. */
12434 else if (d->perm[0] == 0)
12435 high = 0;
12436 else
12437 return false;
12438 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12440 for (i = 0; i < nelt / 2; i++)
12442 unsigned elt = (i + high) & mask;
12443 if (d->perm[i * 2] != elt)
12444 return false;
12445 elt = (elt + nelt) & mask;
12446 if (d->perm[i * 2 + 1] != elt)
12447 return false;
12450 /* Success! */
12451 if (d->testing_p)
12452 return true;
12454 in0 = d->op0;
12455 in1 = d->op1;
12456 if (BYTES_BIG_ENDIAN)
12458 x = in0, in0 = in1, in1 = x;
12459 high = !high;
12461 out = d->target;
12463 if (high)
12465 switch (vmode)
12467 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12468 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12469 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12470 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12471 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12472 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12473 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12474 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12475 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12476 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12477 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12478 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12479 default:
12480 return false;
12483 else
12485 switch (vmode)
12487 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12488 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12489 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12490 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12491 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12492 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12493 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12494 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
12495 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
12496 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12497 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12498 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12499 default:
12500 return false;
12504 emit_insn (gen (out, in0, in1));
12505 return true;
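/* Worked example for the ZIP recognizer, same V4SImode inputs:
     zip1:  perm = {0, 4, 1, 5}  ->  {a0, b0, a1, b1}
     zip2:  perm = {2, 6, 3, 7}  ->  {a2, b2, a3, b3}  */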
12508 /* Recognize patterns for the EXT insn. */
12510 static bool
12511 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12513 unsigned int i, nelt = d->nelt;
12514 rtx (*gen) (rtx, rtx, rtx, rtx);
12515 rtx offset;
12517 unsigned int location = d->perm[0]; /* Always < nelt. */
12519 /* Check if the extracted indices are increasing by one. */
12520 for (i = 1; i < nelt; i++)
12522 unsigned int required = location + i;
12523 if (d->one_vector_p)
12525 /* We'll pass the same vector in twice, so allow indices to wrap. */
12526 required &= (nelt - 1);
12528 if (d->perm[i] != required)
12529 return false;
12532 switch (d->vmode)
12534 case V16QImode: gen = gen_aarch64_extv16qi; break;
12535 case V8QImode: gen = gen_aarch64_extv8qi; break;
12536 case V4HImode: gen = gen_aarch64_extv4hi; break;
12537 case V8HImode: gen = gen_aarch64_extv8hi; break;
12538 case V2SImode: gen = gen_aarch64_extv2si; break;
12539 case V4SImode: gen = gen_aarch64_extv4si; break;
12540 case V4HFmode: gen = gen_aarch64_extv4hf; break;
12541 case V8HFmode: gen = gen_aarch64_extv8hf; break;
12542 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12543 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12544 case V2DImode: gen = gen_aarch64_extv2di; break;
12545 case V2DFmode: gen = gen_aarch64_extv2df; break;
12546 default:
12547 return false;
12550 /* Success! */
12551 if (d->testing_p)
12552 return true;
12554 /* The case where (location == 0) is a no-op for both big- and little-endian,
12555 and is removed by the mid-end at optimization levels -O1 and higher. */
12557 if (BYTES_BIG_ENDIAN && (location != 0))
12559 /* After setup, we want the high elements of the first vector (stored
12560 at the LSB end of the register), and the low elements of the second
12561 vector (stored at the MSB end of the register). So swap. */
12562 std::swap (d->op0, d->op1);
12563 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12564 location = nelt - location;
12567 offset = GEN_INT (location);
12568 emit_insn (gen (d->target, d->op0, d->op1, offset));
12569 return true;
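/* Worked example for the EXT recognizer, little-endian V4SImode:
   perm = {1, 2, 3, 4} extracts {a1, a2, a3, b0}, i.e. an EXT with an
   element offset of 1 (the instruction itself counts in bytes).  With a
   single input vector, perm = {3, 0, 1, 2} wraps around and becomes a
   rotation of A.  */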
12572 /* Recognize patterns for the REV insns. */
12574 static bool
12575 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12577 unsigned int i, j, diff, nelt = d->nelt;
12578 rtx (*gen) (rtx, rtx);
12580 if (!d->one_vector_p)
12581 return false;
12583 diff = d->perm[0];
12584 switch (diff)
12586 case 7:
12587 switch (d->vmode)
12589 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12590 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12591 default:
12592 return false;
12594 break;
12595 case 3:
12596 switch (d->vmode)
12598 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12599 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
12600 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
12601 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
12602 default:
12603 return false;
12605 break;
12606 case 1:
12607 switch (d->vmode)
12609 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12610 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
12611 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
12612 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
12613 case V4SImode: gen = gen_aarch64_rev64v4si; break;
12614 case V2SImode: gen = gen_aarch64_rev64v2si; break;
12615 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
12616 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
12617 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
12618 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
12619 default:
12620 return false;
12622 break;
12623 default:
12624 return false;
12627 for (i = 0; i < nelt ; i += diff + 1)
12628 for (j = 0; j <= diff; j += 1)
12630 /* This is guaranteed to be true as the value of diff
12631 	 is 7, 3 or 1 and we should have enough elements in the
12632 queue to generate this. Getting a vector mask with a
12633 value of diff other than these values implies that
12634 something is wrong by the time we get here. */
12635 gcc_assert (i + j < nelt);
12636 if (d->perm[i + j] != i + diff - j)
12637 return false;
12640 /* Success! */
12641 if (d->testing_p)
12642 return true;
12644 emit_insn (gen (d->target, d->op0));
12645 return true;
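/* Worked example for the REV recognizer: diff == 1 on V4SImode means
   perm = {1, 0, 3, 2}, i.e. swap the two elements inside each 64-bit
   chunk (REV64.4S); diff == 7 on V8QImode means
   perm = {7, 6, 5, 4, 3, 2, 1, 0} (REV64.8B).  */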
12648 static bool
12649 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12651 rtx (*gen) (rtx, rtx, rtx);
12652 rtx out = d->target;
12653 rtx in0;
12654 machine_mode vmode = d->vmode;
12655 unsigned int i, elt, nelt = d->nelt;
12656 rtx lane;
12658 elt = d->perm[0];
12659 for (i = 1; i < nelt; i++)
12661 if (elt != d->perm[i])
12662 return false;
12665 /* The generic preparation in aarch64_expand_vec_perm_const_1
12666 swaps the operand order and the permute indices if it finds
12667 d->perm[0] to be in the second operand. Thus, we can always
12668 use d->op0 and need not do any extra arithmetic to get the
12669 correct lane number. */
12670 in0 = d->op0;
12671 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
12673 switch (vmode)
12675 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12676 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12677 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12678 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12679 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12680 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12681 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12682 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12683 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12684 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12685 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12686 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12687 default:
12688 return false;
12691 emit_insn (gen (out, in0, lane));
12692 return true;
12695 static bool
12696 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12698 rtx rperm[MAX_VECT_LEN], sel;
12699 machine_mode vmode = d->vmode;
12700 unsigned int i, nelt = d->nelt;
12702 if (d->testing_p)
12703 return true;
12705 /* Generic code will try constant permutation twice. Once with the
12706 original mode and again with the elements lowered to QImode.
12707 So wait and don't do the selector expansion ourselves. */
12708 if (vmode != V8QImode && vmode != V16QImode)
12709 return false;
12711 for (i = 0; i < nelt; ++i)
12713 int nunits = GET_MODE_NUNITS (vmode);
12715       /* With big-endian and two input vectors we end up with a weird
12716 	 mixed-endian mode on NEON.  Reverse the index within each vector
12717 	 but not which vector is selected.  */
12718 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12719 : d->perm[i]);
12721 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12722 sel = force_reg (vmode, sel);
12724 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12725 return true;
12728 static bool
12729 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12731 /* The pattern matching functions above are written to look for a small
12732 number to begin the sequence (0, 1, N/2). If we begin with an index
12733 from the second operand, we can swap the operands. */
12734 if (d->perm[0] >= d->nelt)
12736 unsigned i, nelt = d->nelt;
12738 gcc_assert (nelt == (nelt & -nelt));
12739 for (i = 0; i < nelt; ++i)
12740 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
12742 std::swap (d->op0, d->op1);
12745 if (TARGET_SIMD)
12747 if (aarch64_evpc_rev (d))
12748 return true;
12749 else if (aarch64_evpc_ext (d))
12750 return true;
12751 else if (aarch64_evpc_dup (d))
12752 return true;
12753 else if (aarch64_evpc_zip (d))
12754 return true;
12755 else if (aarch64_evpc_uzp (d))
12756 return true;
12757 else if (aarch64_evpc_trn (d))
12758 return true;
12759 return aarch64_evpc_tbl (d);
12761 return false;
12764 /* Expand a vec_perm_const pattern. */
12766 bool
12767 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12769 struct expand_vec_perm_d d;
12770 int i, nelt, which;
12772 d.target = target;
12773 d.op0 = op0;
12774 d.op1 = op1;
12776 d.vmode = GET_MODE (target);
12777 gcc_assert (VECTOR_MODE_P (d.vmode));
12778 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12779 d.testing_p = false;
12781 for (i = which = 0; i < nelt; ++i)
12783 rtx e = XVECEXP (sel, 0, i);
12784 int ei = INTVAL (e) & (2 * nelt - 1);
12785 which |= (ei < nelt ? 1 : 2);
12786 d.perm[i] = ei;
12789 switch (which)
12791 default:
12792 gcc_unreachable ();
12794 case 3:
12795 d.one_vector_p = false;
12796 if (!rtx_equal_p (op0, op1))
12797 break;
12799 /* The elements of PERM do not suggest that only the first operand
12800 is used, but both operands are identical. Allow easier matching
12801 of the permutation by folding the permutation into the single
12802 input vector. */
12803 /* Fall Through. */
12804 case 2:
12805 for (i = 0; i < nelt; ++i)
12806 d.perm[i] &= nelt - 1;
12807 d.op0 = op1;
12808 d.one_vector_p = true;
12809 break;
12811 case 1:
12812 d.op1 = op0;
12813 d.one_vector_p = true;
12814 break;
12817 return aarch64_expand_vec_perm_const_1 (&d);
12820 static bool
12821 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12822 const unsigned char *sel)
12824 struct expand_vec_perm_d d;
12825 unsigned int i, nelt, which;
12826 bool ret;
12828 d.vmode = vmode;
12829 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12830 d.testing_p = true;
12831 memcpy (d.perm, sel, nelt);
12833 /* Calculate whether all elements are in one vector. */
12834 for (i = which = 0; i < nelt; ++i)
12836 unsigned char e = d.perm[i];
12837 gcc_assert (e < 2 * nelt);
12838 which |= (e < nelt ? 1 : 2);
12841 /* If all elements are from the second vector, reindex as if from the
12842 first vector. */
12843 if (which == 2)
12844 for (i = 0; i < nelt; ++i)
12845 d.perm[i] -= nelt;
12847 /* Check whether the mask can be applied to a single vector. */
12848 d.one_vector_p = (which != 3);
12850 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
12851 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
12852 if (!d.one_vector_p)
12853 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
12855 start_sequence ();
12856 ret = aarch64_expand_vec_perm_const_1 (&d);
12857 end_sequence ();
12859 return ret;
12863 aarch64_reverse_mask (enum machine_mode mode)
12865   /* We have to reverse each vector because we don't have
12866 a permuted load that can reverse-load according to ABI rules. */
12867 rtx mask;
12868 rtvec v = rtvec_alloc (16);
12869 int i, j;
12870 int nunits = GET_MODE_NUNITS (mode);
12871 int usize = GET_MODE_UNIT_SIZE (mode);
12873 gcc_assert (BYTES_BIG_ENDIAN);
12874 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
12876 for (i = 0; i < nunits; i++)
12877 for (j = 0; j < usize; j++)
12878 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
12879 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
12880 return force_reg (V16QImode, mask);
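/* Worked example: for V4SImode the loop above builds the byte selector
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
   i.e. a TBL mask that byte-reverses each 4-byte element in place.  */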
12883 /* Implement MODES_TIEABLE_P. In principle we should always return true.
12884 However due to issues with register allocation it is preferable to avoid
12885    tying integer scalar and FP scalar modes.  Executing integer operations
12886 in general registers is better than treating them as scalar vector
12887 operations. This reduces latency and avoids redundant int<->FP moves.
12888 So tie modes if they are either the same class, or vector modes with
12889 other vector modes, vector structs or any scalar mode.
12892 bool
12893 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
12895 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
12896 return true;
12898 /* We specifically want to allow elements of "structure" modes to
12899 be tieable to the structure. This more general condition allows
12900 other rarer situations too. */
12901 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
12902 return true;
12904 /* Also allow any scalar modes with vectors. */
12905 if (aarch64_vector_mode_supported_p (mode1)
12906 || aarch64_vector_mode_supported_p (mode2))
12907 return true;
12909 return false;
12912 /* Return a new RTX holding the result of moving POINTER forward by
12913 AMOUNT bytes. */
12915 static rtx
12916 aarch64_move_pointer (rtx pointer, int amount)
12918 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
12920 return adjust_automodify_address (pointer, GET_MODE (pointer),
12921 next, amount);
12924 /* Return a new RTX holding the result of moving POINTER forward by the
12925 size of the mode it points to. */
12927 static rtx
12928 aarch64_progress_pointer (rtx pointer)
12930 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
12932 return aarch64_move_pointer (pointer, amount);
12935 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
12936 MODE bytes. */
12938 static void
12939 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
12940 machine_mode mode)
12942 rtx reg = gen_reg_rtx (mode);
12944 /* "Cast" the pointers to the correct mode. */
12945 *src = adjust_address (*src, mode, 0);
12946 *dst = adjust_address (*dst, mode, 0);
12947 /* Emit the memcpy. */
12948 emit_move_insn (reg, *src);
12949 emit_move_insn (*dst, reg);
12950 /* Move the pointers forward. */
12951 *src = aarch64_progress_pointer (*src);
12952 *dst = aarch64_progress_pointer (*dst);
12955 /* Expand movmem, as if from a __builtin_memcpy. Return true if
12956 we succeed, otherwise return false. */
12958 bool
12959 aarch64_expand_movmem (rtx *operands)
12961 unsigned int n;
12962 rtx dst = operands[0];
12963 rtx src = operands[1];
12964 rtx base;
12965 bool speed_p = !optimize_function_for_size_p (cfun);
12967 /* When optimizing for size, give a better estimate of the length of a
12968 memcpy call, but use the default otherwise. */
12969 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
12971 /* We can't do anything smart if the amount to copy is not constant. */
12972 if (!CONST_INT_P (operands[2]))
12973 return false;
12975 n = UINTVAL (operands[2]);
12977 /* Try to keep the number of instructions low. For cases below 16 bytes we
12978 need to make at most two moves. For cases above 16 bytes it will be one
12979 move for each 16 byte chunk, then at most two additional moves. */
12980 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
12981 return false;
12983 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
12984 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
12986 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
12987 src = adjust_automodify_address (src, VOIDmode, base, 0);
12989   /* Simple cases.  Copy 0-3 bytes: first (if applicable) a 2-byte chunk,
12990      then a 1-byte chunk.  */
12991 if (n < 4)
12993 if (n >= 2)
12995 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12996 n -= 2;
12999 if (n == 1)
13000 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13002 return true;
13005 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13006 4-byte chunk, partially overlapping with the previously copied chunk. */
13007 if (n < 8)
13009 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13010 n -= 4;
13011 if (n > 0)
13013 int move = n - 4;
13015 src = aarch64_move_pointer (src, move);
13016 dst = aarch64_move_pointer (dst, move);
13017 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13019 return true;
13022 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13023 them, then (if applicable) an 8-byte chunk. */
13024 while (n >= 8)
13026 if (n / 16)
13028 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13029 n -= 16;
13031 else
13033 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13034 n -= 8;
13038 /* Finish the final bytes of the copy. We can always do this in one
13039 instruction. We either copy the exact amount we need, or partially
13040      overlap with the previous chunk we copied and copy 8 bytes.  */
13041 if (n == 0)
13042 return true;
13043 else if (n == 1)
13044 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13045 else if (n == 2)
13046 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13047 else if (n == 4)
13048 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13049 else
13051 if (n == 3)
13053 src = aarch64_move_pointer (src, -1);
13054 dst = aarch64_move_pointer (dst, -1);
13055 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13057 else
13059 int move = n - 8;
13061 src = aarch64_move_pointer (src, move);
13062 dst = aarch64_move_pointer (dst, move);
13063 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13067 return true;
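/* Illustrative, host-only sketch (hypothetical helper, not part of this
   file): print the chunk schedule the expander above would choose for a
   constant length N, mirroring its control flow but emitting printf
   calls instead of RTL moves.  */
#if 0
#include <stdio.h>

static void
print_copy_schedule (unsigned int n)
{
  unsigned int off = 0;

  if (n < 4)
    {
      if (n >= 2) { printf ("2-byte copy at %u\n", off); off += 2; n -= 2; }
      if (n == 1) printf ("1-byte copy at %u\n", off);
      return;
    }
  if (n < 8)
    {
      printf ("4-byte copy at 0\n");
      if (n > 4)  /* An overlapping 4-byte copy finishes the tail.  */
	printf ("4-byte copy at %u (overlaps)\n", n - 4);
      return;
    }
  while (n >= 8)
    {
      unsigned int size = n >= 16 ? 16 : 8;
      printf ("%u-byte copy at %u\n", size, off);
      off += size;
      n -= size;
    }
  if (n == 1 || n == 2 || n == 4)
    printf ("%u-byte copy at %u\n", n, off);
  else if (n == 3)
    printf ("4-byte copy at %u (overlaps)\n", off - 1);
  else if (n != 0)  /* 5, 6 or 7 bytes left.  */
    printf ("8-byte copy at %u (overlaps)\n", off + n - 8);
}

/* E.g. print_copy_schedule (23) shows a 16-byte copy at offset 0
   followed by an overlapping 8-byte copy at offset 15.  */
#endif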
13070 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13072 static unsigned HOST_WIDE_INT
13073 aarch64_asan_shadow_offset (void)
13075 return (HOST_WIDE_INT_1 << 36);
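/* For reference: with AddressSanitizer's default shadow scale of 3 on
   AArch64 (eight application bytes per shadow byte), an address is
   mapped as  shadow = (addr >> 3) + (1 << 36),  which is where the
   offset returned above ends up.  */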
13078 static bool
13079 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13080 unsigned int align,
13081 enum by_pieces_operation op,
13082 bool speed_p)
13084 /* STORE_BY_PIECES can be used when copying a constant string, but
13085 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13086 For now we always fail this and let the move_by_pieces code copy
13087 the string from read-only memory. */
13088 if (op == STORE_BY_PIECES)
13089 return false;
13091 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13094 static rtx
13095 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
13096 int code, tree treeop0, tree treeop1)
13098 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13099 rtx op0, op1;
13100 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13101 insn_code icode;
13102 struct expand_operand ops[4];
13104 start_sequence ();
13105 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13107 op_mode = GET_MODE (op0);
13108 if (op_mode == VOIDmode)
13109 op_mode = GET_MODE (op1);
13111 switch (op_mode)
13113 case QImode:
13114 case HImode:
13115 case SImode:
13116 cmp_mode = SImode;
13117 icode = CODE_FOR_cmpsi;
13118 break;
13120 case DImode:
13121 cmp_mode = DImode;
13122 icode = CODE_FOR_cmpdi;
13123 break;
13125 case SFmode:
13126 cmp_mode = SFmode;
13127 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13128 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13129 break;
13131 case DFmode:
13132 cmp_mode = DFmode;
13133 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13134 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13135 break;
13137 default:
13138 end_sequence ();
13139 return NULL_RTX;
13142 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13143 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13144 if (!op0 || !op1)
13146 end_sequence ();
13147 return NULL_RTX;
13149 *prep_seq = get_insns ();
13150 end_sequence ();
13152 create_fixed_operand (&ops[0], op0);
13153 create_fixed_operand (&ops[1], op1);
13155 start_sequence ();
13156 if (!maybe_expand_insn (icode, 2, ops))
13158 end_sequence ();
13159 return NULL_RTX;
13161 *gen_seq = get_insns ();
13162 end_sequence ();
13164 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13165 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13168 static rtx
13169 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
13170 tree treeop0, tree treeop1, int bit_code)
13172 rtx op0, op1, target;
13173 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13174 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13175 insn_code icode;
13176 struct expand_operand ops[6];
13177 int aarch64_cond;
13179 push_to_sequence ((rtx_insn*) *prep_seq);
13180 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13182 op_mode = GET_MODE (op0);
13183 if (op_mode == VOIDmode)
13184 op_mode = GET_MODE (op1);
13186 switch (op_mode)
13188 case QImode:
13189 case HImode:
13190 case SImode:
13191 cmp_mode = SImode;
13192 icode = CODE_FOR_ccmpsi;
13193 break;
13195 case DImode:
13196 cmp_mode = DImode;
13197 icode = CODE_FOR_ccmpdi;
13198 break;
13200 case SFmode:
13201 cmp_mode = SFmode;
13202 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13203 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13204 break;
13206 case DFmode:
13207 cmp_mode = DFmode;
13208 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13209 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13210 break;
13212 default:
13213 end_sequence ();
13214 return NULL_RTX;
13217 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13218 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13219 if (!op0 || !op1)
13221 end_sequence ();
13222 return NULL_RTX;
13224 *prep_seq = get_insns ();
13225 end_sequence ();
13227 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13228 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13230 if (bit_code != AND)
13232 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13233 GET_MODE (XEXP (prev, 0))),
13234 VOIDmode, XEXP (prev, 0), const0_rtx);
13235 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13238 create_fixed_operand (&ops[0], XEXP (prev, 0));
13239 create_fixed_operand (&ops[1], target);
13240 create_fixed_operand (&ops[2], op0);
13241 create_fixed_operand (&ops[3], op1);
13242 create_fixed_operand (&ops[4], prev);
13243 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13245 push_to_sequence ((rtx_insn*) *gen_seq);
13246 if (!maybe_expand_insn (icode, 6, ops))
13248 end_sequence ();
13249 return NULL_RTX;
13252 *gen_seq = get_insns ();
13253 end_sequence ();
13255 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
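/* Illustrative only (exact registers and NZCV immediates depend on the
   surrounding code): for a condition such as  a == 0 && b == 42  the two
   hooks above cooperate to build a conditional-compare chain roughly of
   the form

       cmp   w0, 0
       ccmp  w1, 42, 0, eq    // compares only if the first test passed;
                              // otherwise NZCV is forced to 0 so EQ fails
       cset  w0, eq                                                       */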
13258 #undef TARGET_GEN_CCMP_FIRST
13259 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13261 #undef TARGET_GEN_CCMP_NEXT
13262 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13264 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13265 instruction fusion of some sort. */
13267 static bool
13268 aarch64_macro_fusion_p (void)
13270 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13274 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13275 should be kept together during scheduling. */
13277 static bool
13278 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13280 rtx set_dest;
13281 rtx prev_set = single_set (prev);
13282 rtx curr_set = single_set (curr);
13283 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13284 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13286 if (!aarch64_macro_fusion_p ())
13287 return false;
13289 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13291 /* We are trying to match:
13292 prev (mov) == (set (reg r0) (const_int imm16))
13293 curr (movk) == (set (zero_extract (reg r0)
13294 (const_int 16)
13295 (const_int 16))
13296 (const_int imm16_1)) */
13298 set_dest = SET_DEST (curr_set);
13300 if (GET_CODE (set_dest) == ZERO_EXTRACT
13301 && CONST_INT_P (SET_SRC (curr_set))
13302 && CONST_INT_P (SET_SRC (prev_set))
13303 && CONST_INT_P (XEXP (set_dest, 2))
13304 && INTVAL (XEXP (set_dest, 2)) == 16
13305 && REG_P (XEXP (set_dest, 0))
13306 && REG_P (SET_DEST (prev_set))
13307 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13309 return true;
13313 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13316 /* We're trying to match:
13317 prev (adrp) == (set (reg r1)
13318 (high (symbol_ref ("SYM"))))
13319 curr (add) == (set (reg r0)
13320 (lo_sum (reg r1)
13321 (symbol_ref ("SYM"))))
13322 Note that r0 need not necessarily be the same as r1, especially
13323 during pre-regalloc scheduling. */
13325 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13326 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13328 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13329 && REG_P (XEXP (SET_SRC (curr_set), 0))
13330 && REGNO (XEXP (SET_SRC (curr_set), 0))
13331 == REGNO (SET_DEST (prev_set))
13332 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13333 XEXP (SET_SRC (curr_set), 1)))
13334 return true;
13338 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13341 /* We're trying to match:
13342 prev (movk) == (set (zero_extract (reg r0)
13343 (const_int 16)
13344 (const_int 32))
13345 (const_int imm16_1))
13346 curr (movk) == (set (zero_extract (reg r0)
13347 (const_int 16)
13348 (const_int 48))
13349 (const_int imm16_2)) */
13351 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13352 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13353 && REG_P (XEXP (SET_DEST (prev_set), 0))
13354 && REG_P (XEXP (SET_DEST (curr_set), 0))
13355 && REGNO (XEXP (SET_DEST (prev_set), 0))
13356 == REGNO (XEXP (SET_DEST (curr_set), 0))
13357 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13358 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13359 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13360 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13361 && CONST_INT_P (SET_SRC (prev_set))
13362 && CONST_INT_P (SET_SRC (curr_set)))
13363 return true;
13366 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13368 /* We're trying to match:
13369 prev (adrp) == (set (reg r0)
13370 (high (symbol_ref ("SYM"))))
13371 curr (ldr) == (set (reg r1)
13372 (mem (lo_sum (reg r0)
13373 (symbol_ref ("SYM")))))
13375 curr (ldr) == (set (reg r1)
13376 (zero_extend (mem
13377 (lo_sum (reg r0)
13378 (symbol_ref ("SYM")))))) */
13379 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13380 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13382 rtx curr_src = SET_SRC (curr_set);
13384 if (GET_CODE (curr_src) == ZERO_EXTEND)
13385 curr_src = XEXP (curr_src, 0);
13387 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13388 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13389 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13390 == REGNO (SET_DEST (prev_set))
13391 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13392 XEXP (SET_SRC (prev_set), 0)))
13393 return true;
13397 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
13398 && aarch_crypto_can_dual_issue (prev, curr))
13399 return true;
13401 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
13402 && any_condjump_p (curr))
13404 enum attr_type prev_type = get_attr_type (prev);
13406       /* FIXME: this misses some instructions which ThunderX considers to be
13407 	 simple arithmetic; simple shifts, for example, are missed here.  */
13408 if (prev_type == TYPE_ALUS_SREG
13409 || prev_type == TYPE_ALUS_IMM
13410 || prev_type == TYPE_LOGICS_REG
13411 || prev_type == TYPE_LOGICS_IMM)
13412 return true;
13415 return false;
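/* Illustrative instances of the pairs matched above (registers and
   symbols are arbitrary):

     AARCH64_FUSE_MOV_MOVK:    mov  x0, 0x1234
                               movk x0, 0x5678, lsl 16

     AARCH64_FUSE_ADRP_ADD:    adrp x1, sym
                               add  x1, x1, :lo12:sym

     AARCH64_FUSE_ADRP_LDR:    adrp x2, sym
                               ldr  x3, [x2, :lo12:sym]

     AARCH64_FUSE_CMP_BRANCH:  subs w0, w0, 1
                               b.ne .L1                                    */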
13418 /* Return true iff the instruction fusion described by OP is enabled. */
13420 bool
13421 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13423 return (aarch64_tune_params.fusible_ops & op) != 0;
13426 /* If MEM is in the form of [base+offset], extract the two parts of the
13427    address and store them in BASE and OFFSET; otherwise return false
13428    after clearing BASE and OFFSET.  */
13430 bool
13431 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13433 rtx addr;
13435 gcc_assert (MEM_P (mem));
13437 addr = XEXP (mem, 0);
13439 if (REG_P (addr))
13441 *base = addr;
13442 *offset = const0_rtx;
13443 return true;
13446 if (GET_CODE (addr) == PLUS
13447 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13449 *base = XEXP (addr, 0);
13450 *offset = XEXP (addr, 1);
13451 return true;
13454 *base = NULL_RTX;
13455 *offset = NULL_RTX;
13457 return false;
13460 /* Types for scheduling fusion. */
13461 enum sched_fusion_type
13463 SCHED_FUSION_NONE = 0,
13464 SCHED_FUSION_LD_SIGN_EXTEND,
13465 SCHED_FUSION_LD_ZERO_EXTEND,
13466 SCHED_FUSION_LD,
13467 SCHED_FUSION_ST,
13468 SCHED_FUSION_NUM
13471 /* If INSN is a load or store whose address is in the form [base+offset],
13472    extract the two parts and store them in BASE and OFFSET.  Return the
13473    scheduling fusion type of INSN.  */
13475 static enum sched_fusion_type
13476 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13478 rtx x, dest, src;
13479 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13481 gcc_assert (INSN_P (insn));
13482 x = PATTERN (insn);
13483 if (GET_CODE (x) != SET)
13484 return SCHED_FUSION_NONE;
13486 src = SET_SRC (x);
13487 dest = SET_DEST (x);
13489 machine_mode dest_mode = GET_MODE (dest);
13491 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13492 return SCHED_FUSION_NONE;
13494 if (GET_CODE (src) == SIGN_EXTEND)
13496 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13497 src = XEXP (src, 0);
13498 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13499 return SCHED_FUSION_NONE;
13501 else if (GET_CODE (src) == ZERO_EXTEND)
13503 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13504 src = XEXP (src, 0);
13505 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13506 return SCHED_FUSION_NONE;
13509 if (GET_CODE (src) == MEM && REG_P (dest))
13510 extract_base_offset_in_addr (src, base, offset);
13511 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13513 fusion = SCHED_FUSION_ST;
13514 extract_base_offset_in_addr (dest, base, offset);
13516 else
13517 return SCHED_FUSION_NONE;
13519 if (*base == NULL_RTX || *offset == NULL_RTX)
13520 fusion = SCHED_FUSION_NONE;
13522 return fusion;
13525 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13527    Currently we only support fusing ldr and str instructions, so FUSION_PRI
13528    and PRI are only calculated for these instructions.  For other instructions,
13529    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
13530    types of instruction fusion can be added by returning different priorities.
13532 It's important that irrelevant instructions get the largest FUSION_PRI. */
13534 static void
13535 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13536 int *fusion_pri, int *pri)
13538 int tmp, off_val;
13539 rtx base, offset;
13540 enum sched_fusion_type fusion;
13542 gcc_assert (INSN_P (insn));
13544 tmp = max_pri - 1;
13545 fusion = fusion_load_store (insn, &base, &offset);
13546 if (fusion == SCHED_FUSION_NONE)
13548 *pri = tmp;
13549 *fusion_pri = tmp;
13550 return;
13553 /* Set FUSION_PRI according to fusion type and base register. */
13554 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13556 /* Calculate PRI. */
13557 tmp /= 2;
13559 /* INSN with smaller offset goes first. */
13560 off_val = (int)(INTVAL (offset));
13561 if (off_val >= 0)
13562 tmp -= (off_val & 0xfffff);
13563 else
13564 tmp += ((- off_val) & 0xfffff);
13566 *pri = tmp;
13567 return;
13570 /* Given OPERANDS of consecutive load/store, check if we can merge
13571 them into ldp/stp. LOAD is true if they are load instructions.
13572 MODE is the mode of memory operands. */
13574 bool
13575 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13576 enum machine_mode mode)
13578 HOST_WIDE_INT offval_1, offval_2, msize;
13579 enum reg_class rclass_1, rclass_2;
13580 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13582 if (load)
13584 mem_1 = operands[1];
13585 mem_2 = operands[3];
13586 reg_1 = operands[0];
13587 reg_2 = operands[2];
13588 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13589 if (REGNO (reg_1) == REGNO (reg_2))
13590 return false;
13592 else
13594 mem_1 = operands[0];
13595 mem_2 = operands[2];
13596 reg_1 = operands[1];
13597 reg_2 = operands[3];
13600 /* The mems cannot be volatile. */
13601 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13602 return false;
13604 /* Check if the addresses are in the form of [base+offset]. */
13605 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13606 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13607 return false;
13608 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13609 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13610 return false;
13612 /* Check if the bases are same. */
13613 if (!rtx_equal_p (base_1, base_2))
13614 return false;
13616 offval_1 = INTVAL (offset_1);
13617 offval_2 = INTVAL (offset_2);
13618 msize = GET_MODE_SIZE (mode);
13619 /* Check if the offsets are consecutive. */
13620 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13621 return false;
13623 /* Check if the addresses are clobbered by load. */
13624 if (load)
13626 if (reg_mentioned_p (reg_1, mem_1))
13627 return false;
13629 /* In increasing order, the last load can clobber the address. */
13630 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13631 return false;
13634 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13635 rclass_1 = FP_REGS;
13636 else
13637 rclass_1 = GENERAL_REGS;
13639 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13640 rclass_2 = FP_REGS;
13641 else
13642 rclass_2 = GENERAL_REGS;
13644 /* Check if the registers are of same class. */
13645 if (rclass_1 != rclass_2)
13646 return false;
13648 return true;
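/* Worked example: the pair

     ldr  x0, [x2]
     ldr  x1, [x2, 8]

   passes the checks above (same base, consecutive DImode offsets, both
   destinations in GENERAL_REGS, no clobber of x2) and can therefore be
   merged into

     ldp  x0, x1, [x2]                                                     */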
13651 /* Given OPERANDS of consecutive load/store, check if we can merge
13652 them into ldp/stp by adjusting the offset. LOAD is true if they
13653 are load instructions. MODE is the mode of memory operands.
13655    Given the consecutive stores below:
13657 str w1, [xb, 0x100]
13658 str w1, [xb, 0x104]
13659 str w1, [xb, 0x108]
13660 str w1, [xb, 0x10c]
13662 Though the offsets are out of the range supported by stp, we can
13663 still pair them after adjusting the offset, like:
13665 add scratch, xb, 0x100
13666 stp w1, w1, [scratch]
13667 stp w1, w1, [scratch, 0x8]
13669 The peephole patterns detecting this opportunity should guarantee
13670    the scratch register is available.  */
13672 bool
13673 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13674 enum machine_mode mode)
13676 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13677 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13678 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13679 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13681 if (load)
13683 reg_1 = operands[0];
13684 mem_1 = operands[1];
13685 reg_2 = operands[2];
13686 mem_2 = operands[3];
13687 reg_3 = operands[4];
13688 mem_3 = operands[5];
13689 reg_4 = operands[6];
13690 mem_4 = operands[7];
13691 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13692 && REG_P (reg_3) && REG_P (reg_4));
13693 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13694 return false;
13696 else
13698 mem_1 = operands[0];
13699 reg_1 = operands[1];
13700 mem_2 = operands[2];
13701 reg_2 = operands[3];
13702 mem_3 = operands[4];
13703 reg_3 = operands[5];
13704 mem_4 = operands[6];
13705 reg_4 = operands[7];
13707   /* Skip if the memory operand is by itself valid for ldp/stp.  */
13708 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13709 return false;
13711 /* The mems cannot be volatile. */
13712 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13713       || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
13714 return false;
13716 /* Check if the addresses are in the form of [base+offset]. */
13717 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13718 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13719 return false;
13720 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13721 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13722 return false;
13723 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13724 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13725 return false;
13726 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13727 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13728 return false;
13730 /* Check if the bases are same. */
13731 if (!rtx_equal_p (base_1, base_2)
13732 || !rtx_equal_p (base_2, base_3)
13733 || !rtx_equal_p (base_3, base_4))
13734 return false;
13736 offval_1 = INTVAL (offset_1);
13737 offval_2 = INTVAL (offset_2);
13738 offval_3 = INTVAL (offset_3);
13739 offval_4 = INTVAL (offset_4);
13740 msize = GET_MODE_SIZE (mode);
13741 /* Check if the offsets are consecutive. */
13742 if ((offval_1 != (offval_2 + msize)
13743 || offval_1 != (offval_3 + msize * 2)
13744 || offval_1 != (offval_4 + msize * 3))
13745 && (offval_4 != (offval_3 + msize)
13746 || offval_4 != (offval_2 + msize * 2)
13747 || offval_4 != (offval_1 + msize * 3)))
13748 return false;
13750 /* Check if the addresses are clobbered by load. */
13751 if (load)
13753 if (reg_mentioned_p (reg_1, mem_1)
13754 || reg_mentioned_p (reg_2, mem_2)
13755 || reg_mentioned_p (reg_3, mem_3))
13756 return false;
13758 /* In increasing order, the last load can clobber the address. */
13759 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13760 return false;
13763 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13764 rclass_1 = FP_REGS;
13765 else
13766 rclass_1 = GENERAL_REGS;
13768 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13769 rclass_2 = FP_REGS;
13770 else
13771 rclass_2 = GENERAL_REGS;
13773 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13774 rclass_3 = FP_REGS;
13775 else
13776 rclass_3 = GENERAL_REGS;
13778 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13779 rclass_4 = FP_REGS;
13780 else
13781 rclass_4 = GENERAL_REGS;
13783 /* Check if the registers are of same class. */
13784 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
13785 return false;
13787 return true;
13790 /* Given OPERANDS of consecutive load/store, this function pairs them
13791 into ldp/stp after adjusting the offset. It depends on the fact
13792 that addresses of load/store instructions are in increasing order.
13793 MODE is the mode of memory operands. CODE is the rtl operator
13794 which should be applied to all memory operands, it's SIGN_EXTEND,
13795 ZERO_EXTEND or UNKNOWN. */
13797 bool
13798 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
13799 enum machine_mode mode, RTX_CODE code)
13801 rtx base, offset, t1, t2;
13802 rtx mem_1, mem_2, mem_3, mem_4;
13803 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
13805 if (load)
13807 mem_1 = operands[1];
13808 mem_2 = operands[3];
13809 mem_3 = operands[5];
13810 mem_4 = operands[7];
13812 else
13814 mem_1 = operands[0];
13815 mem_2 = operands[2];
13816 mem_3 = operands[4];
13817 mem_4 = operands[6];
13818 gcc_assert (code == UNKNOWN);
13821 extract_base_offset_in_addr (mem_1, &base, &offset);
13822 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
13824   /* Adjust the offset so that it can fit in an ldp/stp instruction.  */
13825 msize = GET_MODE_SIZE (mode);
13826 stp_off_limit = msize * 0x40;
13827 off_val = INTVAL (offset);
13828 abs_off = (off_val < 0) ? -off_val : off_val;
13829 new_off = abs_off % stp_off_limit;
13830 adj_off = abs_off - new_off;
13832 /* Further adjust to make sure all offsets are OK. */
13833 if ((new_off + msize * 2) >= stp_off_limit)
13835 adj_off += stp_off_limit;
13836 new_off -= stp_off_limit;
13839 /* Make sure the adjustment can be done with ADD/SUB instructions. */
13840 if (adj_off >= 0x1000)
13841 return false;
13843 if (off_val < 0)
13845 adj_off = -adj_off;
13846 new_off = -new_off;
13849 /* Create new memory references. */
13850 mem_1 = change_address (mem_1, VOIDmode,
13851 plus_constant (DImode, operands[8], new_off));
13853 /* Check if the adjusted address is OK for ldp/stp. */
13854 if (!aarch64_mem_pair_operand (mem_1, mode))
13855 return false;
13857 msize = GET_MODE_SIZE (mode);
13858 mem_2 = change_address (mem_2, VOIDmode,
13859 plus_constant (DImode,
13860 operands[8],
13861 new_off + msize));
13862 mem_3 = change_address (mem_3, VOIDmode,
13863 plus_constant (DImode,
13864 operands[8],
13865 new_off + msize * 2));
13866 mem_4 = change_address (mem_4, VOIDmode,
13867 plus_constant (DImode,
13868 operands[8],
13869 new_off + msize * 3));
13871 if (code == ZERO_EXTEND)
13873 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
13874 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
13875 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
13876 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
13878 else if (code == SIGN_EXTEND)
13880 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
13881 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
13882 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
13883 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
13886 if (load)
13888 operands[1] = mem_1;
13889 operands[3] = mem_2;
13890 operands[5] = mem_3;
13891 operands[7] = mem_4;
13893 else
13895 operands[0] = mem_1;
13896 operands[2] = mem_2;
13897 operands[4] = mem_3;
13898 operands[6] = mem_4;
13901 /* Emit adjusting instruction. */
13902 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
13903 /* Emit ldp/stp instructions. */
13904 t1 = gen_rtx_SET (operands[0], operands[1]);
13905 t2 = gen_rtx_SET (operands[2], operands[3]);
13906 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13907 t1 = gen_rtx_SET (operands[4], operands[5]);
13908 t2 = gen_rtx_SET (operands[6], operands[7]);
13909 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13910 return true;
13913 /* Return true if a pseudo register should be created and used to hold
13914    the GOT address for PIC code.  */
13916 bool
13917 aarch64_use_pseudo_pic_reg (void)
13919 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;

/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */

static int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}

/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */
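/* For instance (illustrative): 4.0 yields 2 and 1.0 yields 0, while
   0.5, -2.0 and 3.0 all yield -1, since they are not positive integral
   powers of 2.  */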

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}

/* If X is a vector of equal CONST_DOUBLE values and that value is
   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */

int
aarch64_vec_fpconst_pow_of_2 (rtx x)
{
  if (GET_CODE (x) != CONST_VECTOR)
    return -1;

  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
    return -1;

  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
  if (firstval <= 0)
    return -1;

  for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
      return -1;

  return firstval;
}

/* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float.  */
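/* Illustrative consequence: for "__fp16 a, b, c; c = a + b;" the
   addition is evaluated in float and only the final assignment
   truncates back to __fp16, consistent with the ACLE rules for
   __fp16 arithmetic.  */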
static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
    return float_type_node;
  return NULL_TREE;
}

/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */

static bool
aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
			   optimization_type opt_type)
{
  switch (op)
    {
    case rsqrt_optab:
      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);

    default:
      return true;
    }
}

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
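/* For example (illustrative; the exact layout is governed by the
   AAPCS64): with this hook returning true, a type such as

     struct s { char c; int : 4; };

   inherits the 4-byte alignment of the unnamed bitfield's declared
   type rather than the 1-byte alignment of its named members.  */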
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
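/* Illustratively (an assumption about the front end's expansion, not
   code from this file): the C++ front end then emits an inline check
   equivalent to "if ((guard & 1) == 0) ..." before running a static
   initializer, rather than testing the whole first byte of the guard
   variable.  */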
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
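/* For instance (illustrative): given

     struct s { volatile int flag : 1; };

   returning false here means a read of "flag" uses a 32-bit access to
   the int container rather than the narrowest possible (byte) access,
   as the AAPCS64 requires for volatile bitfields.  */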
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
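/* Illustrative encoding limits (not from the original sources):
   "ldrb w0, [x1, #4095]" is the largest unsigned-immediate form for a
   byte access, whereas an 8-byte "ldr x0, [x1, #32760]" reaches much
   further; since the access size is unknown here, the conservative
   4095 bound is used.  */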
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  aarch64_vectorize_vec_perm_const_ok

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
  aarch64_use_by_pieces_infrastructure_p

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"