[AArch64] Remove aarch64_simd_attr_length_move
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2016 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "diagnostic.h"
40 #include "insn-attr.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "stor-layout.h"
44 #include "calls.h"
45 #include "varasm.h"
46 #include "output.h"
47 #include "flags.h"
48 #include "explow.h"
49 #include "expr.h"
50 #include "reload.h"
51 #include "langhooks.h"
52 #include "opts.h"
53 #include "params.h"
54 #include "gimplify.h"
55 #include "dwarf2.h"
56 #include "gimple-iterator.h"
57 #include "tree-vectorizer.h"
58 #include "aarch64-cost-tables.h"
59 #include "dumpfile.h"
60 #include "builtins.h"
61 #include "rtl-iter.h"
62 #include "tm-constrs.h"
63 #include "sched-int.h"
64 #include "cortex-a57-fma-steering.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
68 /* This file should be included last. */
69 #include "target-def.h"
71 /* Defined for convenience. */
72 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
74 /* Classifies an address.
76 ADDRESS_REG_IMM
77 A simple base register plus immediate offset.
79 ADDRESS_REG_WB
80 A base register indexed by immediate offset with writeback.
82 ADDRESS_REG_REG
83 A base register indexed by (optionally scaled) register.
85 ADDRESS_REG_UXTW
86 A base register indexed by (optionally scaled) zero-extended register.
88 ADDRESS_REG_SXTW
89 A base register indexed by (optionally scaled) sign-extended register.
91 ADDRESS_LO_SUM
92 A LO_SUM rtx with a base register and "LO12" symbol relocation.
94 ADDRESS_SYMBOLIC:
95 A constant symbolic address, in pc-relative literal pool. */
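/* As a rough illustration (a sketch added here, not from the original
   source), these classifications correspond to AArch64 addressing forms
   such as:

     ADDRESS_REG_IMM    ldr   x1, [x0, 16]
     ADDRESS_REG_WB     ldr   x1, [x0, 16]!    (or post-indexed [x0], 16)
     ADDRESS_REG_REG    ldr   x1, [x0, x2, lsl 3]
     ADDRESS_REG_UXTW   ldr   x1, [x0, w2, uxtw 3]
     ADDRESS_REG_SXTW   ldr   x1, [x0, w2, sxtw 3]
     ADDRESS_LO_SUM     ldr   x1, [x0, #:lo12:foo]
     ADDRESS_SYMBOLIC   ldr   x1, .LC0          (pc-relative literal)  */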
97 enum aarch64_address_type {
98 ADDRESS_REG_IMM,
99 ADDRESS_REG_WB,
100 ADDRESS_REG_REG,
101 ADDRESS_REG_UXTW,
102 ADDRESS_REG_SXTW,
103 ADDRESS_LO_SUM,
104 ADDRESS_SYMBOLIC
107 struct aarch64_address_info {
108 enum aarch64_address_type type;
109 rtx base;
110 rtx offset;
111 int shift;
112 enum aarch64_symbol_type symbol_type;
115 struct simd_immediate_info
117 rtx value;
118 int shift;
119 int element_width;
120 bool mvn;
121 bool msl;
124 /* The current code model. */
125 enum aarch64_code_model aarch64_cmodel;
127 #ifdef HAVE_AS_TLS
128 #undef TARGET_HAVE_TLS
129 #define TARGET_HAVE_TLS 1
130 #endif
132 static bool aarch64_composite_type_p (const_tree, machine_mode);
133 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
134 const_tree,
135 machine_mode *, int *,
136 bool *);
137 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
139 static void aarch64_override_options_after_change (void);
140 static bool aarch64_vector_mode_supported_p (machine_mode);
141 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
142 const unsigned char *sel);
143 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
145 /* Major revision number of the ARM Architecture implemented by the target. */
146 unsigned aarch64_architecture_version;
148 /* The processor for which instructions should be scheduled. */
149 enum aarch64_processor aarch64_tune = cortexa53;
151 /* Mask to specify which instruction scheduling options should be used. */
152 unsigned long aarch64_tune_flags = 0;
154 /* Global flag for PC relative loads. */
155 bool aarch64_nopcrelative_literal_loads;
157 /* Support for command line parsing of boolean flags in the tuning
158 structures. */
159 struct aarch64_flag_desc
161 const char* name;
162 unsigned int flag;
165 #define AARCH64_FUSION_PAIR(name, internal_name) \
166 { name, AARCH64_FUSE_##internal_name },
167 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
169 { "none", AARCH64_FUSE_NOTHING },
170 #include "aarch64-fusion-pairs.def"
171 { "all", AARCH64_FUSE_ALL },
172 { NULL, AARCH64_FUSE_NOTHING }
 174 #undef AARCH64_FUSION_PAIR
176 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
177 { name, AARCH64_EXTRA_TUNE_##internal_name },
178 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
180 { "none", AARCH64_EXTRA_TUNE_NONE },
181 #include "aarch64-tuning-flags.def"
182 { "all", AARCH64_EXTRA_TUNE_ALL },
183 { NULL, AARCH64_EXTRA_TUNE_NONE }
185 #undef AARCH64_EXTRA_TUNING_OPTION
187 /* Tuning parameters. */
189 static const struct cpu_addrcost_table generic_addrcost_table =
192 0, /* hi */
193 0, /* si */
194 0, /* di */
195 0, /* ti */
197 0, /* pre_modify */
198 0, /* post_modify */
199 0, /* register_offset */
200 0, /* register_sextend */
201 0, /* register_zextend */
202 0 /* imm_offset */
205 static const struct cpu_addrcost_table cortexa57_addrcost_table =
208 1, /* hi */
209 0, /* si */
210 0, /* di */
211 1, /* ti */
213 0, /* pre_modify */
214 0, /* post_modify */
215 0, /* register_offset */
216 0, /* register_sextend */
217 0, /* register_zextend */
218 0, /* imm_offset */
221 static const struct cpu_addrcost_table exynosm1_addrcost_table =
224 0, /* hi */
225 0, /* si */
226 0, /* di */
227 2, /* ti */
229 0, /* pre_modify */
230 0, /* post_modify */
231 1, /* register_offset */
232 1, /* register_sextend */
233 2, /* register_zextend */
234 0, /* imm_offset */
237 static const struct cpu_addrcost_table xgene1_addrcost_table =
240 1, /* hi */
241 0, /* si */
242 0, /* di */
243 1, /* ti */
245 1, /* pre_modify */
246 0, /* post_modify */
247 0, /* register_offset */
248 1, /* register_sextend */
249 1, /* register_zextend */
250 0, /* imm_offset */
253 static const struct cpu_regmove_cost generic_regmove_cost =
255 1, /* GP2GP */
256 /* Avoid the use of slow int<->fp moves for spilling by setting
257 their cost higher than memmov_cost. */
258 5, /* GP2FP */
259 5, /* FP2GP */
260 2 /* FP2FP */
263 static const struct cpu_regmove_cost cortexa57_regmove_cost =
265 1, /* GP2GP */
266 /* Avoid the use of slow int<->fp moves for spilling by setting
267 their cost higher than memmov_cost. */
268 5, /* GP2FP */
269 5, /* FP2GP */
270 2 /* FP2FP */
273 static const struct cpu_regmove_cost cortexa53_regmove_cost =
275 1, /* GP2GP */
276 /* Avoid the use of slow int<->fp moves for spilling by setting
277 their cost higher than memmov_cost. */
278 5, /* GP2FP */
279 5, /* FP2GP */
280 2 /* FP2FP */
283 static const struct cpu_regmove_cost exynosm1_regmove_cost =
285 1, /* GP2GP */
286 /* Avoid the use of slow int<->fp moves for spilling by setting
287 their cost higher than memmov_cost (actual, 4 and 9). */
288 9, /* GP2FP */
289 9, /* FP2GP */
290 1 /* FP2FP */
293 static const struct cpu_regmove_cost thunderx_regmove_cost =
295 2, /* GP2GP */
296 2, /* GP2FP */
297 6, /* FP2GP */
298 4 /* FP2FP */
301 static const struct cpu_regmove_cost xgene1_regmove_cost =
303 1, /* GP2GP */
304 /* Avoid the use of slow int<->fp moves for spilling by setting
305 their cost higher than memmov_cost. */
306 8, /* GP2FP */
307 8, /* FP2GP */
308 2 /* FP2FP */
311 /* Generic costs for vector insn classes. */
312 static const struct cpu_vector_cost generic_vector_cost =
314 1, /* scalar_stmt_cost */
315 1, /* scalar_load_cost */
316 1, /* scalar_store_cost */
317 1, /* vec_stmt_cost */
318 2, /* vec_permute_cost */
319 1, /* vec_to_scalar_cost */
320 1, /* scalar_to_vec_cost */
321 1, /* vec_align_load_cost */
322 1, /* vec_unalign_load_cost */
323 1, /* vec_unalign_store_cost */
324 1, /* vec_store_cost */
325 3, /* cond_taken_branch_cost */
326 1 /* cond_not_taken_branch_cost */
 329 /* Cortex-A57 costs for vector insn classes.  */
330 static const struct cpu_vector_cost cortexa57_vector_cost =
332 1, /* scalar_stmt_cost */
333 4, /* scalar_load_cost */
334 1, /* scalar_store_cost */
335 3, /* vec_stmt_cost */
336 3, /* vec_permute_cost */
337 8, /* vec_to_scalar_cost */
338 8, /* scalar_to_vec_cost */
339 5, /* vec_align_load_cost */
340 5, /* vec_unalign_load_cost */
341 1, /* vec_unalign_store_cost */
342 1, /* vec_store_cost */
343 1, /* cond_taken_branch_cost */
344 1 /* cond_not_taken_branch_cost */
347 static const struct cpu_vector_cost exynosm1_vector_cost =
349 1, /* scalar_stmt_cost */
350 5, /* scalar_load_cost */
351 1, /* scalar_store_cost */
352 3, /* vec_stmt_cost */
353 3, /* vec_permute_cost */
354 3, /* vec_to_scalar_cost */
355 3, /* scalar_to_vec_cost */
356 5, /* vec_align_load_cost */
357 5, /* vec_unalign_load_cost */
358 1, /* vec_unalign_store_cost */
359 1, /* vec_store_cost */
360 1, /* cond_taken_branch_cost */
361 1 /* cond_not_taken_branch_cost */
 364 /* X-Gene 1 costs for vector insn classes.  */
365 static const struct cpu_vector_cost xgene1_vector_cost =
367 1, /* scalar_stmt_cost */
368 5, /* scalar_load_cost */
369 1, /* scalar_store_cost */
370 2, /* vec_stmt_cost */
371 2, /* vec_permute_cost */
372 4, /* vec_to_scalar_cost */
373 4, /* scalar_to_vec_cost */
374 10, /* vec_align_load_cost */
375 10, /* vec_unalign_load_cost */
376 2, /* vec_unalign_store_cost */
377 2, /* vec_store_cost */
378 2, /* cond_taken_branch_cost */
379 1 /* cond_not_taken_branch_cost */
382 /* Generic costs for branch instructions. */
383 static const struct cpu_branch_cost generic_branch_cost =
385 2, /* Predictable. */
386 2 /* Unpredictable. */
389 /* Branch costs for Cortex-A57. */
390 static const struct cpu_branch_cost cortexa57_branch_cost =
392 1, /* Predictable. */
393 3 /* Unpredictable. */
396 static const struct tune_params generic_tunings =
398 &cortexa57_extra_costs,
399 &generic_addrcost_table,
400 &generic_regmove_cost,
401 &generic_vector_cost,
402 &generic_branch_cost,
403 4, /* memmov_cost */
404 2, /* issue_rate */
405 AARCH64_FUSE_NOTHING, /* fusible_ops */
406 8, /* function_align. */
407 8, /* jump_align. */
408 4, /* loop_align. */
409 2, /* int_reassoc_width. */
410 4, /* fp_reassoc_width. */
411 1, /* vec_reassoc_width. */
412 2, /* min_div_recip_mul_sf. */
413 2, /* min_div_recip_mul_df. */
414 0, /* max_case_values. */
415 0, /* cache_line_size. */
416 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
417 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
420 static const struct tune_params cortexa35_tunings =
422 &cortexa53_extra_costs,
423 &generic_addrcost_table,
424 &cortexa53_regmove_cost,
425 &generic_vector_cost,
426 &generic_branch_cost,
427 4, /* memmov_cost */
428 1, /* issue_rate */
429 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
430 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
431 8, /* function_align. */
432 8, /* jump_align. */
433 4, /* loop_align. */
434 2, /* int_reassoc_width. */
435 4, /* fp_reassoc_width. */
436 1, /* vec_reassoc_width. */
437 2, /* min_div_recip_mul_sf. */
438 2, /* min_div_recip_mul_df. */
439 0, /* max_case_values. */
440 0, /* cache_line_size. */
441 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
442 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
445 static const struct tune_params cortexa53_tunings =
447 &cortexa53_extra_costs,
448 &generic_addrcost_table,
449 &cortexa53_regmove_cost,
450 &generic_vector_cost,
451 &generic_branch_cost,
452 4, /* memmov_cost */
453 2, /* issue_rate */
454 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
455 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
456 8, /* function_align. */
457 8, /* jump_align. */
458 4, /* loop_align. */
459 2, /* int_reassoc_width. */
460 4, /* fp_reassoc_width. */
461 1, /* vec_reassoc_width. */
462 2, /* min_div_recip_mul_sf. */
463 2, /* min_div_recip_mul_df. */
464 0, /* max_case_values. */
465 0, /* cache_line_size. */
466 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
467 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
470 static const struct tune_params cortexa57_tunings =
472 &cortexa57_extra_costs,
473 &cortexa57_addrcost_table,
474 &cortexa57_regmove_cost,
475 &cortexa57_vector_cost,
476 &cortexa57_branch_cost,
477 4, /* memmov_cost */
478 3, /* issue_rate */
479 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
480 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
481 16, /* function_align. */
482 8, /* jump_align. */
483 4, /* loop_align. */
484 2, /* int_reassoc_width. */
485 4, /* fp_reassoc_width. */
486 1, /* vec_reassoc_width. */
487 2, /* min_div_recip_mul_sf. */
488 2, /* min_div_recip_mul_df. */
489 0, /* max_case_values. */
490 0, /* cache_line_size. */
491 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
492 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
495 static const struct tune_params cortexa72_tunings =
497 &cortexa57_extra_costs,
498 &cortexa57_addrcost_table,
499 &cortexa57_regmove_cost,
500 &cortexa57_vector_cost,
501 &generic_branch_cost,
502 4, /* memmov_cost */
503 3, /* issue_rate */
504 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
505 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
506 16, /* function_align. */
507 8, /* jump_align. */
508 4, /* loop_align. */
509 2, /* int_reassoc_width. */
510 4, /* fp_reassoc_width. */
511 1, /* vec_reassoc_width. */
512 2, /* min_div_recip_mul_sf. */
513 2, /* min_div_recip_mul_df. */
514 0, /* max_case_values. */
515 0, /* cache_line_size. */
516 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
517 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
520 static const struct tune_params exynosm1_tunings =
522 &exynosm1_extra_costs,
523 &exynosm1_addrcost_table,
524 &exynosm1_regmove_cost,
525 &exynosm1_vector_cost,
526 &generic_branch_cost,
527 4, /* memmov_cost */
528 3, /* issue_rate */
529 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
530 4, /* function_align. */
531 4, /* jump_align. */
532 4, /* loop_align. */
533 2, /* int_reassoc_width. */
534 4, /* fp_reassoc_width. */
535 1, /* vec_reassoc_width. */
536 2, /* min_div_recip_mul_sf. */
537 2, /* min_div_recip_mul_df. */
538 48, /* max_case_values. */
539 64, /* cache_line_size. */
540 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
541 (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */
544 static const struct tune_params thunderx_tunings =
546 &thunderx_extra_costs,
547 &generic_addrcost_table,
548 &thunderx_regmove_cost,
549 &generic_vector_cost,
550 &generic_branch_cost,
551 6, /* memmov_cost */
552 2, /* issue_rate */
553 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
554 8, /* function_align. */
555 8, /* jump_align. */
556 8, /* loop_align. */
557 2, /* int_reassoc_width. */
558 4, /* fp_reassoc_width. */
559 1, /* vec_reassoc_width. */
560 2, /* min_div_recip_mul_sf. */
561 2, /* min_div_recip_mul_df. */
562 0, /* max_case_values. */
563 0, /* cache_line_size. */
564 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
565 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
568 static const struct tune_params xgene1_tunings =
570 &xgene1_extra_costs,
571 &xgene1_addrcost_table,
572 &xgene1_regmove_cost,
573 &xgene1_vector_cost,
574 &generic_branch_cost,
575 6, /* memmov_cost */
576 4, /* issue_rate */
577 AARCH64_FUSE_NOTHING, /* fusible_ops */
578 16, /* function_align. */
579 8, /* jump_align. */
580 16, /* loop_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 0, /* cache_line_size. */
588 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
589 (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */
592 /* Support for fine-grained override of the tuning structures. */
593 struct aarch64_tuning_override_function
595 const char* name;
596 void (*parse_override)(const char*, struct tune_params*);
599 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
600 static void aarch64_parse_tune_string (const char*, struct tune_params*);
602 static const struct aarch64_tuning_override_function
603 aarch64_tuning_override_functions[] =
605 { "fuse", aarch64_parse_fuse_string },
606 { "tune", aarch64_parse_tune_string },
607 { NULL, NULL }
610 /* A processor implementing AArch64. */
611 struct processor
613 const char *const name;
614 enum aarch64_processor ident;
615 enum aarch64_processor sched_core;
616 enum aarch64_arch arch;
617 unsigned architecture_version;
618 const unsigned long flags;
619 const struct tune_params *const tune;
622 /* Architectures implementing AArch64. */
623 static const struct processor all_architectures[] =
625 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
626 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
627 #include "aarch64-arches.def"
628 #undef AARCH64_ARCH
629 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
632 /* Processor cores implementing AArch64. */
633 static const struct processor all_cores[] =
635 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
636 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
637 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
638 FLAGS, &COSTS##_tunings},
639 #include "aarch64-cores.def"
640 #undef AARCH64_CORE
641 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
642 AARCH64_FL_FOR_ARCH8, &generic_tunings},
643 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
647 /* Target specification. These are populated by the -march, -mtune, -mcpu
648 handling code or by target attributes. */
649 static const struct processor *selected_arch;
650 static const struct processor *selected_cpu;
651 static const struct processor *selected_tune;
653 /* The current tuning set. */
654 struct tune_params aarch64_tune_params = generic_tunings;
656 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
658 /* An ISA extension in the co-processor and main instruction set space. */
659 struct aarch64_option_extension
661 const char *const name;
662 const unsigned long flags_on;
663 const unsigned long flags_off;
666 typedef enum aarch64_cond_code
668 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
669 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
670 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
672 aarch64_cc;
674 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
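/* Worked examples (illustrative, not from the original source): inverse
   conditions are paired at even/odd indices, so flipping bit 0 inverts
   the condition:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE   (0 ^ 1)
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT   (10 ^ 1)
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_HI) == AARCH64_LS   (8 ^ 1)  */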
676 /* The condition codes of the processor, and the inverse function. */
677 static const char * const aarch64_condition_codes[] =
679 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
680 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
683 /* Generate code to enable conditional branches in functions over 1 MiB. */
684 const char *
685 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
686 const char * branch_format)
688 rtx_code_label * tmp_label = gen_label_rtx ();
689 char label_buf[256];
690 char buffer[128];
691 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
692 CODE_LABEL_NUMBER (tmp_label));
693 const char *label_ptr = targetm.strip_name_encoding (label_buf);
694 rtx dest_label = operands[pos_label];
695 operands[pos_label] = tmp_label;
697 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
698 output_asm_insn (buffer, operands);
700 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
701 operands[pos_label] = dest_label;
702 output_asm_insn (buffer, operands);
703 return "";
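/* A sketch of the output (illustrative, not from the original source),
   assuming the caller passes the inverted condition in BRANCH_FORMAT as
   the far-branch patterns in aarch64.md do: an out-of-range
   "cbz w0, .Lfar" is emitted as

     cbnz    w0, .Ltmp    // short-range branch on the inverted condition
     b       .Lfar        // unconditional branch reaches the real target
   .Ltmp:

   where .Ltmp is the internal label generated above.  */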
706 void
707 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
709 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
710 if (TARGET_GENERAL_REGS_ONLY)
711 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
712 else
713 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
716 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
717 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
718 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
719 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
720 cost (in this case the best class is the lowest cost one). Using ALL_REGS
 721 irrespective of its cost results in bad allocations with many redundant
722 int<->FP moves which are expensive on various cores.
723 To avoid this we don't allow ALL_REGS as the allocno class, but force a
724 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
725 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
726 Otherwise set the allocno class depending on the mode.
727 The result of this is that it is no longer inefficient to have a higher
728 memory move cost than the register move cost.
731 static reg_class_t
732 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
733 reg_class_t best_class)
735 enum machine_mode mode;
737 if (allocno_class != ALL_REGS)
738 return allocno_class;
740 if (best_class != ALL_REGS)
741 return best_class;
743 mode = PSEUDO_REGNO_MODE (regno);
744 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
747 static unsigned int
748 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
750 if (GET_MODE_UNIT_SIZE (mode) == 4)
751 return aarch64_tune_params.min_div_recip_mul_sf;
752 return aarch64_tune_params.min_div_recip_mul_df;
755 static int
756 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
757 enum machine_mode mode)
759 if (VECTOR_MODE_P (mode))
760 return aarch64_tune_params.vec_reassoc_width;
761 if (INTEGRAL_MODE_P (mode))
762 return aarch64_tune_params.int_reassoc_width;
763 if (FLOAT_MODE_P (mode))
764 return aarch64_tune_params.fp_reassoc_width;
765 return 1;
768 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
769 unsigned
770 aarch64_dbx_register_number (unsigned regno)
772 if (GP_REGNUM_P (regno))
773 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
774 else if (regno == SP_REGNUM)
775 return AARCH64_DWARF_SP;
776 else if (FP_REGNUM_P (regno))
777 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
779 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
780 equivalent DWARF register. */
781 return DWARF_FRAME_REGISTERS;
784 /* Return TRUE if MODE is any of the large INT modes. */
785 static bool
786 aarch64_vect_struct_mode_p (machine_mode mode)
788 return mode == OImode || mode == CImode || mode == XImode;
791 /* Return TRUE if MODE is any of the vector modes. */
792 static bool
793 aarch64_vector_mode_p (machine_mode mode)
795 return aarch64_vector_mode_supported_p (mode)
796 || aarch64_vect_struct_mode_p (mode);
799 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
800 static bool
801 aarch64_array_mode_supported_p (machine_mode mode,
802 unsigned HOST_WIDE_INT nelems)
804 if (TARGET_SIMD
805 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
806 || AARCH64_VALID_SIMD_DREG_MODE (mode))
807 && (nelems >= 2 && nelems <= 4))
808 return true;
810 return false;
813 /* Implement HARD_REGNO_NREGS. */
816 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
818 switch (aarch64_regno_regclass (regno))
820 case FP_REGS:
821 case FP_LO_REGS:
822 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
823 default:
824 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
826 gcc_unreachable ();
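/* Worked examples (illustrative, not from the original source), with
   UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16:

     TImode   (16 bytes) in a general register -> 2 X-registers
     V4SImode (16 bytes) in an FP register     -> 1 Q-register
     OImode   (32 bytes) in FP registers       -> 2 Q-registers  */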
829 /* Implement HARD_REGNO_MODE_OK. */
832 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
834 if (GET_MODE_CLASS (mode) == MODE_CC)
835 return regno == CC_REGNUM;
837 if (regno == SP_REGNUM)
838 /* The purpose of comparing with ptr_mode is to support the
839 global register variable associated with the stack pointer
840 register via the syntax of asm ("wsp") in ILP32. */
841 return mode == Pmode || mode == ptr_mode;
843 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
844 return mode == Pmode;
846 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
847 return 1;
849 if (FP_REGNUM_P (regno))
851 if (aarch64_vect_struct_mode_p (mode))
852 return
853 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
854 else
855 return 1;
858 return 0;
861 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
862 machine_mode
863 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
864 machine_mode mode)
866 /* Handle modes that fit within single registers. */
867 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
869 if (GET_MODE_SIZE (mode) >= 4)
870 return mode;
871 else
872 return SImode;
874 /* Fall back to generic for multi-reg and very large modes. */
875 else
876 return choose_hard_reg_mode (regno, nregs, false);
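/* For instance (a sketch, not from the original source): a single-register
   DFmode value keeps DFmode, an HImode value is widened to SImode so the
   caller-save code uses a full 32-bit access, and multi-register values
   fall back to choose_hard_reg_mode.  */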
879 /* Return true if calls to DECL should be treated as
880 long-calls (ie called via a register). */
881 static bool
882 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
884 return false;
887 /* Return true if calls to symbol-ref SYM should be treated as
888 long-calls (ie called via a register). */
889 bool
890 aarch64_is_long_call_p (rtx sym)
892 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
895 /* Return true if calls to symbol-ref SYM should not go through
896 plt stubs. */
898 bool
899 aarch64_is_noplt_call_p (rtx sym)
901 const_tree decl = SYMBOL_REF_DECL (sym);
903 if (flag_pic
904 && decl
905 && (!flag_plt
906 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
907 && !targetm.binds_local_p (decl))
908 return true;
910 return false;
913 /* Return true if the offsets to a zero/sign-extract operation
914 represent an expression that matches an extend operation. The
 915 operands represent the parameters from
917 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
918 bool
919 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
920 rtx extract_imm)
922 HOST_WIDE_INT mult_val, extract_val;
924 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
925 return false;
927 mult_val = INTVAL (mult_imm);
928 extract_val = INTVAL (extract_imm);
930 if (extract_val > 8
931 && extract_val < GET_MODE_BITSIZE (mode)
932 && exact_log2 (extract_val & ~7) > 0
933 && (extract_val & 7) <= 4
934 && mult_val == (1 << (extract_val & 7)))
935 return true;
937 return false;
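/* A worked example (illustrative, not from the original source): for
   DImode, MULT_IMM == 4 and EXTRACT_IMM == 34 pass the checks above
   (34 & ~7 == 32, a power of two; 34 & 7 == 2; and 4 == 1 << 2), i.e. a
   32-bit zero-extend combined with a left shift by 2, as in

     add     x0, x1, w2, uxtw 2  */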
940 /* Emit an insn that's a simple single-set. Both the operands must be
941 known to be valid. */
942 inline static rtx
943 emit_set_insn (rtx x, rtx y)
945 return emit_insn (gen_rtx_SET (x, y));
948 /* X and Y are two things to compare using CODE. Emit the compare insn and
949 return the rtx for register 0 in the proper mode. */
951 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
953 machine_mode mode = SELECT_CC_MODE (code, x, y);
954 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
956 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
957 return cc_reg;
960 /* Build the SYMBOL_REF for __tls_get_addr. */
962 static GTY(()) rtx tls_get_addr_libfunc;
965 aarch64_tls_get_addr (void)
967 if (!tls_get_addr_libfunc)
968 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
969 return tls_get_addr_libfunc;
972 /* Return the TLS model to use for ADDR. */
974 static enum tls_model
975 tls_symbolic_operand_type (rtx addr)
977 enum tls_model tls_kind = TLS_MODEL_NONE;
978 rtx sym, addend;
980 if (GET_CODE (addr) == CONST)
982 split_const (addr, &sym, &addend);
983 if (GET_CODE (sym) == SYMBOL_REF)
984 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
986 else if (GET_CODE (addr) == SYMBOL_REF)
987 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
989 return tls_kind;
 992 /* We allow LO_SUMs in our legitimate addresses so that combine
 993 can take care of combining addresses where necessary, but for
 994 code generation purposes we generate the address
 995 as:
996 RTL Absolute
997 tmp = hi (symbol_ref); adrp x1, foo
998 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1001 PIC TLS
1002 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1003 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1004 bl __tls_get_addr
1007 Load TLS symbol, depending on TLS mechanism and TLS access model.
1009 Global Dynamic - Traditional TLS:
1010 adrp tmp, :tlsgd:imm
1011 add dest, tmp, #:tlsgd_lo12:imm
1012 bl __tls_get_addr
1014 Global Dynamic - TLS Descriptors:
1015 adrp dest, :tlsdesc:imm
1016 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1017 add dest, dest, #:tlsdesc_lo12:imm
1018 blr tmp
1019 mrs tp, tpidr_el0
1020 add dest, dest, tp
1022 Initial Exec:
1023 mrs tp, tpidr_el0
1024 adrp tmp, :gottprel:imm
1025 ldr dest, [tmp, #:gottprel_lo12:imm]
1026 add dest, dest, tp
1028 Local Exec:
1029 mrs tp, tpidr_el0
1030 add t0, tp, #:tprel_hi12:imm, lsl #12
1031 add t0, t0, #:tprel_lo12_nc:imm
1034 static void
1035 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1036 enum aarch64_symbol_type type)
1038 switch (type)
1040 case SYMBOL_SMALL_ABSOLUTE:
1042 /* In ILP32, the mode of dest can be either SImode or DImode. */
1043 rtx tmp_reg = dest;
1044 machine_mode mode = GET_MODE (dest);
1046 gcc_assert (mode == Pmode || mode == ptr_mode);
1048 if (can_create_pseudo_p ())
1049 tmp_reg = gen_reg_rtx (mode);
1051 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1052 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1053 return;
1056 case SYMBOL_TINY_ABSOLUTE:
1057 emit_insn (gen_rtx_SET (dest, imm));
1058 return;
1060 case SYMBOL_SMALL_GOT_28K:
1062 machine_mode mode = GET_MODE (dest);
1063 rtx gp_rtx = pic_offset_table_rtx;
1064 rtx insn;
1065 rtx mem;
1067 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1068 here before rtl expansion.  Tree IVOPTS will generate rtl patterns to
1069 decide rtx costs, in which case pic_offset_table_rtx is not
1070 initialized.  In that case there is no need to generate the first adrp
1071 instruction, as the final cost for a global variable access is
1072 one instruction.  */
1073 if (gp_rtx != NULL)
1075 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1076 use the page base as the GOT base, the first page may be wasted; in
1077 the worst case only 28K of space is left for the GOT).
1079 The generated instruction sequence for accessing a global variable is:
1082 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1084 Only one instruction is needed, but we must initialize
1085 pic_offset_table_rtx properly.  We generate an initialization insn for
1086 every global access and let CSE remove all the redundant ones.
1088 The final instruction sequence will look like the following when
1089 multiple global variables are accessed.
1091 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1093 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1094 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1095 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1096 ... */
1098 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1099 crtl->uses_pic_offset_table = 1;
1100 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1102 if (mode != GET_MODE (gp_rtx))
1103 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1106 if (mode == ptr_mode)
1108 if (mode == DImode)
1109 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1110 else
1111 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1113 mem = XVECEXP (SET_SRC (insn), 0, 0);
1115 else
1117 gcc_assert (mode == Pmode);
1119 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1120 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1123 /* The operand is expected to be a MEM.  Whenever the related insn
1124 pattern changes, the above code which calculates MEM should be
1125 updated.  */
1126 gcc_assert (GET_CODE (mem) == MEM);
1127 MEM_READONLY_P (mem) = 1;
1128 MEM_NOTRAP_P (mem) = 1;
1129 emit_insn (insn);
1130 return;
1133 case SYMBOL_SMALL_GOT_4G:
1135 /* In ILP32, the mode of dest can be either SImode or DImode,
1136 while the got entry is always of SImode size. The mode of
1137 dest depends on how dest is used: if dest is assigned to a
1138 pointer (e.g. in the memory), it has SImode; it may have
1139 DImode if dest is dereferenced to access the memory.
1140 This is why we have to handle three different ldr_got_small
1141 patterns here (two patterns for ILP32). */
1143 rtx insn;
1144 rtx mem;
1145 rtx tmp_reg = dest;
1146 machine_mode mode = GET_MODE (dest);
1148 if (can_create_pseudo_p ())
1149 tmp_reg = gen_reg_rtx (mode);
1151 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1152 if (mode == ptr_mode)
1154 if (mode == DImode)
1155 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1156 else
1157 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1159 mem = XVECEXP (SET_SRC (insn), 0, 0);
1161 else
1163 gcc_assert (mode == Pmode);
1165 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1166 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1169 gcc_assert (GET_CODE (mem) == MEM);
1170 MEM_READONLY_P (mem) = 1;
1171 MEM_NOTRAP_P (mem) = 1;
1172 emit_insn (insn);
1173 return;
1176 case SYMBOL_SMALL_TLSGD:
1178 rtx_insn *insns;
1179 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1181 start_sequence ();
1182 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1183 insns = get_insns ();
1184 end_sequence ();
1186 RTL_CONST_CALL_P (insns) = 1;
1187 emit_libcall_block (insns, dest, result, imm);
1188 return;
1191 case SYMBOL_SMALL_TLSDESC:
1193 machine_mode mode = GET_MODE (dest);
1194 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1195 rtx tp;
1197 gcc_assert (mode == Pmode || mode == ptr_mode);
1199 /* In ILP32, the got entry is always of SImode size. Unlike
1200 small GOT, the dest is fixed at reg 0. */
1201 if (TARGET_ILP32)
1202 emit_insn (gen_tlsdesc_small_si (imm));
1203 else
1204 emit_insn (gen_tlsdesc_small_di (imm));
1205 tp = aarch64_load_tp (NULL);
1207 if (mode != Pmode)
1208 tp = gen_lowpart (mode, tp);
1210 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1211 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1212 return;
1215 case SYMBOL_SMALL_TLSIE:
1217 /* In ILP32, the mode of dest can be either SImode or DImode,
1218 while the got entry is always of SImode size. The mode of
1219 dest depends on how dest is used: if dest is assigned to a
1220 pointer (e.g. in the memory), it has SImode; it may have
1221 DImode if dest is dereferenced to access the memory.
1222 This is why we have to handle three different tlsie_small
1223 patterns here (two patterns for ILP32). */
1224 machine_mode mode = GET_MODE (dest);
1225 rtx tmp_reg = gen_reg_rtx (mode);
1226 rtx tp = aarch64_load_tp (NULL);
1228 if (mode == ptr_mode)
1230 if (mode == DImode)
1231 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1232 else
1234 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1235 tp = gen_lowpart (mode, tp);
1238 else
1240 gcc_assert (mode == Pmode);
1241 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1244 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1245 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1246 return;
1249 case SYMBOL_TLSLE12:
1250 case SYMBOL_TLSLE24:
1251 case SYMBOL_TLSLE32:
1252 case SYMBOL_TLSLE48:
1254 machine_mode mode = GET_MODE (dest);
1255 rtx tp = aarch64_load_tp (NULL);
1257 if (mode != Pmode)
1258 tp = gen_lowpart (mode, tp);
1260 switch (type)
1262 case SYMBOL_TLSLE12:
1263 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1264 (dest, tp, imm));
1265 break;
1266 case SYMBOL_TLSLE24:
1267 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1268 (dest, tp, imm));
1269 break;
1270 case SYMBOL_TLSLE32:
1271 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1272 (dest, imm));
1273 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1274 (dest, dest, tp));
1275 break;
1276 case SYMBOL_TLSLE48:
1277 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1278 (dest, imm));
1279 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1280 (dest, dest, tp));
1281 break;
1282 default:
1283 gcc_unreachable ();
1286 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1287 return;
1290 case SYMBOL_TINY_GOT:
1291 emit_insn (gen_ldr_got_tiny (dest, imm));
1292 return;
1294 case SYMBOL_TINY_TLSIE:
1296 machine_mode mode = GET_MODE (dest);
1297 rtx tp = aarch64_load_tp (NULL);
1299 if (mode == ptr_mode)
1301 if (mode == DImode)
1302 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1303 else
1305 tp = gen_lowpart (mode, tp);
1306 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1309 else
1311 gcc_assert (mode == Pmode);
1312 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1315 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1316 return;
1319 default:
1320 gcc_unreachable ();
1324 /* Emit a move from SRC to DEST. Assume that the move expanders can
1325 handle all moves if !can_create_pseudo_p (). The distinction is
1326 important because, unlike emit_move_insn, the move expanders know
1327 how to force Pmode objects into the constant pool even when the
1328 constant pool address is not itself legitimate. */
1329 static rtx
1330 aarch64_emit_move (rtx dest, rtx src)
1332 return (can_create_pseudo_p ()
1333 ? emit_move_insn (dest, src)
1334 : emit_move_insn_1 (dest, src));
1337 /* Split a 128-bit move operation into two 64-bit move operations,
1338 taking care to handle partial overlap of register to register
1339 copies. Special cases are needed when moving between GP regs and
1340 FP regs. SRC can be a register, constant or memory; DST a register
1341 or memory. If either operand is memory it must not have any side
1342 effects. */
1343 void
1344 aarch64_split_128bit_move (rtx dst, rtx src)
1346 rtx dst_lo, dst_hi;
1347 rtx src_lo, src_hi;
1349 machine_mode mode = GET_MODE (dst);
1351 gcc_assert (mode == TImode || mode == TFmode);
1352 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1353 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1355 if (REG_P (dst) && REG_P (src))
1357 int src_regno = REGNO (src);
1358 int dst_regno = REGNO (dst);
1360 /* Handle FP <-> GP regs. */
1361 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1363 src_lo = gen_lowpart (word_mode, src);
1364 src_hi = gen_highpart (word_mode, src);
1366 if (mode == TImode)
1368 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1369 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1371 else
1373 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1374 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1376 return;
1378 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1380 dst_lo = gen_lowpart (word_mode, dst);
1381 dst_hi = gen_highpart (word_mode, dst);
1383 if (mode == TImode)
1385 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1386 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1388 else
1390 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1391 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1393 return;
1397 dst_lo = gen_lowpart (word_mode, dst);
1398 dst_hi = gen_highpart (word_mode, dst);
1399 src_lo = gen_lowpart (word_mode, src);
1400 src_hi = gen_highpart_mode (word_mode, mode, src);
1402 /* At most one pairing may overlap. */
1403 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1405 aarch64_emit_move (dst_hi, src_hi);
1406 aarch64_emit_move (dst_lo, src_lo);
1408 else
1410 aarch64_emit_move (dst_lo, src_lo);
1411 aarch64_emit_move (dst_hi, src_hi);
1415 bool
1416 aarch64_split_128bit_move_p (rtx dst, rtx src)
1418 return (! REG_P (src)
1419 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
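/* Illustrative examples (not from the original source): a TImode copy
   between general registers, say x0:x1 <- x2:x3, is split into two DImode
   moves (with the overlap check above picking a safe order), while a copy
   between two FP registers, q0 <- q1, is left as a single 128-bit move.  */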
1422 /* Split a complex SIMD combine. */
1424 void
1425 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1427 machine_mode src_mode = GET_MODE (src1);
1428 machine_mode dst_mode = GET_MODE (dst);
1430 gcc_assert (VECTOR_MODE_P (dst_mode));
1432 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1434 rtx (*gen) (rtx, rtx, rtx);
1436 switch (src_mode)
1438 case V8QImode:
1439 gen = gen_aarch64_simd_combinev8qi;
1440 break;
1441 case V4HImode:
1442 gen = gen_aarch64_simd_combinev4hi;
1443 break;
1444 case V2SImode:
1445 gen = gen_aarch64_simd_combinev2si;
1446 break;
1447 case V4HFmode:
1448 gen = gen_aarch64_simd_combinev4hf;
1449 break;
1450 case V2SFmode:
1451 gen = gen_aarch64_simd_combinev2sf;
1452 break;
1453 case DImode:
1454 gen = gen_aarch64_simd_combinedi;
1455 break;
1456 case DFmode:
1457 gen = gen_aarch64_simd_combinedf;
1458 break;
1459 default:
1460 gcc_unreachable ();
1463 emit_insn (gen (dst, src1, src2));
1464 return;
1468 /* Split a complex SIMD move. */
1470 void
1471 aarch64_split_simd_move (rtx dst, rtx src)
1473 machine_mode src_mode = GET_MODE (src);
1474 machine_mode dst_mode = GET_MODE (dst);
1476 gcc_assert (VECTOR_MODE_P (dst_mode));
1478 if (REG_P (dst) && REG_P (src))
1480 rtx (*gen) (rtx, rtx);
1482 gcc_assert (VECTOR_MODE_P (src_mode));
1484 switch (src_mode)
1486 case V16QImode:
1487 gen = gen_aarch64_split_simd_movv16qi;
1488 break;
1489 case V8HImode:
1490 gen = gen_aarch64_split_simd_movv8hi;
1491 break;
1492 case V4SImode:
1493 gen = gen_aarch64_split_simd_movv4si;
1494 break;
1495 case V2DImode:
1496 gen = gen_aarch64_split_simd_movv2di;
1497 break;
1498 case V8HFmode:
1499 gen = gen_aarch64_split_simd_movv8hf;
1500 break;
1501 case V4SFmode:
1502 gen = gen_aarch64_split_simd_movv4sf;
1503 break;
1504 case V2DFmode:
1505 gen = gen_aarch64_split_simd_movv2df;
1506 break;
1507 default:
1508 gcc_unreachable ();
1511 emit_insn (gen (dst, src));
1512 return;
1516 bool
1517 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1518 machine_mode ymode, rtx y)
1520 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1521 gcc_assert (r != NULL);
1522 return rtx_equal_p (x, r);
1526 static rtx
1527 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1529 if (can_create_pseudo_p ())
1530 return force_reg (mode, value);
1531 else
1533 x = aarch64_emit_move (x, value);
1534 return x;
1539 static rtx
1540 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1542 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1544 rtx high;
1545 /* Load the full offset into a register. This
1546 might be improvable in the future. */
1547 high = GEN_INT (offset);
1548 offset = 0;
1549 high = aarch64_force_temporary (mode, temp, high);
1550 reg = aarch64_force_temporary (mode, temp,
1551 gen_rtx_PLUS (mode, high, reg));
1553 return plus_constant (mode, reg, offset);
1556 static int
1557 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1558 machine_mode mode)
1560 int i;
1561 unsigned HOST_WIDE_INT val, val2, mask;
1562 int one_match, zero_match;
1563 int num_insns;
1565 val = INTVAL (imm);
1567 if (aarch64_move_imm (val, mode))
1569 if (generate)
1570 emit_insn (gen_rtx_SET (dest, imm));
1571 return 1;
1574 if ((val >> 32) == 0 || mode == SImode)
1576 if (generate)
1578 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1579 if (mode == SImode)
1580 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1581 GEN_INT ((val >> 16) & 0xffff)));
1582 else
1583 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1584 GEN_INT ((val >> 16) & 0xffff)));
1586 return 2;
1589 /* Remaining cases are all for DImode. */
1591 mask = 0xffff;
1592 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1593 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1594 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1595 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1597 if (zero_match != 2 && one_match != 2)
1599 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1600 For a 64-bit bitmask try whether changing 16 bits to all ones or
1601 zeroes creates a valid bitmask. To check any repeated bitmask,
1602 try using 16 bits from the other 32-bit half of val. */
1604 for (i = 0; i < 64; i += 16, mask <<= 16)
1606 val2 = val & ~mask;
1607 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1608 break;
1609 val2 = val | mask;
1610 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1611 break;
1612 val2 = val2 & ~mask;
1613 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1614 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1615 break;
1617 if (i != 64)
1619 if (generate)
1621 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1622 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1623 GEN_INT ((val >> i) & 0xffff)));
1625 return 2;
1629 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1630 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1631 otherwise skip zero bits. */
1633 num_insns = 1;
1634 mask = 0xffff;
1635 val2 = one_match > zero_match ? ~val : val;
1636 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1638 if (generate)
1639 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1640 ? (val | ~(mask << i))
1641 : (val & (mask << i)))));
1642 for (i += 16; i < 64; i += 16)
1644 if ((val2 & (mask << i)) == 0)
1645 continue;
1646 if (generate)
1647 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1648 GEN_INT ((val >> i) & 0xffff)));
1649 num_insns ++;
1652 return num_insns;
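/* A worked example (a sketch, not from the original source): for
   val == 0x0000cafe00001234, two of the four 16-bit chunks are zero
   (zero_match == 2), so the code above emits

     mov     x0, 0x1234
     movk    x0, 0xcafe, lsl 32

   and returns 2, while 0x1234567800009abc takes a MOV plus two MOVKs.  */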
1656 void
1657 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1659 machine_mode mode = GET_MODE (dest);
1661 gcc_assert (mode == SImode || mode == DImode);
1663 /* Check on what type of symbol it is. */
1664 if (GET_CODE (imm) == SYMBOL_REF
1665 || GET_CODE (imm) == LABEL_REF
1666 || GET_CODE (imm) == CONST)
1668 rtx mem, base, offset;
1669 enum aarch64_symbol_type sty;
1671 /* If we have (const (plus symbol offset)), separate out the offset
1672 before we start classifying the symbol. */
1673 split_const (imm, &base, &offset);
1675 sty = aarch64_classify_symbol (base, offset);
1676 switch (sty)
1678 case SYMBOL_FORCE_TO_MEM:
1679 if (offset != const0_rtx
1680 && targetm.cannot_force_const_mem (mode, imm))
1682 gcc_assert (can_create_pseudo_p ());
1683 base = aarch64_force_temporary (mode, dest, base);
1684 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1685 aarch64_emit_move (dest, base);
1686 return;
1689 mem = force_const_mem (ptr_mode, imm);
1690 gcc_assert (mem);
1692 /* If we aren't generating PC relative literals, then
1693 we need to expand the literal pool access carefully.
1694 This is something that needs to be done in a number
1695 of places, so could well live as a separate function. */
1696 if (aarch64_nopcrelative_literal_loads)
1698 gcc_assert (can_create_pseudo_p ());
1699 base = gen_reg_rtx (ptr_mode);
1700 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1701 mem = gen_rtx_MEM (ptr_mode, base);
1704 if (mode != ptr_mode)
1705 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1707 emit_insn (gen_rtx_SET (dest, mem));
1709 return;
1711 case SYMBOL_SMALL_TLSGD:
1712 case SYMBOL_SMALL_TLSDESC:
1713 case SYMBOL_SMALL_TLSIE:
1714 case SYMBOL_SMALL_GOT_28K:
1715 case SYMBOL_SMALL_GOT_4G:
1716 case SYMBOL_TINY_GOT:
1717 case SYMBOL_TINY_TLSIE:
1718 if (offset != const0_rtx)
1720 gcc_assert (can_create_pseudo_p ());
1721 base = aarch64_force_temporary (mode, dest, base);
1722 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1723 aarch64_emit_move (dest, base);
1724 return;
1726 /* FALLTHRU */
1728 case SYMBOL_SMALL_ABSOLUTE:
1729 case SYMBOL_TINY_ABSOLUTE:
1730 case SYMBOL_TLSLE12:
1731 case SYMBOL_TLSLE24:
1732 case SYMBOL_TLSLE32:
1733 case SYMBOL_TLSLE48:
1734 aarch64_load_symref_appropriately (dest, imm, sty);
1735 return;
1737 default:
1738 gcc_unreachable ();
1742 if (!CONST_INT_P (imm))
1744 if (GET_CODE (imm) == HIGH)
1745 emit_insn (gen_rtx_SET (dest, imm));
1746 else
1748 rtx mem = force_const_mem (mode, imm);
1749 gcc_assert (mem);
1750 emit_insn (gen_rtx_SET (dest, mem));
1753 return;
1756 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1759 static bool
1760 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1761 tree exp ATTRIBUTE_UNUSED)
1763 /* Currently, always true. */
1764 return true;
1767 /* Implement TARGET_PASS_BY_REFERENCE. */
1769 static bool
1770 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1771 machine_mode mode,
1772 const_tree type,
1773 bool named ATTRIBUTE_UNUSED)
1775 HOST_WIDE_INT size;
1776 machine_mode dummymode;
1777 int nregs;
1779 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1780 size = (mode == BLKmode && type)
1781 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1783 /* Aggregates are passed by reference based on their size. */
1784 if (type && AGGREGATE_TYPE_P (type))
1786 size = int_size_in_bytes (type);
1789 /* Variable sized arguments are always passed by reference.  */
1790 if (size < 0)
1791 return true;
1793 /* Can this be a candidate to be passed in fp/simd register(s)? */
1794 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1795 &dummymode, &nregs,
1796 NULL))
1797 return false;
1799 /* Arguments which are variable sized or larger than 2 registers are
1800 passed by reference unless they are a homogeneous floating-point
1801 aggregate. */
1802 return size > 2 * UNITS_PER_WORD;
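/* Illustrative outcomes (not from the original source), assuming LP64:
   a 24-byte struct is passed by reference; a 16-byte struct is passed by
   value in two registers; and struct { double d[4]; } is an HFA, so it is
   passed by value in four FP registers rather than by reference.  */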
1805 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1806 static bool
1807 aarch64_return_in_msb (const_tree valtype)
1809 machine_mode dummy_mode;
1810 int dummy_int;
1812 /* Never happens in little-endian mode. */
1813 if (!BYTES_BIG_ENDIAN)
1814 return false;
1816 /* Only composite types smaller than or equal to 16 bytes can
1817 be potentially returned in registers. */
1818 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1819 || int_size_in_bytes (valtype) <= 0
1820 || int_size_in_bytes (valtype) > 16)
1821 return false;
1823 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1824 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1825 is always passed/returned in the least significant bits of fp/simd
1826 register(s). */
1827 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1828 &dummy_mode, &dummy_int, NULL))
1829 return false;
1831 return true;
1834 /* Implement TARGET_FUNCTION_VALUE.
1835 Define how to find the value returned by a function. */
1837 static rtx
1838 aarch64_function_value (const_tree type, const_tree func,
1839 bool outgoing ATTRIBUTE_UNUSED)
1841 machine_mode mode;
1842 int unsignedp;
1843 int count;
1844 machine_mode ag_mode;
1846 mode = TYPE_MODE (type);
1847 if (INTEGRAL_TYPE_P (type))
1848 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1850 if (aarch64_return_in_msb (type))
1852 HOST_WIDE_INT size = int_size_in_bytes (type);
1854 if (size % UNITS_PER_WORD != 0)
1856 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1857 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1861 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1862 &ag_mode, &count, NULL))
1864 if (!aarch64_composite_type_p (type, mode))
1866 gcc_assert (count == 1 && mode == ag_mode);
1867 return gen_rtx_REG (mode, V0_REGNUM);
1869 else
1871 int i;
1872 rtx par;
1874 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1875 for (i = 0; i < count; i++)
1877 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1878 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1879 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1880 XVECEXP (par, 0, i) = tmp;
1882 return par;
1885 else
1886 return gen_rtx_REG (mode, R0_REGNUM);
1889 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1890 Return true if REGNO is the number of a hard register in which the values
1891 of called function may come back. */
1893 static bool
1894 aarch64_function_value_regno_p (const unsigned int regno)
1896 /* Maximum of 16 bytes can be returned in the general registers. Examples
1897 of 16-byte return values are: 128-bit integers and 16-byte small
1898 structures (excluding homogeneous floating-point aggregates). */
1899 if (regno == R0_REGNUM || regno == R1_REGNUM)
1900 return true;
1902 /* Up to four fp/simd registers can return a function value, e.g. a
1903 homogeneous floating-point aggregate having four members. */
1904 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1905 return TARGET_FLOAT;
1907 return false;
1910 /* Implement TARGET_RETURN_IN_MEMORY.
1912 If the type T of the result of a function is such that
1913 void func (T arg)
1914 would require that arg be passed as a value in a register (or set of
1915 registers) according to the parameter passing rules, then the result
1916 is returned in the same registers as would be used for such an
1917 argument. */
1919 static bool
1920 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1922 HOST_WIDE_INT size;
1923 machine_mode ag_mode;
1924 int count;
1926 if (!AGGREGATE_TYPE_P (type)
1927 && TREE_CODE (type) != COMPLEX_TYPE
1928 && TREE_CODE (type) != VECTOR_TYPE)
1929 /* Simple scalar types are always returned in registers.  */
1930 return false;
1932 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1933 type,
1934 &ag_mode,
1935 &count,
1936 NULL))
1937 return false;
1939 /* Types larger than 2 registers are returned in memory.  */
1940 size = int_size_in_bytes (type);
1941 return (size < 0 || size > 2 * UNITS_PER_WORD);
1944 static bool
1945 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1946 const_tree type, int *nregs)
1948 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1949 return aarch64_vfp_is_call_or_return_candidate (mode,
1950 type,
1951 &pcum->aapcs_vfp_rmode,
1952 nregs,
1953 NULL);
1956 /* Given MODE and TYPE of a function argument, return the alignment in
1957 bits. The idea is to suppress any stronger alignment requested by
1958 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1959 This is a helper function for local use only. */
1961 static unsigned int
1962 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1964 unsigned int alignment;
1966 if (type)
1968 if (!integer_zerop (TYPE_SIZE (type)))
1970 if (TYPE_MODE (type) == mode)
1971 alignment = TYPE_ALIGN (type);
1972 else
1973 alignment = GET_MODE_ALIGNMENT (mode);
1975 else
1976 alignment = 0;
1978 else
1979 alignment = GET_MODE_ALIGNMENT (mode);
1981 return alignment;
1984 /* Layout a function argument according to the AAPCS64 rules. The rule
1985 numbers refer to the rule numbers in the AAPCS64. */
1987 static void
1988 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1989 const_tree type,
1990 bool named ATTRIBUTE_UNUSED)
1992 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1993 int ncrn, nvrn, nregs;
1994 bool allocate_ncrn, allocate_nvrn;
1995 HOST_WIDE_INT size;
1997 /* We need to do this once per argument. */
1998 if (pcum->aapcs_arg_processed)
1999 return;
2001 pcum->aapcs_arg_processed = true;
2003 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2004 size
2005 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2006 UNITS_PER_WORD);
2008 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2009 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2010 mode,
2011 type,
2012 &nregs);
2014 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2015 The following code thus handles passing by SIMD/FP registers first. */
2017 nvrn = pcum->aapcs_nvrn;
2019 /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
2020 and homogeneous short-vector aggregates (HVA).  */
2021 if (allocate_nvrn)
2023 if (!TARGET_FLOAT)
2024 aarch64_err_no_fpadvsimd (mode, "argument");
2026 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2028 pcum->aapcs_nextnvrn = nvrn + nregs;
2029 if (!aarch64_composite_type_p (type, mode))
2031 gcc_assert (nregs == 1);
2032 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2034 else
2036 rtx par;
2037 int i;
2038 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2039 for (i = 0; i < nregs; i++)
2041 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2042 V0_REGNUM + nvrn + i);
2043 tmp = gen_rtx_EXPR_LIST
2044 (VOIDmode, tmp,
2045 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2046 XVECEXP (par, 0, i) = tmp;
2048 pcum->aapcs_reg = par;
2050 return;
2052 else
2054 /* C.3 NSRN is set to 8. */
2055 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2056 goto on_stack;
2060 ncrn = pcum->aapcs_ncrn;
2061 nregs = size / UNITS_PER_WORD;
2063 /* C.6 - C.9, though the sign and zero extension semantics are
2064 handled elsewhere.  This is the case where the argument fits
2065 entirely in general registers.  */
2066 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2068 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2070 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2072 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
2073 rounded up to the next even number. */
2074 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2076 ++ncrn;
2077 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2079 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2080 A reg is still generated for it, but the caller should be smart
2081 enough not to use it. */
2082 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2084 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2086 else
2088 rtx par;
2089 int i;
2091 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2092 for (i = 0; i < nregs; i++)
2094 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2095 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2096 GEN_INT (i * UNITS_PER_WORD));
2097 XVECEXP (par, 0, i) = tmp;
2099 pcum->aapcs_reg = par;
2102 pcum->aapcs_nextncrn = ncrn + nregs;
2103 return;
2106 /* C.11 */
2107 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2109 /* The argument is passed on stack; record the needed number of words for
2110 this argument and align the total size if necessary. */
2111 on_stack:
2112 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2113 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2114 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2115 16 / UNITS_PER_WORD);
2116 return;
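/* As a rough illustration of the rules above (assuming the LP64 ABI,
   where UNITS_PER_WORD is 8 and NUM_ARG_REGS is 8), consider the
   hypothetical call

     void f (int a, __int128 b, struct twelve_bytes c);

   where struct twelve_bytes is a made-up 12-byte aggregate with no
   special alignment.  'a' is allocated to w0, leaving NGRN == 1.
   'b' is 16 bytes with 16-byte alignment, so nregs == 2 and rule C.8
   rounds the odd NGRN up to 2; 'b' occupies x2/x3.  'c' has
   int_size_in_bytes of 12, which ROUND_UP pads to 16 bytes
   (nregs == 2), so it is passed in x4/x5.  */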
2119 /* Implement TARGET_FUNCTION_ARG. */
2121 static rtx
2122 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2123 const_tree type, bool named)
2125 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2126 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2128 if (mode == VOIDmode)
2129 return NULL_RTX;
2131 aarch64_layout_arg (pcum_v, mode, type, named);
2132 return pcum->aapcs_reg;
2135 void
2136 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2137 const_tree fntype ATTRIBUTE_UNUSED,
2138 rtx libname ATTRIBUTE_UNUSED,
2139 const_tree fndecl ATTRIBUTE_UNUSED,
2140 unsigned n_named ATTRIBUTE_UNUSED)
2142 pcum->aapcs_ncrn = 0;
2143 pcum->aapcs_nvrn = 0;
2144 pcum->aapcs_nextncrn = 0;
2145 pcum->aapcs_nextnvrn = 0;
2146 pcum->pcs_variant = ARM_PCS_AAPCS64;
2147 pcum->aapcs_reg = NULL_RTX;
2148 pcum->aapcs_arg_processed = false;
2149 pcum->aapcs_stack_words = 0;
2150 pcum->aapcs_stack_size = 0;
2152 if (!TARGET_FLOAT
2153 && fndecl && TREE_PUBLIC (fndecl)
2154 && fntype && fntype != error_mark_node)
2156 const_tree type = TREE_TYPE (fntype);
2157 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2158 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2159 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2160 &mode, &nregs, NULL))
2161 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2163 return;
2166 static void
2167 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2168 machine_mode mode,
2169 const_tree type,
2170 bool named)
2172 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2173 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2175 aarch64_layout_arg (pcum_v, mode, type, named);
2176 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2177 != (pcum->aapcs_stack_words != 0));
2178 pcum->aapcs_arg_processed = false;
2179 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2180 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2181 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2182 pcum->aapcs_stack_words = 0;
2183 pcum->aapcs_reg = NULL_RTX;
2187 bool
2188 aarch64_function_arg_regno_p (unsigned regno)
2190 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2191 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2194 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2195 PARM_BOUNDARY bits of alignment, but will be given anything up
2196 to STACK_BOUNDARY bits if the type requires it. This makes sure
2197 that both before and after the layout of each argument, the Next
2198 Stacked Argument Address (NSAA) will have a minimum alignment of
2199 8 bytes. */
2201 static unsigned int
2202 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2204 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2206 if (alignment < PARM_BOUNDARY)
2207 alignment = PARM_BOUNDARY;
2208 if (alignment > STACK_BOUNDARY)
2209 alignment = STACK_BOUNDARY;
2210 return alignment;
2213 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2215 Return true if an argument passed on the stack should be padded upwards,
2216 i.e. if the least-significant byte of the stack slot has useful data.
2218 Small aggregate types are placed in the lowest memory address.
2220 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2222 bool
2223 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2225 /* On little-endian targets, the least significant byte of every stack
2226 argument is passed at the lowest byte address of the stack slot. */
2227 if (!BYTES_BIG_ENDIAN)
2228 return true;
2230 /* Otherwise, integral, floating-point and pointer types are padded downward:
2231 the least significant byte of a stack argument is passed at the highest
2232 byte address of the stack slot. */
2233 if (type
2234 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2235 || POINTER_TYPE_P (type))
2236 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2237 return false;
2239 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2240 return true;
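/* A rough illustration for a big-endian target, showing an 8-byte
   stack slot from its lowest to its highest byte address:

     struct { char c[3]; }  :  c0 c1 c2 -- -- -- -- --   (padded upward)
     short                  :  -- -- -- -- -- -- hi lo   (padded downward)

   On a little-endian target both would be padded upward.  */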
2243 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2245 It specifies padding for the last (and possibly the only)
2246 element of a block move between registers and memory. Assuming
2247 the block is in memory, padding upward means that the last
2248 element is padded after its most significant byte, while with
2249 downward padding the last element is padded on its least
2250 significant byte side.
2252 Small aggregates and small complex types are always padded
2253 upwards.
2255 We don't need to worry about homogeneous floating-point or
2256 short-vector aggregates; their move is not affected by the
2257 padding direction determined here. Regardless of endianness,
2258 each element of such an aggregate is put in the least
2259 significant bits of a fp/simd register.
2261 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2262 register has useful data, and return the opposite if the most
2263 significant byte does. */
2265 bool
2266 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2267 bool first ATTRIBUTE_UNUSED)
2270 /* Small composite types are always padded upward. */
2271 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2273 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2274 : GET_MODE_SIZE (mode));
2275 if (size < 2 * UNITS_PER_WORD)
2276 return true;
2279 /* Otherwise, use the default padding. */
2280 return !BYTES_BIG_ENDIAN;
2283 static machine_mode
2284 aarch64_libgcc_cmp_return_mode (void)
2286 return SImode;
2289 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2291 /* We use the 12-bit shifted immediate arithmetic instructions so values
2292 must be multiple of (1 << 12), i.e. 4096. */
2293 #define ARITH_FACTOR 4096
2295 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2296 #error Cannot use simple address calculation for stack probing
2297 #endif
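/* With the generic default of STACK_CHECK_PROBE_INTERVAL_EXP == 12,
   PROBE_INTERVAL == 4096 == ARITH_FACTOR, so the check above reduces
   to 4096 % 4096 == 0.  */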
2299 /* The pair of scratch registers used for stack probing. */
2300 #define PROBE_STACK_FIRST_REG 9
2301 #define PROBE_STACK_SECOND_REG 10
2303 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2304 inclusive. These are offsets from the current stack pointer. */
2306 static void
2307 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2309 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2311 /* See the same assertion on PROBE_INTERVAL above. */
2312 gcc_assert ((first % ARITH_FACTOR) == 0);
2314 /* See if we have a constant small number of probes to generate. If so,
2315 that's the easy case. */
2316 if (size <= PROBE_INTERVAL)
2318 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2320 emit_set_insn (reg1,
2321 plus_constant (ptr_mode,
2322 stack_pointer_rtx, -(first + base)));
2323 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2326 /* The run-time loop is made up of 8 insns in the generic case while the
2327 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2328 else if (size <= 4 * PROBE_INTERVAL)
2330 HOST_WIDE_INT i, rem;
2332 emit_set_insn (reg1,
2333 plus_constant (ptr_mode,
2334 stack_pointer_rtx,
2335 -(first + PROBE_INTERVAL)));
2336 emit_stack_probe (reg1);
2338 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2339 it exceeds SIZE. If only two probes are needed, this will not
2340 generate any code. Then probe at FIRST + SIZE. */
2341 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2343 emit_set_insn (reg1,
2344 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2345 emit_stack_probe (reg1);
2348 rem = size - (i - PROBE_INTERVAL);
2349 if (rem > 256)
2351 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2353 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2354 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2356 else
2357 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2360 /* Otherwise, do the same as above, but in a loop. Note that we must be
2361 extra careful with variables wrapping around because we might be at
2362 the very top (or the very bottom) of the address space and we have
2363 to be able to handle this case properly; in particular, we use an
2364 equality test for the loop condition. */
2365 else
2367 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2369 /* Step 1: round SIZE to the previous multiple of the interval. */
2371 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2374 /* Step 2: compute initial and final value of the loop counter. */
2376 /* TEST_ADDR = SP + FIRST. */
2377 emit_set_insn (reg1,
2378 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2380 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2381 emit_set_insn (reg2,
2382 plus_constant (ptr_mode, stack_pointer_rtx,
2383 -(first + rounded_size)));
2386 /* Step 3: the loop
2390 do { TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2391 probe at TEST_ADDR
2393 } while (TEST_ADDR != LAST_ADDR)
2395 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2396 until it is equal to ROUNDED_SIZE. */
2398 if (ptr_mode == DImode)
2399 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2400 else
2401 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2404 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2405 that SIZE is equal to ROUNDED_SIZE. */
2407 if (size != rounded_size)
2409 HOST_WIDE_INT rem = size - rounded_size;
2411 if (rem > 256)
2413 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2415 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2416 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2418 else
2419 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2423 /* Make sure nothing is scheduled before we are done. */
2424 emit_insn (gen_blockage ());
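/* A worked example of the middle case above, assuming
   PROBE_INTERVAL == 4096: for FIRST == 4096 and SIZE == 12588 the
   probes land at SP - 8192, SP - 12288 and SP - 16384, and since the
   remainder 12588 - 12288 == 300 exceeds 256, a final probe is
   emitted at SP - 16684, i.e. exactly FIRST + SIZE below the
   incoming stack pointer.  */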
2427 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2428 absolute addresses. */
2430 const char *
2431 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2433 static int labelno = 0;
2434 char loop_lab[32];
2435 rtx xops[2];
2437 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2439 /* Loop. */
2440 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2442 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2443 xops[0] = reg1;
2444 xops[1] = GEN_INT (PROBE_INTERVAL);
2445 output_asm_insn ("sub\t%0, %0, %1", xops);
2447 /* Probe at TEST_ADDR. */
2448 output_asm_insn ("str\txzr, [%0]", xops);
2450 /* Test if TEST_ADDR == LAST_ADDR. */
2451 xops[1] = reg2;
2452 output_asm_insn ("cmp\t%0, %1", xops);
2454 /* Branch. */
2455 fputs ("\tb.ne\t", asm_out_file);
2456 assemble_name_raw (asm_out_file, loop_lab);
2457 fputc ('\n', asm_out_file);
2459 return "";
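/* With the 4096-byte probe interval and the x9/x10 scratch registers
   chosen above, the emitted loop looks roughly like:

	.LPSRL0:
	sub	x9, x9, #4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0
*/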
2462 static bool
2463 aarch64_frame_pointer_required (void)
2465 /* In aarch64_override_options_after_change
2466 flag_omit_leaf_frame_pointer turns off the frame pointer by
2467 default. Turn it back on now if we've not got a leaf
2468 function. */
2469 if (flag_omit_leaf_frame_pointer
2470 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2471 return true;
2473 return false;
2476 /* Mark the registers that need to be saved by the callee and calculate
2477 the size of the callee-saved registers area and frame record (both FP
2478 and LR may be omitted). */
2479 static void
2480 aarch64_layout_frame (void)
2482 HOST_WIDE_INT offset = 0;
2483 int regno;
2485 if (reload_completed && cfun->machine->frame.laid_out)
2486 return;
2488 #define SLOT_NOT_REQUIRED (-2)
2489 #define SLOT_REQUIRED (-1)
2491 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2492 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2494 /* First mark all the registers that really need to be saved... */
2495 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2496 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2498 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2499 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2501 /* ... that includes the eh data registers (if needed)... */
2502 if (crtl->calls_eh_return)
2503 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2504 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2505 = SLOT_REQUIRED;
2507 /* ... and any callee saved register that dataflow says is live. */
2508 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2509 if (df_regs_ever_live_p (regno)
2510 && (regno == R30_REGNUM
2511 || !call_used_regs[regno]))
2512 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2514 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2515 if (df_regs_ever_live_p (regno)
2516 && !call_used_regs[regno])
2517 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2519 if (frame_pointer_needed)
2521 /* FP and LR are placed in the linkage record. */
2522 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2523 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2524 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2525 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2526 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2527 offset += 2 * UNITS_PER_WORD;
2530 /* Now assign stack slots for them. */
2531 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2532 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2534 cfun->machine->frame.reg_offset[regno] = offset;
2535 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2536 cfun->machine->frame.wb_candidate1 = regno;
2537 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2538 cfun->machine->frame.wb_candidate2 = regno;
2539 offset += UNITS_PER_WORD;
2542 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2543 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2545 cfun->machine->frame.reg_offset[regno] = offset;
2546 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2547 cfun->machine->frame.wb_candidate1 = regno;
2548 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2549 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2550 cfun->machine->frame.wb_candidate2 = regno;
2551 offset += UNITS_PER_WORD;
2554 cfun->machine->frame.padding0 =
2555 (ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2556 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2558 cfun->machine->frame.saved_regs_size = offset;
2560 cfun->machine->frame.hard_fp_offset
2561 = ROUND_UP (cfun->machine->frame.saved_varargs_size
2562 + get_frame_size ()
2563 + cfun->machine->frame.saved_regs_size,
2564 STACK_BOUNDARY / BITS_PER_UNIT);
2566 cfun->machine->frame.frame_size
2567 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2568 + crtl->outgoing_args_size,
2569 STACK_BOUNDARY / BITS_PER_UNIT);
2571 cfun->machine->frame.laid_out = true;
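/* A worked example of the layout above: suppose the frame pointer is
   needed, x19, x20 and d8 must be saved, there are 32 bytes of local
   variables, and there is no varargs save area and no outgoing
   argument area.  Then:

     reg_offset[x29] = 0    reg_offset[x30] = 8
     reg_offset[x19] = 16   reg_offset[x20] = 24
     reg_offset[d8]  = 32   padding0        = 8
     saved_regs_size = 48
     hard_fp_offset  = ROUND_UP (0 + 32 + 48, 16) = 80
     frame_size      = ROUND_UP (80 + 0, 16)      = 80

   and the write-back candidates are x29/x30, which form the frame
   record.  */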
2574 static bool
2575 aarch64_register_saved_on_entry (int regno)
2577 return cfun->machine->frame.reg_offset[regno] >= 0;
2580 static unsigned
2581 aarch64_next_callee_save (unsigned regno, unsigned limit)
2583 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2584 regno ++;
2585 return regno;
2588 static void
2589 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2590 HOST_WIDE_INT adjustment)
2592 rtx base_rtx = stack_pointer_rtx;
2593 rtx insn, reg, mem;
2595 reg = gen_rtx_REG (mode, regno);
2596 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2597 plus_constant (Pmode, base_rtx, -adjustment));
2598 mem = gen_rtx_MEM (mode, mem);
2600 insn = emit_move_insn (mem, reg);
2601 RTX_FRAME_RELATED_P (insn) = 1;
2604 static rtx
2605 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2606 HOST_WIDE_INT adjustment)
2608 switch (mode)
2610 case DImode:
2611 return gen_storewb_pairdi_di (base, base, reg, reg2,
2612 GEN_INT (-adjustment),
2613 GEN_INT (UNITS_PER_WORD - adjustment));
2614 case DFmode:
2615 return gen_storewb_pairdf_di (base, base, reg, reg2,
2616 GEN_INT (-adjustment),
2617 GEN_INT (UNITS_PER_WORD - adjustment));
2618 default:
2619 gcc_unreachable ();
2623 static void
2624 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2625 unsigned regno2, HOST_WIDE_INT adjustment)
2627 rtx_insn *insn;
2628 rtx reg1 = gen_rtx_REG (mode, regno1);
2629 rtx reg2 = gen_rtx_REG (mode, regno2);
2631 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2632 reg2, adjustment));
2633 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2634 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2635 RTX_FRAME_RELATED_P (insn) = 1;
2638 static rtx
2639 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2640 HOST_WIDE_INT adjustment)
2642 switch (mode)
2644 case DImode:
2645 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2646 GEN_INT (UNITS_PER_WORD));
2647 case DFmode:
2648 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2649 GEN_INT (UNITS_PER_WORD));
2650 default:
2651 gcc_unreachable ();
2655 static rtx
2656 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2657 rtx reg2)
2659 switch (mode)
2661 case DImode:
2662 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2664 case DFmode:
2665 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2667 default:
2668 gcc_unreachable ();
2672 static rtx
2673 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2674 rtx mem2)
2676 switch (mode)
2678 case DImode:
2679 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2681 case DFmode:
2682 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2684 default:
2685 gcc_unreachable ();
2690 static void
2691 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2692 unsigned start, unsigned limit, bool skip_wb)
2694 rtx_insn *insn;
2695 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2696 ? gen_frame_mem : gen_rtx_MEM);
2697 unsigned regno;
2698 unsigned regno2;
2700 for (regno = aarch64_next_callee_save (start, limit);
2701 regno <= limit;
2702 regno = aarch64_next_callee_save (regno + 1, limit))
2704 rtx reg, mem;
2705 HOST_WIDE_INT offset;
2707 if (skip_wb
2708 && (regno == cfun->machine->frame.wb_candidate1
2709 || regno == cfun->machine->frame.wb_candidate2))
2710 continue;
2712 reg = gen_rtx_REG (mode, regno);
2713 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2714 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2715 offset));
2717 regno2 = aarch64_next_callee_save (regno + 1, limit);
2719 if (regno2 <= limit
2720 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2721 == cfun->machine->frame.reg_offset[regno2]))
2724 rtx reg2 = gen_rtx_REG (mode, regno2);
2725 rtx mem2;
2727 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2728 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2729 offset));
2730 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2731 reg2));
2733 /* The first part of a frame-related parallel insn is
2734 always assumed to be relevant to the frame
2735 calculations; subsequent parts, are only
2736 frame-related if explicitly marked. */
2737 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2738 regno = regno2;
2740 else
2741 insn = emit_move_insn (mem, reg);
2743 RTX_FRAME_RELATED_P (insn) = 1;
2747 static void
2748 aarch64_restore_callee_saves (machine_mode mode,
2749 HOST_WIDE_INT start_offset, unsigned start,
2750 unsigned limit, bool skip_wb, rtx *cfi_ops)
2752 rtx base_rtx = stack_pointer_rtx;
2753 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2754 ? gen_frame_mem : gen_rtx_MEM);
2755 unsigned regno;
2756 unsigned regno2;
2757 HOST_WIDE_INT offset;
2759 for (regno = aarch64_next_callee_save (start, limit);
2760 regno <= limit;
2761 regno = aarch64_next_callee_save (regno + 1, limit))
2763 rtx reg, mem;
2765 if (skip_wb
2766 && (regno == cfun->machine->frame.wb_candidate1
2767 || regno == cfun->machine->frame.wb_candidate2))
2768 continue;
2770 reg = gen_rtx_REG (mode, regno);
2771 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2772 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2774 regno2 = aarch64_next_callee_save (regno + 1, limit);
2776 if (regno2 <= limit
2777 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2778 == cfun->machine->frame.reg_offset[regno2]))
2780 rtx reg2 = gen_rtx_REG (mode, regno2);
2781 rtx mem2;
2783 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2784 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2785 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2787 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2788 regno = regno2;
2790 else
2791 emit_move_insn (reg, mem);
2792 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2796 /* AArch64 stack frames generated by this compiler look like:
2798 +-------------------------------+
2800 | incoming stack arguments |
2802 +-------------------------------+
2803 | | <-- incoming stack pointer (aligned)
2804 | callee-allocated save area |
2805 | for register varargs |
2807 +-------------------------------+
2808 | local variables | <-- frame_pointer_rtx
2810 +-------------------------------+
2811 | padding0 | \
2812 +-------------------------------+ |
2813 | callee-saved registers | | frame.saved_regs_size
2814 +-------------------------------+ |
2815 | LR' | |
2816 +-------------------------------+ |
2817 | FP' | / <- hard_frame_pointer_rtx (aligned)
2818 +-------------------------------+
2819 | dynamic allocation |
2820 +-------------------------------+
2821 | padding |
2822 +-------------------------------+
2823 | outgoing stack arguments | <-- arg_pointer
2825 +-------------------------------+
2826 | | <-- stack_pointer_rtx (aligned)
2828 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2829 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2830 unchanged. */
2832 /* Generate the prologue instructions for entry into a function.
2833 Establish the stack frame by decreasing the stack pointer with a
2834 properly calculated size and, if necessary, create a frame record
2835 filled with the values of LR and previous frame pointer. The
2836 current FP is also set up if it is in use. */
2838 void
2839 aarch64_expand_prologue (void)
2841 /* sub sp, sp, #<frame_size>
2842 stp {fp, lr}, [sp, #<frame_size> - 16]
2843 add fp, sp, #<frame_size> - hardfp_offset
2844 stp {cs_reg}, [fp, #-16] etc.
2846 sub sp, sp, <final_adjustment_if_any> */
2848 HOST_WIDE_INT frame_size, offset;
2849 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2850 HOST_WIDE_INT hard_fp_offset;
2851 rtx_insn *insn;
2853 aarch64_layout_frame ();
2855 offset = frame_size = cfun->machine->frame.frame_size;
2856 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2857 fp_offset = frame_size - hard_fp_offset;
2859 if (flag_stack_usage_info)
2860 current_function_static_stack_size = frame_size;
2862 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
2864 if (crtl->is_leaf && !cfun->calls_alloca)
2866 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
2867 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
2868 frame_size - STACK_CHECK_PROTECT);
2870 else if (frame_size > 0)
2871 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
2874 /* Store pairs and load pairs have an offset range of only -512 to 504. */
2875 if (offset >= 512)
2877 /* When the frame has a large size, an initial decrease is done on
2878 the stack pointer to jump over the callee-allocated save area for
2879 register varargs, the local variable area and/or the callee-saved
2880 register area. This will allow the pre-index write-back
2881 store pair instructions to be used for setting up the stack frame
2882 efficiently. */
2883 offset = hard_fp_offset;
2884 if (offset >= 512)
2885 offset = cfun->machine->frame.saved_regs_size;
2887 frame_size -= (offset + crtl->outgoing_args_size);
2888 fp_offset = 0;
2890 if (frame_size >= 0x1000000)
2892 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2893 emit_move_insn (op0, GEN_INT (-frame_size));
2894 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2896 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2897 gen_rtx_SET (stack_pointer_rtx,
2898 plus_constant (Pmode, stack_pointer_rtx,
2899 -frame_size)));
2900 RTX_FRAME_RELATED_P (insn) = 1;
2902 else if (frame_size > 0)
2904 int hi_ofs = frame_size & 0xfff000;
2905 int lo_ofs = frame_size & 0x000fff;
2907 if (hi_ofs)
2909 insn = emit_insn (gen_add2_insn
2910 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2911 RTX_FRAME_RELATED_P (insn) = 1;
2913 if (lo_ofs)
2915 insn = emit_insn (gen_add2_insn
2916 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2917 RTX_FRAME_RELATED_P (insn) = 1;
2921 else
2922 frame_size = -1;
2924 if (offset > 0)
2926 bool skip_wb = false;
2928 if (frame_pointer_needed)
2930 skip_wb = true;
2932 if (fp_offset)
2934 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2935 GEN_INT (-offset)));
2936 RTX_FRAME_RELATED_P (insn) = 1;
2938 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2939 R30_REGNUM, false);
2941 else
2942 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2944 /* Set up frame pointer to point to the location of the
2945 previous frame pointer on the stack. */
2946 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2947 stack_pointer_rtx,
2948 GEN_INT (fp_offset)));
2949 RTX_FRAME_RELATED_P (insn) = 1;
2950 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2952 else
2954 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2955 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2957 if (fp_offset
2958 || reg1 == FIRST_PSEUDO_REGISTER
2959 || (reg2 == FIRST_PSEUDO_REGISTER
2960 && offset >= 256))
2962 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2963 GEN_INT (-offset)));
2964 RTX_FRAME_RELATED_P (insn) = 1;
2966 else
2968 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2970 skip_wb = true;
2972 if (reg2 == FIRST_PSEUDO_REGISTER)
2973 aarch64_pushwb_single_reg (mode1, reg1, offset);
2974 else
2975 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2979 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2980 skip_wb);
2981 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2982 skip_wb);
2985 /* when offset >= 512,
2986 sub sp, sp, #<outgoing_args_size> */
2987 if (frame_size > -1)
2989 if (crtl->outgoing_args_size > 0)
2991 insn = emit_insn (gen_add2_insn
2992 (stack_pointer_rtx,
2993 GEN_INT (- crtl->outgoing_args_size)));
2994 RTX_FRAME_RELATED_P (insn) = 1;
2999 /* Return TRUE if we can use a simple_return insn.
3001 This function checks whether the callee-saved stack is empty, which
3002 means no restore actions are needed. The pro_and_epilogue pass will
3003 use this to check whether the shrink-wrapping optimization is feasible. */
3005 bool
3006 aarch64_use_return_insn_p (void)
3008 if (!reload_completed)
3009 return false;
3011 if (crtl->profile)
3012 return false;
3014 aarch64_layout_frame ();
3016 return cfun->machine->frame.frame_size == 0;
3019 /* Generate the epilogue instructions for returning from a function. */
3020 void
3021 aarch64_expand_epilogue (bool for_sibcall)
3023 HOST_WIDE_INT frame_size, offset;
3024 HOST_WIDE_INT fp_offset;
3025 HOST_WIDE_INT hard_fp_offset;
3026 rtx_insn *insn;
3027 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3028 bool need_barrier_p = (get_frame_size () != 0
3029 || cfun->machine->frame.saved_varargs_size);
3031 aarch64_layout_frame ();
3033 offset = frame_size = cfun->machine->frame.frame_size;
3034 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
3035 fp_offset = frame_size - hard_fp_offset;
3037 /* Store pairs and load pairs have an offset range of only -512 to 504. */
3038 if (offset >= 512)
3040 offset = hard_fp_offset;
3041 if (offset >= 512)
3042 offset = cfun->machine->frame.saved_regs_size;
3044 frame_size -= (offset + crtl->outgoing_args_size);
3045 fp_offset = 0;
3046 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
3048 insn = emit_insn (gen_add2_insn
3049 (stack_pointer_rtx,
3050 GEN_INT (crtl->outgoing_args_size)));
3051 RTX_FRAME_RELATED_P (insn) = 1;
3054 else
3055 frame_size = -1;
3057 /* If there were outgoing arguments or we've done dynamic stack
3058 allocation, then restore the stack pointer from the frame
3059 pointer. This is at most one insn and more efficient than using
3060 GCC's internal mechanism. */
3061 if (frame_pointer_needed
3062 && (crtl->outgoing_args_size || cfun->calls_alloca))
3064 if (cfun->calls_alloca)
3065 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3067 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3068 hard_frame_pointer_rtx,
3069 GEN_INT (0)));
3070 offset = offset - fp_offset;
3073 if (offset > 0)
3075 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3076 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3077 bool skip_wb = true;
3078 rtx cfi_ops = NULL;
3080 if (frame_pointer_needed)
3081 fp_offset = 0;
3082 else if (fp_offset
3083 || reg1 == FIRST_PSEUDO_REGISTER
3084 || (reg2 == FIRST_PSEUDO_REGISTER
3085 && offset >= 256))
3086 skip_wb = false;
3088 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
3089 skip_wb, &cfi_ops);
3090 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
3091 skip_wb, &cfi_ops);
3093 if (need_barrier_p)
3094 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3096 if (skip_wb)
3098 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
3099 rtx rreg1 = gen_rtx_REG (mode1, reg1);
3101 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
3102 if (reg2 == FIRST_PSEUDO_REGISTER)
3104 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
3105 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3106 mem = gen_rtx_MEM (mode1, mem);
3107 insn = emit_move_insn (rreg1, mem);
3109 else
3111 rtx rreg2 = gen_rtx_REG (mode1, reg2);
3113 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
3114 insn = emit_insn (aarch64_gen_loadwb_pair
3115 (mode1, stack_pointer_rtx, rreg1,
3116 rreg2, offset));
3119 else
3121 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
3122 GEN_INT (offset)));
3125 /* Reset the CFA to be SP + FRAME_SIZE. */
3126 rtx new_cfa = stack_pointer_rtx;
3127 if (frame_size > 0)
3128 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
3129 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3130 REG_NOTES (insn) = cfi_ops;
3131 RTX_FRAME_RELATED_P (insn) = 1;
3134 if (frame_size > 0)
3136 if (need_barrier_p)
3137 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3139 if (frame_size >= 0x1000000)
3141 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3142 emit_move_insn (op0, GEN_INT (frame_size));
3143 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
3145 else
3147 int hi_ofs = frame_size & 0xfff000;
3148 int lo_ofs = frame_size & 0x000fff;
3150 if (hi_ofs && lo_ofs)
3152 insn = emit_insn (gen_add2_insn
3153 (stack_pointer_rtx, GEN_INT (hi_ofs)));
3154 RTX_FRAME_RELATED_P (insn) = 1;
3155 frame_size = lo_ofs;
3157 insn = emit_insn (gen_add2_insn
3158 (stack_pointer_rtx, GEN_INT (frame_size)));
3161 /* Reset the CFA to be SP + 0. */
3162 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
3163 RTX_FRAME_RELATED_P (insn) = 1;
3166 /* Stack adjustment for exception handler. */
3167 if (crtl->calls_eh_return)
3169 /* We need to unwind the stack by the offset computed by
3170 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3171 to be SP; letting the CFA move during this adjustment
3172 is just as correct as retaining the CFA from the body
3173 of the function. Therefore, do nothing special. */
3174 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3177 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3178 if (!for_sibcall)
3179 emit_jump_insn (ret_rtx);
3182 /* Return the place to copy the exception unwinding return address to.
3183 This will probably be a stack slot, but could (in theory) be the
3184 return register. */
3186 rtx aarch64_final_eh_return_addr (void)
3188 HOST_WIDE_INT fp_offset;
3190 aarch64_layout_frame ();
3192 fp_offset = cfun->machine->frame.frame_size
3193 - cfun->machine->frame.hard_fp_offset;
3195 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
3196 return gen_rtx_REG (DImode, LR_REGNUM);
3198 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
3199 result in a store to save LR introduced by builtin_eh_return () being
3200 incorrectly deleted because the alias is not detected.
3201 So in the calculation of the address to copy the exception unwinding
3202 return address to, we note 2 cases.
3203 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
3204 we return a SP-relative location since all the addresses are SP-relative
3205 in this case. This prevents the store from being optimized away.
3206 If the fp_offset is not 0, then the addresses will be FP-relative and
3207 therefore we return a FP-relative location. */
3209 if (frame_pointer_needed)
3211 if (fp_offset)
3212 return gen_frame_mem (DImode,
3213 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3214 else
3215 return gen_frame_mem (DImode,
3216 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
3219 /* If FP is not needed, we calculate the location of LR, which would be
3220 at the top of the saved registers block. */
3222 return gen_frame_mem (DImode,
3223 plus_constant (Pmode,
3224 stack_pointer_rtx,
3225 fp_offset
3226 + cfun->machine->frame.saved_regs_size
3227 - 2 * UNITS_PER_WORD));
3230 /* Possibly output code to build up a constant in a register. For
3231 the benefit of the costs infrastructure, returns the number of
3232 instructions which would be emitted. GENERATE inhibits or
3233 enables code generation. */
3235 static int
3236 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
3238 int insns = 0;
3240 if (aarch64_bitmask_imm (val, DImode))
3242 if (generate)
3243 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
3244 insns = 1;
3246 else
3248 int i;
3249 int ncount = 0;
3250 int zcount = 0;
3251 HOST_WIDE_INT valp = val >> 16;
3252 HOST_WIDE_INT valm;
3253 HOST_WIDE_INT tval;
3255 for (i = 16; i < 64; i += 16)
3257 valm = (valp & 0xffff);
3259 if (valm != 0)
3260 ++ zcount;
3262 if (valm != 0xffff)
3263 ++ ncount;
3265 valp >>= 16;
3268 /* zcount contains the number of additional MOVK instructions
3269 required if the constant is built up with an initial MOVZ instruction,
3270 while ncount is the number of MOVK instructions required if starting
3271 with a MOVN instruction. Choose the sequence that yields the fewest
3272 instructions, preferring MOVZ instructions when both counts are
3273 the same. */
3274 if (ncount < zcount)
3276 if (generate)
3277 emit_move_insn (gen_rtx_REG (Pmode, regnum),
3278 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
3279 tval = 0xffff;
3280 insns++;
3282 else
3284 if (generate)
3285 emit_move_insn (gen_rtx_REG (Pmode, regnum),
3286 GEN_INT (val & 0xffff));
3287 tval = 0;
3288 insns++;
3291 val >>= 16;
3293 for (i = 16; i < 64; i += 16)
3295 if ((val & 0xffff) != tval)
3297 if (generate)
3298 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
3299 GEN_INT (i),
3300 GEN_INT (val & 0xffff)));
3301 insns++;
3303 val >>= 16;
3306 return insns;
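/* Two illustrative inputs for the MOVZ/MOVN selection above:

     0x12345678: the upper 16-bit chunks are 0x0000, 0x0000 and 0x1234,
       so zcount == 1 and ncount == 3; the MOVZ sequence wins and two
       insns are counted, roughly (for regnum 0)
	 movz x0, #0x5678
	 movk x0, #0x1234, lsl #16

     0xffffffffffff1234: every upper chunk is 0xffff, so ncount == 0
       and a single MOVN-style move is enough.  */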
3309 static void
3310 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
3312 HOST_WIDE_INT mdelta = delta;
3313 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
3314 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
3316 if (mdelta < 0)
3317 mdelta = -mdelta;
3319 if (mdelta >= 4096 * 4096)
3321 (void) aarch64_build_constant (scratchreg, delta, true);
3322 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
3324 else if (mdelta > 0)
3326 if (mdelta >= 4096)
3328 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
3329 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
3330 if (delta < 0)
3331 emit_insn (gen_rtx_SET (this_rtx,
3332 gen_rtx_MINUS (Pmode, this_rtx, shift)));
3333 else
3334 emit_insn (gen_rtx_SET (this_rtx,
3335 gen_rtx_PLUS (Pmode, this_rtx, shift)));
3337 if (mdelta % 4096 != 0)
3339 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
3340 emit_insn (gen_rtx_SET (this_rtx,
3341 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
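/* For example, a hypothetical aarch64_add_constant (R0_REGNUM,
   IP1_REGNUM, 10000) emits roughly

	mov	x17, #2
	add	x0, x0, x17, lsl #12	// x0 += 2 * 4096 == 8192
	add	x0, x0, #1808		// 8192 + 1808 == 10000

   while deltas of 4096 * 4096 or more are first materialized in the
   scratch register via aarch64_build_constant and then added in one
   go.  */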
3346 /* Output code to add DELTA to the first argument, and then jump
3347 to FUNCTION. Used for C++ multiple inheritance. */
3348 static void
3349 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3350 HOST_WIDE_INT delta,
3351 HOST_WIDE_INT vcall_offset,
3352 tree function)
3354 /* The this pointer is always in x0. Note that this differs from
3355 Arm where the this pointer may be bumped to r1 if r0 is required
3356 to return a pointer to an aggregate. On AArch64 a result value
3357 pointer will be in x8. */
3358 int this_regno = R0_REGNUM;
3359 rtx this_rtx, temp0, temp1, addr, funexp;
3360 rtx_insn *insn;
3362 reload_completed = 1;
3363 emit_note (NOTE_INSN_PROLOGUE_END);
3365 if (vcall_offset == 0)
3366 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3367 else
3369 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3371 this_rtx = gen_rtx_REG (Pmode, this_regno);
3372 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3373 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3375 addr = this_rtx;
3376 if (delta != 0)
3378 if (delta >= -256 && delta < 256)
3379 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3380 plus_constant (Pmode, this_rtx, delta));
3381 else
3382 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3385 if (Pmode == ptr_mode)
3386 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3387 else
3388 aarch64_emit_move (temp0,
3389 gen_rtx_ZERO_EXTEND (Pmode,
3390 gen_rtx_MEM (ptr_mode, addr)));
3392 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3393 addr = plus_constant (Pmode, temp0, vcall_offset);
3394 else
3396 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
3397 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3400 if (Pmode == ptr_mode)
3401 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3402 else
3403 aarch64_emit_move (temp1,
3404 gen_rtx_SIGN_EXTEND (Pmode,
3405 gen_rtx_MEM (ptr_mode, addr)));
3407 emit_insn (gen_add2_insn (this_rtx, temp1));
3410 /* Generate a tail call to the target function. */
3411 if (!TREE_USED (function))
3413 assemble_external (function);
3414 TREE_USED (function) = 1;
3416 funexp = XEXP (DECL_RTL (function), 0);
3417 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3418 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3419 SIBLING_CALL_P (insn) = 1;
3421 insn = get_insns ();
3422 shorten_branches (insn);
3423 final_start_function (insn, file, 1);
3424 final (insn, file, 1);
3425 final_end_function ();
3427 /* Stop pretending to be a post-reload pass. */
3428 reload_completed = 0;
3431 static bool
3432 aarch64_tls_referenced_p (rtx x)
3434 if (!TARGET_HAVE_TLS)
3435 return false;
3436 subrtx_iterator::array_type array;
3437 FOR_EACH_SUBRTX (iter, array, x, ALL)
3439 const_rtx x = *iter;
3440 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3441 return true;
3442 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3443 TLS offsets, not real symbol references. */
3444 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3445 iter.skip_subrtxes ();
3447 return false;
3451 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3452 a left shift of 0 or 12 bits. */
3453 bool
3454 aarch64_uimm12_shift (HOST_WIDE_INT val)
3456 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3457 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
3462 /* Return true if val is an immediate that can be loaded into a
3463 register by a MOVZ instruction. */
3464 static bool
3465 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3467 if (GET_MODE_SIZE (mode) > 4)
3469 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3470 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3471 return 1;
3473 else
3475 /* Ignore sign extension. */
3476 val &= (HOST_WIDE_INT) 0xffffffff;
3478 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3479 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
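/* For example, 0x12340000 is accepted in SImode (one 16-bit chunk at a
   shift of 16, i.e. roughly "movz w0, #0x1234, lsl #16"), whereas
   0x12345678 spans two chunks and is rejected.  */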
3482 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3484 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3486 0x0000000100000001ull,
3487 0x0001000100010001ull,
3488 0x0101010101010101ull,
3489 0x1111111111111111ull,
3490 0x5555555555555555ull,
3494 /* Return true if val is a valid bitmask immediate. */
3496 bool
3497 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3499 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3500 int bits;
3502 /* Check for a single sequence of one bits and return quickly if so.
3503 The special cases of all ones and all zeroes return false. */
3504 val = (unsigned HOST_WIDE_INT) val_in;
3505 tmp = val + (val & -val);
3507 if (tmp == (tmp & -tmp))
3508 return (val + 1) > 1;
3510 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3511 if (mode == SImode)
3512 val = (val << 32) | (val & 0xffffffff);
3514 /* Invert if the immediate doesn't start with a zero bit - this means we
3515 only need to search for sequences of one bits. */
3516 if (val & 1)
3517 val = ~val;
3519 /* Find the first set bit and set tmp to val with the first sequence of one
3520 bits removed. Return success if there is a single sequence of ones. */
3521 first_one = val & -val;
3522 tmp = val & (val + first_one);
3524 if (tmp == 0)
3525 return true;
3527 /* Find the next set bit and compute the difference in bit position. */
3528 next_one = tmp & -tmp;
3529 bits = clz_hwi (first_one) - clz_hwi (next_one);
3530 mask = val ^ tmp;
3532 /* Check the bit position difference is a power of 2, and that the first
3533 sequence of one bits fits within 'bits' bits. */
3534 if ((mask >> bits) != 0 || bits != (bits & -bits))
3535 return false;
3537 /* Check the sequence of one bits is repeated 64/bits times. */
3538 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
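/* A few sanity checks of the algorithm above:

     0x00000000000000f0 is a single run of ones, so the quick test
       accepts it: val + (val & -val) == 0x100, a power of two.

     0x5555555555555555 starts with a one bit and is inverted to
       0xaaaaaaaaaaaaaaaa; the detected element width is 2 bits and
       the replication check 2 * 0x5555555555555555 == 0xaaaaaaaaaaaaaaaa
       succeeds, so the pattern repeats 32 times and it is accepted.

     0x0000000012345678 contains several runs of ones that do not
       repeat, so it is rejected.  */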
3542 /* Return true if val is an immediate that can be loaded into a
3543 register in a single instruction. */
3544 bool
3545 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3547 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3548 return 1;
3549 return aarch64_bitmask_imm (val, mode);
3552 static bool
3553 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3555 rtx base, offset;
3557 if (GET_CODE (x) == HIGH)
3558 return true;
3560 split_const (x, &base, &offset);
3561 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3563 if (aarch64_classify_symbol (base, offset)
3564 != SYMBOL_FORCE_TO_MEM)
3565 return true;
3566 else
3567 /* Avoid generating a 64-bit relocation in ILP32; leave
3568 to aarch64_expand_mov_immediate to handle it properly. */
3569 return mode != ptr_mode;
3572 return aarch64_tls_referenced_p (x);
3575 /* Implement TARGET_CASE_VALUES_THRESHOLD.
3576 The expansion for a table switch is quite expensive due to the number
3577 of instructions, the table lookup and the hard-to-predict indirect jump.
3578 When optimizing for speed, and -O3 enabled, use the per-core tuning if
3579 set, otherwise use tables for > 16 cases as a tradeoff between size and
3580 performance. When optimizing for size, use the default setting. */
3582 static unsigned int
3583 aarch64_case_values_threshold (void)
3585 /* Use the specified limit for the number of cases before using jump
3586 tables at higher optimization levels. */
3587 if (optimize > 2
3588 && selected_cpu->tune->max_case_values != 0)
3589 return selected_cpu->tune->max_case_values;
3590 else
3591 return optimize_size ? default_case_values_threshold () : 17;
3594 /* Return true if register REGNO is a valid index register.
3595 STRICT_P is true if REG_OK_STRICT is in effect. */
3597 bool
3598 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3600 if (!HARD_REGISTER_NUM_P (regno))
3602 if (!strict_p)
3603 return true;
3605 if (!reg_renumber)
3606 return false;
3608 regno = reg_renumber[regno];
3610 return GP_REGNUM_P (regno);
3613 /* Return true if register REGNO is a valid base register for mode MODE.
3614 STRICT_P is true if REG_OK_STRICT is in effect. */
3616 bool
3617 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3619 if (!HARD_REGISTER_NUM_P (regno))
3621 if (!strict_p)
3622 return true;
3624 if (!reg_renumber)
3625 return false;
3627 regno = reg_renumber[regno];
3630 /* The fake registers will be eliminated to either the stack or
3631 hard frame pointer, both of which are usually valid base registers.
3632 Reload deals with the cases where the eliminated form isn't valid. */
3633 return (GP_REGNUM_P (regno)
3634 || regno == SP_REGNUM
3635 || regno == FRAME_POINTER_REGNUM
3636 || regno == ARG_POINTER_REGNUM);
3639 /* Return true if X is a valid base register for mode MODE.
3640 STRICT_P is true if REG_OK_STRICT is in effect. */
3642 static bool
3643 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3645 if (!strict_p && GET_CODE (x) == SUBREG)
3646 x = SUBREG_REG (x);
3648 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3651 /* Return true if address offset is a valid index. If it is, fill in INFO
3652 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3654 static bool
3655 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3656 machine_mode mode, bool strict_p)
3658 enum aarch64_address_type type;
3659 rtx index;
3660 int shift;
3662 /* (reg:P) */
3663 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3664 && GET_MODE (x) == Pmode)
3666 type = ADDRESS_REG_REG;
3667 index = x;
3668 shift = 0;
3670 /* (sign_extend:DI (reg:SI)) */
3671 else if ((GET_CODE (x) == SIGN_EXTEND
3672 || GET_CODE (x) == ZERO_EXTEND)
3673 && GET_MODE (x) == DImode
3674 && GET_MODE (XEXP (x, 0)) == SImode)
3676 type = (GET_CODE (x) == SIGN_EXTEND)
3677 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3678 index = XEXP (x, 0);
3679 shift = 0;
3681 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3682 else if (GET_CODE (x) == MULT
3683 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3684 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3685 && GET_MODE (XEXP (x, 0)) == DImode
3686 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3687 && CONST_INT_P (XEXP (x, 1)))
3689 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3690 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3691 index = XEXP (XEXP (x, 0), 0);
3692 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3694 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3695 else if (GET_CODE (x) == ASHIFT
3696 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3697 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3698 && GET_MODE (XEXP (x, 0)) == DImode
3699 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3700 && CONST_INT_P (XEXP (x, 1)))
3702 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3703 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3704 index = XEXP (XEXP (x, 0), 0);
3705 shift = INTVAL (XEXP (x, 1));
3707 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3708 else if ((GET_CODE (x) == SIGN_EXTRACT
3709 || GET_CODE (x) == ZERO_EXTRACT)
3710 && GET_MODE (x) == DImode
3711 && GET_CODE (XEXP (x, 0)) == MULT
3712 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3713 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3715 type = (GET_CODE (x) == SIGN_EXTRACT)
3716 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3717 index = XEXP (XEXP (x, 0), 0);
3718 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3719 if (INTVAL (XEXP (x, 1)) != 32 + shift
3720 || INTVAL (XEXP (x, 2)) != 0)
3721 shift = -1;
3723 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3724 (const_int 0xffffffff<<shift)) */
3725 else if (GET_CODE (x) == AND
3726 && GET_MODE (x) == DImode
3727 && GET_CODE (XEXP (x, 0)) == MULT
3728 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3729 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3730 && CONST_INT_P (XEXP (x, 1)))
3732 type = ADDRESS_REG_UXTW;
3733 index = XEXP (XEXP (x, 0), 0);
3734 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3735 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3736 shift = -1;
3738 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3739 else if ((GET_CODE (x) == SIGN_EXTRACT
3740 || GET_CODE (x) == ZERO_EXTRACT)
3741 && GET_MODE (x) == DImode
3742 && GET_CODE (XEXP (x, 0)) == ASHIFT
3743 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3744 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3746 type = (GET_CODE (x) == SIGN_EXTRACT)
3747 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3748 index = XEXP (XEXP (x, 0), 0);
3749 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3750 if (INTVAL (XEXP (x, 1)) != 32 + shift
3751 || INTVAL (XEXP (x, 2)) != 0)
3752 shift = -1;
3754 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3755 (const_int 0xffffffff<<shift)) */
3756 else if (GET_CODE (x) == AND
3757 && GET_MODE (x) == DImode
3758 && GET_CODE (XEXP (x, 0)) == ASHIFT
3759 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3760 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3761 && CONST_INT_P (XEXP (x, 1)))
3763 type = ADDRESS_REG_UXTW;
3764 index = XEXP (XEXP (x, 0), 0);
3765 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3766 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3767 shift = -1;
3769 /* (mult:P (reg:P) (const_int scale)) */
3770 else if (GET_CODE (x) == MULT
3771 && GET_MODE (x) == Pmode
3772 && GET_MODE (XEXP (x, 0)) == Pmode
3773 && CONST_INT_P (XEXP (x, 1)))
3775 type = ADDRESS_REG_REG;
3776 index = XEXP (x, 0);
3777 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3779 /* (ashift:P (reg:P) (const_int shift)) */
3780 else if (GET_CODE (x) == ASHIFT
3781 && GET_MODE (x) == Pmode
3782 && GET_MODE (XEXP (x, 0)) == Pmode
3783 && CONST_INT_P (XEXP (x, 1)))
3785 type = ADDRESS_REG_REG;
3786 index = XEXP (x, 0);
3787 shift = INTVAL (XEXP (x, 1));
3789 else
3790 return false;
3792 if (GET_CODE (index) == SUBREG)
3793 index = SUBREG_REG (index);
3795 if ((shift == 0 ||
3796 (shift > 0 && shift <= 3
3797 && (1 << shift) == GET_MODE_SIZE (mode)))
3798 && REG_P (index)
3799 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3801 info->type = type;
3802 info->offset = index;
3803 info->shift = shift;
3804 return true;
3807 return false;
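/* The index forms accepted above correspond to addresses such as

     ldr	x0, [x1, x2]		(shift == 0)
     ldr	x0, [x1, x2, lsl #3]	(shift == 3 for an 8-byte access)
     ldr	w0, [x1, w2, sxtw #2]	(sign-extended 32-bit index)

   where a non-zero shift must match the access size exactly.  */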
3810 bool
3811 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3813 return (offset >= -64 * GET_MODE_SIZE (mode)
3814 && offset < 64 * GET_MODE_SIZE (mode)
3815 && offset % GET_MODE_SIZE (mode) == 0);
3818 static inline bool
3819 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3820 HOST_WIDE_INT offset)
3822 return offset >= -256 && offset < 256;
3825 static inline bool
3826 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3828 return (offset >= 0
3829 && offset < 4096 * GET_MODE_SIZE (mode)
3830 && offset % GET_MODE_SIZE (mode) == 0);
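/* For a DImode (8-byte) access the three predicates above accept,
   respectively:

     7-bit signed, scaled	-512 .. 504 in steps of 8	(LDP/STP)
     9-bit signed, unscaled	-256 .. 255			(LDUR/STUR)
     12-bit unsigned, scaled	0 .. 32760 in steps of 8	(LDR/STR)  */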
3833 /* Return true if MODE is one of the modes for which we
3834 support LDP/STP operations. */
3836 static bool
3837 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3839 return mode == SImode || mode == DImode
3840 || mode == SFmode || mode == DFmode
3841 || (aarch64_vector_mode_supported_p (mode)
3842 && GET_MODE_SIZE (mode) == 8);
3845 /* Return true if REGNO is a virtual pointer register, or an eliminable
3846 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
3847 include stack_pointer or hard_frame_pointer. */
3848 static bool
3849 virt_or_elim_regno_p (unsigned regno)
3851 return ((regno >= FIRST_VIRTUAL_REGISTER
3852 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
3853 || regno == FRAME_POINTER_REGNUM
3854 || regno == ARG_POINTER_REGNUM);
3857 /* Return true if X is a valid address for machine mode MODE. If it is,
3858 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3859 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3861 static bool
3862 aarch64_classify_address (struct aarch64_address_info *info,
3863 rtx x, machine_mode mode,
3864 RTX_CODE outer_code, bool strict_p)
3866 enum rtx_code code = GET_CODE (x);
3867 rtx op0, op1;
3869 /* On BE, we use load/store pair for all large int mode load/stores. */
3870 bool load_store_pair_p = (outer_code == PARALLEL
3871 || (BYTES_BIG_ENDIAN
3872 && aarch64_vect_struct_mode_p (mode)));
3874 bool allow_reg_index_p =
3875 !load_store_pair_p
3876 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3877 && !aarch64_vect_struct_mode_p (mode);
3879 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3880 REG addressing. */
3881 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3882 && (code != POST_INC && code != REG))
3883 return false;
3885 switch (code)
3887 case REG:
3888 case SUBREG:
3889 info->type = ADDRESS_REG_IMM;
3890 info->base = x;
3891 info->offset = const0_rtx;
3892 return aarch64_base_register_rtx_p (x, strict_p);
3894 case PLUS:
3895 op0 = XEXP (x, 0);
3896 op1 = XEXP (x, 1);
3898 if (! strict_p
3899 && REG_P (op0)
3900 && virt_or_elim_regno_p (REGNO (op0))
3901 && CONST_INT_P (op1))
3903 info->type = ADDRESS_REG_IMM;
3904 info->base = op0;
3905 info->offset = op1;
3907 return true;
3910 if (GET_MODE_SIZE (mode) != 0
3911 && CONST_INT_P (op1)
3912 && aarch64_base_register_rtx_p (op0, strict_p))
3914 HOST_WIDE_INT offset = INTVAL (op1);
3916 info->type = ADDRESS_REG_IMM;
3917 info->base = op0;
3918 info->offset = op1;
3920 /* TImode and TFmode values are allowed in both pairs of X
3921 registers and individual Q registers. The available
3922 address modes are:
3923 X,X: 7-bit signed scaled offset
3924 Q: 9-bit signed offset
3925 We conservatively require an offset representable in either mode. */
3927 if (mode == TImode || mode == TFmode)
3928 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3929 && offset_9bit_signed_unscaled_p (mode, offset));
3931 /* A 7-bit offset check because OImode will emit an ldp/stp
3932 instruction (only big endian will get here).
3933 For ldp/stp instructions, the offset is scaled for the size of a
3934 single element of the pair. */
3935 if (mode == OImode)
3936 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3938 /* Three 9/12-bit offset checks because CImode will emit three
3939 ldr/str instructions (only big endian will get here). */
3940 if (mode == CImode)
3941 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3942 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3943 || offset_12bit_unsigned_scaled_p (V16QImode,
3944 offset + 32)));
3946 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3947 instructions (only big endian will get here). */
3948 if (mode == XImode)
3949 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3950 && aarch64_offset_7bit_signed_scaled_p (TImode,
3951 offset + 32));
3953 if (load_store_pair_p)
3954 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3955 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3956 else
3957 return (offset_9bit_signed_unscaled_p (mode, offset)
3958 || offset_12bit_unsigned_scaled_p (mode, offset));
3961 if (allow_reg_index_p)
3963 /* Look for base + (scaled/extended) index register. */
3964 if (aarch64_base_register_rtx_p (op0, strict_p)
3965 && aarch64_classify_index (info, op1, mode, strict_p))
3967 info->base = op0;
3968 return true;
3970 if (aarch64_base_register_rtx_p (op1, strict_p)
3971 && aarch64_classify_index (info, op0, mode, strict_p))
3973 info->base = op1;
3974 return true;
3978 return false;
3980 case POST_INC:
3981 case POST_DEC:
3982 case PRE_INC:
3983 case PRE_DEC:
3984 info->type = ADDRESS_REG_WB;
3985 info->base = XEXP (x, 0);
3986 info->offset = NULL_RTX;
3987 return aarch64_base_register_rtx_p (info->base, strict_p);
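  /* For example, a DImode POST_INC access ends up printed roughly as
     "ldr x0, [x1], 8" and a PRE_DEC access as "ldr x0, [x1, -8]!"; the
     offset is implied by the mode size, so only the base register needs
     validating here.  */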
3989 case POST_MODIFY:
3990 case PRE_MODIFY:
3991 info->type = ADDRESS_REG_WB;
3992 info->base = XEXP (x, 0);
3993 if (GET_CODE (XEXP (x, 1)) == PLUS
3994 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3995 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3996 && aarch64_base_register_rtx_p (info->base, strict_p))
3998 HOST_WIDE_INT offset;
3999 info->offset = XEXP (XEXP (x, 1), 1);
4000 offset = INTVAL (info->offset);
4002 /* TImode and TFmode values are allowed in both pairs of X
4003 registers and individual Q registers. The available
4004 address modes are:
4005 X,X: 7-bit signed scaled offset
4006 Q: 9-bit signed offset
4007 We conservatively require an offset representable in both modes.
4009 if (mode == TImode || mode == TFmode)
4010 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4011 && offset_9bit_signed_unscaled_p (mode, offset));
4013 if (load_store_pair_p)
4014 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4015 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4016 else
4017 return offset_9bit_signed_unscaled_p (mode, offset);
4019 return false;
4021 case CONST:
4022 case SYMBOL_REF:
4023 case LABEL_REF:
4024 /* load literal: pc-relative constant pool entry. Only supported
4025 for SI mode or larger. */
4026 info->type = ADDRESS_SYMBOLIC;
4028 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4030 rtx sym, addend;
4032 split_const (x, &sym, &addend);
4033 return ((GET_CODE (sym) == LABEL_REF
4034 || (GET_CODE (sym) == SYMBOL_REF
4035 && CONSTANT_POOL_ADDRESS_P (sym)
4036 && !aarch64_nopcrelative_literal_loads)));
4038 return false;
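  /* A LO_SUM address typically comes from the small code model sequence,
     e.g. "adrp x0, sym" followed by "ldr w1, [x0, #:lo12:sym]", so the
     low 12 bits must respect the alignment of the access, which is what
     the checks below enforce.  */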
4040 case LO_SUM:
4041 info->type = ADDRESS_LO_SUM;
4042 info->base = XEXP (x, 0);
4043 info->offset = XEXP (x, 1);
4044 if (allow_reg_index_p
4045 && aarch64_base_register_rtx_p (info->base, strict_p))
4047 rtx sym, offs;
4048 split_const (info->offset, &sym, &offs);
4049 if (GET_CODE (sym) == SYMBOL_REF
4050 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4052 /* The symbol and offset must be aligned to the access size. */
4053 unsigned int align;
4054 unsigned int ref_size;
4056 if (CONSTANT_POOL_ADDRESS_P (sym))
4057 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4058 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4060 tree exp = SYMBOL_REF_DECL (sym);
4061 align = TYPE_ALIGN (TREE_TYPE (exp));
4062 align = CONSTANT_ALIGNMENT (exp, align);
4064 else if (SYMBOL_REF_DECL (sym))
4065 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4066 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4067 && SYMBOL_REF_BLOCK (sym) != NULL)
4068 align = SYMBOL_REF_BLOCK (sym)->alignment;
4069 else
4070 align = BITS_PER_UNIT;
4072 ref_size = GET_MODE_SIZE (mode);
4073 if (ref_size == 0)
4074 ref_size = GET_MODE_SIZE (DImode);
4076 return ((INTVAL (offs) & (ref_size - 1)) == 0
4077 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4080 return false;
4082 default:
4083 return false;
4087 bool
4088 aarch64_symbolic_address_p (rtx x)
4090 rtx offset;
4092 split_const (x, &x, &offset);
4093 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4096 /* Classify the base of symbolic expression X. */
4098 enum aarch64_symbol_type
4099 aarch64_classify_symbolic_expression (rtx x)
4101 rtx offset;
4103 split_const (x, &x, &offset);
4104 return aarch64_classify_symbol (x, offset);
4108 /* Return TRUE if X is a legitimate address for accessing memory in
4109 mode MODE. */
4110 static bool
4111 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4113 struct aarch64_address_info addr;
4115 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4118 /* Return TRUE if X is a legitimate address for accessing memory in
4119 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4120 pair operation. */
4121 bool
4122 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4123 RTX_CODE outer_code, bool strict_p)
4125 struct aarch64_address_info addr;
4127 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4130 /* Return TRUE if rtx X is immediate constant 0.0 */
4131 bool
4132 aarch64_float_const_zero_rtx_p (rtx x)
4134 if (GET_MODE (x) == VOIDmode)
4135 return false;
4137 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4138 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4139 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4142 /* Return the fixed registers used for condition codes. */
4144 static bool
4145 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4147 *p1 = CC_REGNUM;
4148 *p2 = INVALID_REGNUM;
4149 return true;
4152 /* Emit call insn with PAT and do aarch64-specific handling. */
4154 void
4155 aarch64_emit_call_insn (rtx pat)
4157 rtx insn = emit_call_insn (pat);
4159 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4160 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4161 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4164 machine_mode
4165 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4167 /* All floating point compares return CCFP if it is an equality
4168 comparison, and CCFPE otherwise. */
4169 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4171 switch (code)
4173 case EQ:
4174 case NE:
4175 case UNORDERED:
4176 case ORDERED:
4177 case UNLT:
4178 case UNLE:
4179 case UNGT:
4180 case UNGE:
4181 case UNEQ:
4182 case LTGT:
4183 return CCFPmode;
4185 case LT:
4186 case LE:
4187 case GT:
4188 case GE:
4189 return CCFPEmode;
4191 default:
4192 gcc_unreachable ();
4196 /* Equality comparisons of short modes against zero can be performed
4197 using the TST instruction with the appropriate bitmask. */
4198 if (y == const0_rtx && REG_P (x)
4199 && (code == EQ || code == NE)
4200 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4201 return CC_NZmode;
4203 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4204 && y == const0_rtx
4205 && (code == EQ || code == NE || code == LT || code == GE)
4206 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4207 || GET_CODE (x) == NEG
4208 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4209 && CONST_INT_P (XEXP (x, 2)))))
4210 return CC_NZmode;
4212 /* A compare with a shifted operand. Because of canonicalization,
4213 the comparison will have to be swapped when we emit the assembly
4214 code. */
4215 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4216 && (REG_P (y) || GET_CODE (y) == SUBREG)
4217 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4218 || GET_CODE (x) == LSHIFTRT
4219 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4220 return CC_SWPmode;
4222 /* Similarly for a negated operand, but we can only do this for
4223 equalities. */
4224 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4225 && (REG_P (y) || GET_CODE (y) == SUBREG)
4226 && (code == EQ || code == NE)
4227 && GET_CODE (x) == NEG)
4228 return CC_Zmode;
4230 /* A test for unsigned overflow. */
4231 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4232 && code == NE
4233 && GET_CODE (x) == PLUS
4234 && GET_CODE (y) == ZERO_EXTEND)
4235 return CC_Cmode;
4237 /* For everything else, return CCmode. */
4238 return CCmode;
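  /* As an illustration of CC_SWPmode: a comparison such as
     (compare (ashift x 2) y) has to be output with the operands swapped,
     e.g. "cmp y, x, lsl #2", so the condition itself must also be
     reversed; aarch64_get_condition_code_1 below performs that mapping.  */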
4241 static int
4242 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4245 aarch64_get_condition_code (rtx x)
4247 machine_mode mode = GET_MODE (XEXP (x, 0));
4248 enum rtx_code comp_code = GET_CODE (x);
4250 if (GET_MODE_CLASS (mode) != MODE_CC)
4251 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4252 return aarch64_get_condition_code_1 (mode, comp_code);
4255 static int
4256 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4258 switch (mode)
4260 case CCFPmode:
4261 case CCFPEmode:
4262 switch (comp_code)
4264 case GE: return AARCH64_GE;
4265 case GT: return AARCH64_GT;
4266 case LE: return AARCH64_LS;
4267 case LT: return AARCH64_MI;
4268 case NE: return AARCH64_NE;
4269 case EQ: return AARCH64_EQ;
4270 case ORDERED: return AARCH64_VC;
4271 case UNORDERED: return AARCH64_VS;
4272 case UNLT: return AARCH64_LT;
4273 case UNLE: return AARCH64_LE;
4274 case UNGT: return AARCH64_HI;
4275 case UNGE: return AARCH64_PL;
4276 default: return -1;
4278 break;
4280 case CCmode:
4281 switch (comp_code)
4283 case NE: return AARCH64_NE;
4284 case EQ: return AARCH64_EQ;
4285 case GE: return AARCH64_GE;
4286 case GT: return AARCH64_GT;
4287 case LE: return AARCH64_LE;
4288 case LT: return AARCH64_LT;
4289 case GEU: return AARCH64_CS;
4290 case GTU: return AARCH64_HI;
4291 case LEU: return AARCH64_LS;
4292 case LTU: return AARCH64_CC;
4293 default: return -1;
4295 break;
4297 case CC_SWPmode:
4298 switch (comp_code)
4300 case NE: return AARCH64_NE;
4301 case EQ: return AARCH64_EQ;
4302 case GE: return AARCH64_LE;
4303 case GT: return AARCH64_LT;
4304 case LE: return AARCH64_GE;
4305 case LT: return AARCH64_GT;
4306 case GEU: return AARCH64_LS;
4307 case GTU: return AARCH64_CC;
4308 case LEU: return AARCH64_CS;
4309 case LTU: return AARCH64_HI;
4310 default: return -1;
4312 break;
4314 case CC_NZmode:
4315 switch (comp_code)
4317 case NE: return AARCH64_NE;
4318 case EQ: return AARCH64_EQ;
4319 case GE: return AARCH64_PL;
4320 case LT: return AARCH64_MI;
4321 default: return -1;
4323 break;
4325 case CC_Zmode:
4326 switch (comp_code)
4328 case NE: return AARCH64_NE;
4329 case EQ: return AARCH64_EQ;
4330 default: return -1;
4332 break;
4334 case CC_Cmode:
4335 switch (comp_code)
4337 case NE: return AARCH64_CS;
4338 case EQ: return AARCH64_CC;
4339 default: return -1;
4341 break;
4343 default:
4344 return -1;
4345 break;
4348 return -1;
4351 bool
4352 aarch64_const_vec_all_same_in_range_p (rtx x,
4353 HOST_WIDE_INT minval,
4354 HOST_WIDE_INT maxval)
4356 HOST_WIDE_INT firstval;
4357 int count, i;
4359 if (GET_CODE (x) != CONST_VECTOR
4360 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4361 return false;
4363 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4364 if (firstval < minval || firstval > maxval)
4365 return false;
4367 count = CONST_VECTOR_NUNITS (x);
4368 for (i = 1; i < count; i++)
4369 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4370 return false;
4372 return true;
4375 bool
4376 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4378 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4382 /* N Z C V. */
4383 #define AARCH64_CC_V 1
4384 #define AARCH64_CC_C (1 << 1)
4385 #define AARCH64_CC_Z (1 << 2)
4386 #define AARCH64_CC_N (1 << 3)
4388 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4389 static const int aarch64_nzcv_codes[] =
4391 0, /* EQ, Z == 1. */
4392 AARCH64_CC_Z, /* NE, Z == 0. */
4393 0, /* CS, C == 1. */
4394 AARCH64_CC_C, /* CC, C == 0. */
4395 0, /* MI, N == 1. */
4396 AARCH64_CC_N, /* PL, N == 0. */
4397 0, /* VS, V == 1. */
4398 AARCH64_CC_V, /* VC, V == 0. */
4399 0, /* HI, C == 1 && Z == 0. */
4400 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4401 AARCH64_CC_V, /* GE, N == V. */
4402 0, /* LT, N != V. */
4403 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4404 0, /* LE, !(Z == 0 && N == V). */
4405 0, /* AL, Any. */
4406 0 /* NV, Any. */
4409 static void
4410 aarch64_print_operand (FILE *f, rtx x, int code)
4412 switch (code)
4414 /* An integer or symbol address without a preceding # sign. */
4415 case 'c':
4416 switch (GET_CODE (x))
4418 case CONST_INT:
4419 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4420 break;
4422 case SYMBOL_REF:
4423 output_addr_const (f, x);
4424 break;
4426 case CONST:
4427 if (GET_CODE (XEXP (x, 0)) == PLUS
4428 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4430 output_addr_const (f, x);
4431 break;
4433 /* Fall through. */
4435 default:
4436 output_operand_lossage ("Unsupported operand for code '%c'", code);
4438 break;
4440 case 'e':
4441 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4443 int n;
4445 if (!CONST_INT_P (x)
4446 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4448 output_operand_lossage ("invalid operand for '%%%c'", code);
4449 return;
4452 switch (n)
4454 case 3:
4455 fputc ('b', f);
4456 break;
4457 case 4:
4458 fputc ('h', f);
4459 break;
4460 case 5:
4461 fputc ('w', f);
4462 break;
4463 default:
4464 output_operand_lossage ("invalid operand for '%%%c'", code);
4465 return;
4468 break;
4470 case 'p':
4472 int n;
4474 /* Print N such that 2^N == X. */
4475 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4477 output_operand_lossage ("invalid operand for '%%%c'", code);
4478 return;
4481 asm_fprintf (f, "%d", n);
4483 break;
4485 case 'P':
4486 /* Print the number of non-zero bits in X (a const_int). */
4487 if (!CONST_INT_P (x))
4489 output_operand_lossage ("invalid operand for '%%%c'", code);
4490 return;
4493 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4494 break;
4496 case 'H':
4497 /* Print the higher numbered register of a pair (TImode) of regs. */
4498 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4500 output_operand_lossage ("invalid operand for '%%%c'", code);
4501 return;
4504 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4505 break;
4507 case 'M':
4508 case 'm':
4510 int cond_code;
4511 /* Print a condition (eq, ne, etc) or its inverse. */
4513 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4514 if (x == const_true_rtx)
4516 if (code == 'M')
4517 fputs ("nv", f);
4518 return;
4521 if (!COMPARISON_P (x))
4523 output_operand_lossage ("invalid operand for '%%%c'", code);
4524 return;
4527 cond_code = aarch64_get_condition_code (x);
4528 gcc_assert (cond_code >= 0);
4529 if (code == 'M')
4530 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4531 fputs (aarch64_condition_codes[cond_code], f);
4533 break;
4535 case 'b':
4536 case 'h':
4537 case 's':
4538 case 'd':
4539 case 'q':
4540 /* Print a scalar FP/SIMD register name. */
4541 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4543 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4544 return;
4546 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4547 break;
4549 case 'S':
4550 case 'T':
4551 case 'U':
4552 case 'V':
4553 /* Print the first FP/SIMD register name in a list. */
4554 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4556 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4557 return;
4559 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4560 break;
4562 case 'R':
4563 /* Print a scalar FP/SIMD register name + 1. */
4564 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4566 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4567 return;
4569 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4570 break;
4572 case 'X':
4573 /* Print bottom 16 bits of integer constant in hex. */
4574 if (!CONST_INT_P (x))
4576 output_operand_lossage ("invalid operand for '%%%c'", code);
4577 return;
4579 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4580 break;
4582 case 'w':
4583 case 'x':
4584 /* Print a general register name or the zero register (32-bit or
4585 64-bit). */
4586 if (x == const0_rtx
4587 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4589 asm_fprintf (f, "%czr", code);
4590 break;
4593 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4595 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4596 break;
4599 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4601 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4602 break;
4605 /* Fall through */
4607 case 0:
4608 /* Print a normal operand. If it's a general register, then we
4609 assume DImode. */
4610 if (x == NULL)
4612 output_operand_lossage ("missing operand");
4613 return;
4616 switch (GET_CODE (x))
4618 case REG:
4619 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4620 break;
4622 case MEM:
4623 output_address (GET_MODE (x), XEXP (x, 0));
4624 break;
4626 case CONST:
4627 case LABEL_REF:
4628 case SYMBOL_REF:
4629 output_addr_const (asm_out_file, x);
4630 break;
4632 case CONST_INT:
4633 asm_fprintf (f, "%wd", INTVAL (x));
4634 break;
4636 case CONST_VECTOR:
4637 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4639 gcc_assert (
4640 aarch64_const_vec_all_same_in_range_p (x,
4641 HOST_WIDE_INT_MIN,
4642 HOST_WIDE_INT_MAX));
4643 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4645 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4647 fputc ('0', f);
4649 else
4650 gcc_unreachable ();
4651 break;
4653 case CONST_DOUBLE:
4654 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4655 be getting CONST_DOUBLEs holding integers. */
4656 gcc_assert (GET_MODE (x) != VOIDmode);
4657 if (aarch64_float_const_zero_rtx_p (x))
4659 fputc ('0', f);
4660 break;
4662 else if (aarch64_float_const_representable_p (x))
4664 #define buf_size 20
4665 char float_buf[buf_size] = {'\0'};
4666 real_to_decimal_for_mode (float_buf,
4667 CONST_DOUBLE_REAL_VALUE (x),
4668 buf_size, buf_size,
4669 1, GET_MODE (x));
4670 asm_fprintf (asm_out_file, "%s", float_buf);
4671 break;
4672 #undef buf_size
4674 output_operand_lossage ("invalid constant");
4675 return;
4676 default:
4677 output_operand_lossage ("invalid operand");
4678 return;
4680 break;
4682 case 'A':
4683 if (GET_CODE (x) == HIGH)
4684 x = XEXP (x, 0);
4686 switch (aarch64_classify_symbolic_expression (x))
4688 case SYMBOL_SMALL_GOT_4G:
4689 asm_fprintf (asm_out_file, ":got:");
4690 break;
4692 case SYMBOL_SMALL_TLSGD:
4693 asm_fprintf (asm_out_file, ":tlsgd:");
4694 break;
4696 case SYMBOL_SMALL_TLSDESC:
4697 asm_fprintf (asm_out_file, ":tlsdesc:");
4698 break;
4700 case SYMBOL_SMALL_TLSIE:
4701 asm_fprintf (asm_out_file, ":gottprel:");
4702 break;
4704 case SYMBOL_TLSLE24:
4705 asm_fprintf (asm_out_file, ":tprel:");
4706 break;
4708 case SYMBOL_TINY_GOT:
4709 gcc_unreachable ();
4710 break;
4712 default:
4713 break;
4715 output_addr_const (asm_out_file, x);
4716 break;
4718 case 'L':
4719 switch (aarch64_classify_symbolic_expression (x))
4721 case SYMBOL_SMALL_GOT_4G:
4722 asm_fprintf (asm_out_file, ":lo12:");
4723 break;
4725 case SYMBOL_SMALL_TLSGD:
4726 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4727 break;
4729 case SYMBOL_SMALL_TLSDESC:
4730 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4731 break;
4733 case SYMBOL_SMALL_TLSIE:
4734 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4735 break;
4737 case SYMBOL_TLSLE12:
4738 asm_fprintf (asm_out_file, ":tprel_lo12:");
4739 break;
4741 case SYMBOL_TLSLE24:
4742 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4743 break;
4745 case SYMBOL_TINY_GOT:
4746 asm_fprintf (asm_out_file, ":got:");
4747 break;
4749 case SYMBOL_TINY_TLSIE:
4750 asm_fprintf (asm_out_file, ":gottprel:");
4751 break;
4753 default:
4754 break;
4756 output_addr_const (asm_out_file, x);
4757 break;
4759 case 'G':
4761 switch (aarch64_classify_symbolic_expression (x))
4763 case SYMBOL_TLSLE24:
4764 asm_fprintf (asm_out_file, ":tprel_hi12:");
4765 break;
4766 default:
4767 break;
4769 output_addr_const (asm_out_file, x);
4770 break;
4772 case 'k':
4774 HOST_WIDE_INT cond_code;
4775 /* Print nzcv. */
4777 if (!CONST_INT_P (x))
4779 output_operand_lossage ("invalid operand for '%%%c'", code);
4780 return;
4783 cond_code = INTVAL (x);
4784 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
4785 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
4787 break;
4789 default:
4790 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4791 return;
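 /* For reference, these modifiers are used from the output templates in
    aarch64.md; for example "add\t%w0, %w1, %w2" prints the 32-bit register
    names, while "%x" selects the 64-bit names and plain "%0" falls through
    to the default case above.  */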
4795 static void
4796 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4798 struct aarch64_address_info addr;
4800 if (aarch64_classify_address (&addr, x, mode, MEM, true))
4801 switch (addr.type)
4803 case ADDRESS_REG_IMM:
4804 if (addr.offset == const0_rtx)
4805 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4806 else
4807 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4808 INTVAL (addr.offset));
4809 return;
4811 case ADDRESS_REG_REG:
4812 if (addr.shift == 0)
4813 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4814 reg_names [REGNO (addr.offset)]);
4815 else
4816 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4817 reg_names [REGNO (addr.offset)], addr.shift);
4818 return;
4820 case ADDRESS_REG_UXTW:
4821 if (addr.shift == 0)
4822 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4823 REGNO (addr.offset) - R0_REGNUM);
4824 else
4825 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4826 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4827 return;
4829 case ADDRESS_REG_SXTW:
4830 if (addr.shift == 0)
4831 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4832 REGNO (addr.offset) - R0_REGNUM);
4833 else
4834 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4835 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4836 return;
4838 case ADDRESS_REG_WB:
4839 switch (GET_CODE (x))
4841 case PRE_INC:
4842 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4843 GET_MODE_SIZE (mode));
4844 return;
4845 case POST_INC:
4846 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4847 GET_MODE_SIZE (mode));
4848 return;
4849 case PRE_DEC:
4850 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4851 GET_MODE_SIZE (mode));
4852 return;
4853 case POST_DEC:
4854 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4855 GET_MODE_SIZE (mode));
4856 return;
4857 case PRE_MODIFY:
4858 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4859 INTVAL (addr.offset));
4860 return;
4861 case POST_MODIFY:
4862 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4863 INTVAL (addr.offset));
4864 return;
4865 default:
4866 break;
4868 break;
4870 case ADDRESS_LO_SUM:
4871 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4872 output_addr_const (f, addr.offset);
4873 asm_fprintf (f, "]");
4874 return;
4876 case ADDRESS_SYMBOLIC:
4877 break;
4880 output_addr_const (f, x);
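 /* The resulting syntax is, for example, "[x0]" for a plain register,
    "[x0, 16]" for register plus immediate, "[x0, x1, lsl 3]" for a scaled
    index, "[x0, w1, sxtw 2]" for an extended index and "[x0, #:lo12:sym]"
    for the LO_SUM form handled above.  */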
4883 bool
4884 aarch64_label_mentioned_p (rtx x)
4886 const char *fmt;
4887 int i;
4889 if (GET_CODE (x) == LABEL_REF)
4890 return true;
4892 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4893 referencing instruction, but they are constant offsets, not
4894 symbols. */
4895 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4896 return false;
4898 fmt = GET_RTX_FORMAT (GET_CODE (x));
4899 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4901 if (fmt[i] == 'E')
4903 int j;
4905 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4906 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4907 return 1;
4909 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4910 return 1;
4913 return 0;
4916 /* Implement REGNO_REG_CLASS. */
4918 enum reg_class
4919 aarch64_regno_regclass (unsigned regno)
4921 if (GP_REGNUM_P (regno))
4922 return GENERAL_REGS;
4924 if (regno == SP_REGNUM)
4925 return STACK_REG;
4927 if (regno == FRAME_POINTER_REGNUM
4928 || regno == ARG_POINTER_REGNUM)
4929 return POINTER_REGS;
4931 if (FP_REGNUM_P (regno))
4932 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4934 return NO_REGS;
4937 static rtx
4938 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4940 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4941 where mask is selected by alignment and size of the offset.
4942 We try to pick as large a range for the offset as possible to
4943 maximize the chance of a CSE. However, for aligned addresses
4944 we limit the range to 4k so that structures with different sized
4945 elements are likely to use the same base. We need to be careful
4946 not to split a CONST for some forms of address expression, otherwise
4947 it will generate sub-optimal code. */
4949 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4951 rtx base = XEXP (x, 0);
4952 rtx offset_rtx = XEXP (x, 1);
4953 HOST_WIDE_INT offset = INTVAL (offset_rtx);
4955 if (GET_CODE (base) == PLUS)
4957 rtx op0 = XEXP (base, 0);
4958 rtx op1 = XEXP (base, 1);
4960 /* Force any scaling into a temp for CSE. */
4961 op0 = force_reg (Pmode, op0);
4962 op1 = force_reg (Pmode, op1);
4964 /* Let the pointer register be in op0. */
4965 if (REG_POINTER (op1))
4966 std::swap (op0, op1);
4968 /* If the pointer is virtual or frame related, then we know that
4969 virtual register instantiation or register elimination is going
4970 to apply a second constant. We want the two constants folded
4971 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
4972 if (virt_or_elim_regno_p (REGNO (op0)))
4974 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
4975 NULL_RTX, true, OPTAB_DIRECT);
4976 return gen_rtx_PLUS (Pmode, base, op1);
4979 /* Otherwise, in order to encourage CSE (and thence loop strength
4980 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
4981 base = expand_binop (Pmode, add_optab, op0, op1,
4982 NULL_RTX, true, OPTAB_DIRECT);
4983 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
4986 /* Does it look like we'll need a load/store-pair operation? */
4987 HOST_WIDE_INT base_offset;
4988 if (GET_MODE_SIZE (mode) > 16
4989 || mode == TImode)
4990 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4991 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4992 /* For offsets that aren't a multiple of the access size, the limit is
4993 -256...255. */
4994 else if (offset & (GET_MODE_SIZE (mode) - 1))
4995 base_offset = (offset + 0x100) & ~0x1ff;
4996 else
4997 base_offset = offset & ~0xfff;
4999 if (base_offset != 0)
5001 base = plus_constant (Pmode, base, base_offset);
5002 base = force_operand (base, NULL_RTX);
5003 return plus_constant (Pmode, base, offset - base_offset);
5007 return x;
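 /* A rough example of the split above: for a DImode access at X + 0x2008
    the offset is a multiple of the access size, so base_offset becomes
    0x2000 and the address is rebuilt as (X + 0x2000) + 8, letting several
    nearby accesses share the X + 0x2000 base.  */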
5010 /* Return the reload icode required for a constant pool in mode. */
5011 static enum insn_code
5012 aarch64_constant_pool_reload_icode (machine_mode mode)
5014 switch (mode)
5016 case SFmode:
5017 return CODE_FOR_aarch64_reload_movcpsfdi;
5019 case DFmode:
5020 return CODE_FOR_aarch64_reload_movcpdfdi;
5022 case TFmode:
5023 return CODE_FOR_aarch64_reload_movcptfdi;
5025 case V8QImode:
5026 return CODE_FOR_aarch64_reload_movcpv8qidi;
5028 case V16QImode:
5029 return CODE_FOR_aarch64_reload_movcpv16qidi;
5031 case V4HImode:
5032 return CODE_FOR_aarch64_reload_movcpv4hidi;
5034 case V8HImode:
5035 return CODE_FOR_aarch64_reload_movcpv8hidi;
5037 case V2SImode:
5038 return CODE_FOR_aarch64_reload_movcpv2sidi;
5040 case V4SImode:
5041 return CODE_FOR_aarch64_reload_movcpv4sidi;
5043 case V2DImode:
5044 return CODE_FOR_aarch64_reload_movcpv2didi;
5046 case V2DFmode:
5047 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5049 default:
5050 gcc_unreachable ();
5053 gcc_unreachable ();
5055 static reg_class_t
5056 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5057 reg_class_t rclass,
5058 machine_mode mode,
5059 secondary_reload_info *sri)
5062 /* If we have to disable direct literal pool loads and stores because the
5063 function is too big, then we need a scratch register. */
5064 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5065 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5066 || targetm.vector_mode_supported_p (GET_MODE (x)))
5067 && aarch64_nopcrelative_literal_loads)
5069 sri->icode = aarch64_constant_pool_reload_icode (mode);
5070 return NO_REGS;
5073 /* Without the TARGET_SIMD instructions we cannot move a Q register
5074 to a Q register directly. We need a scratch. */
5075 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5076 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5077 && reg_class_subset_p (rclass, FP_REGS))
5079 if (mode == TFmode)
5080 sri->icode = CODE_FOR_aarch64_reload_movtf;
5081 else if (mode == TImode)
5082 sri->icode = CODE_FOR_aarch64_reload_movti;
5083 return NO_REGS;
5086 /* A TFmode or TImode memory access should be handled via an FP_REGS
5087 because AArch64 has richer addressing modes for LDR/STR instructions
5088 than LDP/STP instructions. */
5089 if (TARGET_FLOAT && rclass == GENERAL_REGS
5090 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5091 return FP_REGS;
5093 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5094 return GENERAL_REGS;
5096 return NO_REGS;
5099 static bool
5100 aarch64_can_eliminate (const int from, const int to)
5102 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5103 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5105 if (frame_pointer_needed)
5107 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5108 return true;
5109 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5110 return false;
5111 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5112 && !cfun->calls_alloca)
5113 return true;
5114 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5115 return true;
5117 return false;
5119 else
5121 /* If we decided that we didn't need a leaf frame pointer but then used
5122 LR in the function, then we'll want a frame pointer after all, so
5123 prevent this elimination to ensure a frame pointer is used. */
5124 if (to == STACK_POINTER_REGNUM
5125 && flag_omit_leaf_frame_pointer
5126 && df_regs_ever_live_p (LR_REGNUM))
5127 return false;
5130 return true;
5133 HOST_WIDE_INT
5134 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5136 aarch64_layout_frame ();
5138 if (to == HARD_FRAME_POINTER_REGNUM)
5140 if (from == ARG_POINTER_REGNUM)
5141 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
5143 if (from == FRAME_POINTER_REGNUM)
5144 return (cfun->machine->frame.hard_fp_offset
5145 - cfun->machine->frame.saved_varargs_size);
5148 if (to == STACK_POINTER_REGNUM)
5150 if (from == FRAME_POINTER_REGNUM)
5151 return (cfun->machine->frame.frame_size
5152 - cfun->machine->frame.saved_varargs_size);
5155 return cfun->machine->frame.frame_size;
5158 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5159 previous frame. */
5162 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5164 if (count != 0)
5165 return const0_rtx;
5166 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5170 static void
5171 aarch64_asm_trampoline_template (FILE *f)
5173 if (TARGET_ILP32)
5175 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5176 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5178 else
5180 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5181 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5183 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5184 assemble_aligned_integer (4, const0_rtx);
5185 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5186 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
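 /* The emitted trampoline is therefore roughly two literal loads into the
    IP1 and static-chain registers, a "br" through IP1, a 4-byte pad and
    two pointer-sized slots which aarch64_trampoline_init below fills with
    the target function address and the static chain value.  */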
5189 static void
5190 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5192 rtx fnaddr, mem, a_tramp;
5193 const int tramp_code_sz = 16;
5195 /* Don't need to copy the trailing D-words; we fill those in below. */
5196 emit_block_move (m_tramp, assemble_trampoline_template (),
5197 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5198 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5199 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5200 if (GET_MODE (fnaddr) != ptr_mode)
5201 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5202 emit_move_insn (mem, fnaddr);
5204 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5205 emit_move_insn (mem, chain_value);
5207 /* XXX We should really define a "clear_cache" pattern and use
5208 gen_clear_cache(). */
5209 a_tramp = XEXP (m_tramp, 0);
5210 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5211 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5212 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5213 ptr_mode);
5216 static unsigned char
5217 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5219 switch (regclass)
5221 case CALLER_SAVE_REGS:
5222 case POINTER_REGS:
5223 case GENERAL_REGS:
5224 case ALL_REGS:
5225 case FP_REGS:
5226 case FP_LO_REGS:
5227 return
5228 aarch64_vector_mode_p (mode)
5229 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5230 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5231 case STACK_REG:
5232 return 1;
5234 case NO_REGS:
5235 return 0;
5237 default:
5238 break;
5240 gcc_unreachable ();
5243 static reg_class_t
5244 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5246 if (regclass == POINTER_REGS)
5247 return GENERAL_REGS;
5249 if (regclass == STACK_REG)
5251 if (REG_P(x)
5252 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5253 return regclass;
5255 return NO_REGS;
5258 /* If it's an integer immediate that MOVI can't handle, then
5259 FP_REGS is not an option, so we return NO_REGS instead. */
5260 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5261 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5262 return NO_REGS;
5264 /* Register elimination can result in a request for
5265 SP+constant->FP_REGS. We cannot support such operations, which
5266 use SP as source and an FP_REG as destination, so reject such
5267 requests outright. */
5268 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5270 rtx lhs = XEXP (x, 0);
5272 /* Look through a possible SUBREG introduced by ILP32. */
5273 if (GET_CODE (lhs) == SUBREG)
5274 lhs = SUBREG_REG (lhs);
5276 gcc_assert (REG_P (lhs));
5277 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5278 POINTER_REGS));
5279 return NO_REGS;
5282 return regclass;
5285 void
5286 aarch64_asm_output_labelref (FILE* f, const char *name)
5288 asm_fprintf (f, "%U%s", name);
5291 static void
5292 aarch64_elf_asm_constructor (rtx symbol, int priority)
5294 if (priority == DEFAULT_INIT_PRIORITY)
5295 default_ctor_section_asm_out_constructor (symbol, priority);
5296 else
5298 section *s;
5299 char buf[18];
5300 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5301 s = get_section (buf, SECTION_WRITE, NULL);
5302 switch_to_section (s);
5303 assemble_align (POINTER_SIZE);
5304 assemble_aligned_integer (POINTER_BYTES, symbol);
5308 static void
5309 aarch64_elf_asm_destructor (rtx symbol, int priority)
5311 if (priority == DEFAULT_INIT_PRIORITY)
5312 default_dtor_section_asm_out_destructor (symbol, priority);
5313 else
5315 section *s;
5316 char buf[18];
5317 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5318 s = get_section (buf, SECTION_WRITE, NULL);
5319 switch_to_section (s);
5320 assemble_align (POINTER_SIZE);
5321 assemble_aligned_integer (POINTER_BYTES, symbol);
5325 const char*
5326 aarch64_output_casesi (rtx *operands)
5328 char buf[100];
5329 char label[100];
5330 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5331 int index;
5332 static const char *const patterns[4][2] =
5335 "ldrb\t%w3, [%0,%w1,uxtw]",
5336 "add\t%3, %4, %w3, sxtb #2"
5339 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5340 "add\t%3, %4, %w3, sxth #2"
5343 "ldr\t%w3, [%0,%w1,uxtw #2]",
5344 "add\t%3, %4, %w3, sxtw #2"
5346 /* We assume that DImode is only generated when not optimizing and
5347 that we don't really need 64-bit address offsets. That would
5348 imply an object file with 8GB of code in a single function! */
5350 "ldr\t%w3, [%0,%w1,uxtw #2]",
5351 "add\t%3, %4, %w3, sxtw #2"
5355 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5357 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5359 gcc_assert (index >= 0 && index <= 3);
5361 /* Need to implement table size reduction, by changing the code below. */
5362 output_asm_insn (patterns[index][0], operands);
5363 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5364 snprintf (buf, sizeof (buf),
5365 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5366 output_asm_insn (buf, operands);
5367 output_asm_insn (patterns[index][1], operands);
5368 output_asm_insn ("br\t%3", operands);
5369 assemble_label (asm_out_file, label);
5370 return "";
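 /* For a HImode dispatch table the sequence output here is along the lines
    of "ldrh w3, [x0, w1, uxtw #1]", "adr x4, Lrtx<N>",
    "add x3, x4, w3, sxth #2", "br x3", followed by the Lrtx<N> label
    itself, with <N> the internal label number (the exact label spelling is
    up to ASM_GENERATE_INTERNAL_LABEL).  */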
5374 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5375 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5376 operator. */
5379 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5381 if (shift >= 0 && shift <= 3)
5383 int size;
5384 for (size = 8; size <= 32; size *= 2)
5386 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5387 if (mask == bits << shift)
5388 return size;
5391 return 0;
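 /* E.g. aarch64_uxt_size (1, 0x1fe) is 8, since 0x1fe == 0xff << 1, which
    matches the UXTB form of an extended-register operand shifted by one;
    a mask that is not a shifted 0xff/0xffff/0xffffffff yields 0.  */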
5394 /* Constant pools are per-function only when PC-relative
5395 literal loads are enabled or we are in the large memory
5396 model. */
5398 static inline bool
5399 aarch64_can_use_per_function_literal_pools_p (void)
5401 return (!aarch64_nopcrelative_literal_loads
5402 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5405 static bool
5406 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5408 /* FIXME: In an ideal world this would work similarly
5409 to the logic in aarch64_select_rtx_section, but that
5410 breaks bootstrap in gccgo. For now we work around
5411 this by returning false here. */
5412 return false;
5415 /* Select appropriate section for constants depending
5416 on where we place literal pools. */
5418 static section *
5419 aarch64_select_rtx_section (machine_mode mode,
5420 rtx x,
5421 unsigned HOST_WIDE_INT align)
5423 if (aarch64_can_use_per_function_literal_pools_p ())
5424 return function_section (current_function_decl);
5426 return default_elf_select_rtx_section (mode, x, align);
5429 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5430 void
5431 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5432 HOST_WIDE_INT offset)
5434 /* When using per-function literal pools, we must ensure that any code
5435 section is aligned to the minimal instruction length, lest we get
5436 errors from the assembler about "unaligned instructions". */
5437 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5438 ASM_OUTPUT_ALIGN (f, 2);
5441 /* Costs. */
5443 /* Helper function for rtx cost calculation. Strip a shift expression
5444 from X. Returns the inner operand if successful, or the original
5445 expression on failure. */
5446 static rtx
5447 aarch64_strip_shift (rtx x)
5449 rtx op = x;
5451 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5452 we can convert both to ROR during final output. */
5453 if ((GET_CODE (op) == ASHIFT
5454 || GET_CODE (op) == ASHIFTRT
5455 || GET_CODE (op) == LSHIFTRT
5456 || GET_CODE (op) == ROTATERT
5457 || GET_CODE (op) == ROTATE)
5458 && CONST_INT_P (XEXP (op, 1)))
5459 return XEXP (op, 0);
5461 if (GET_CODE (op) == MULT
5462 && CONST_INT_P (XEXP (op, 1))
5463 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5464 return XEXP (op, 0);
5466 return x;
5469 /* Helper function for rtx cost calculation. Strip an extend
5470 expression from X. Returns the inner operand if successful, or the
5471 original expression on failure. We deal with a number of possible
5472 canonicalization variations here. */
5473 static rtx
5474 aarch64_strip_extend (rtx x)
5476 rtx op = x;
5478 /* Zero and sign extraction of a widened value. */
5479 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5480 && XEXP (op, 2) == const0_rtx
5481 && GET_CODE (XEXP (op, 0)) == MULT
5482 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5483 XEXP (op, 1)))
5484 return XEXP (XEXP (op, 0), 0);
5486 /* It can also be represented (for zero-extend) as an AND with an
5487 immediate. */
5488 if (GET_CODE (op) == AND
5489 && GET_CODE (XEXP (op, 0)) == MULT
5490 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5491 && CONST_INT_P (XEXP (op, 1))
5492 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5493 INTVAL (XEXP (op, 1))) != 0)
5494 return XEXP (XEXP (op, 0), 0);
5496 /* Now handle extended register, as this may also have an optional
5497 left shift by 1..4. */
5498 if (GET_CODE (op) == ASHIFT
5499 && CONST_INT_P (XEXP (op, 1))
5500 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5501 op = XEXP (op, 0);
5503 if (GET_CODE (op) == ZERO_EXTEND
5504 || GET_CODE (op) == SIGN_EXTEND)
5505 op = XEXP (op, 0);
5507 if (op != x)
5508 return op;
5510 return x;
5513 /* Return true iff CODE is a shift supported in combination
5514 with arithmetic instructions. */
5516 static bool
5517 aarch64_shift_p (enum rtx_code code)
5519 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5522 /* Helper function for rtx cost calculation. Calculate the cost of
5523 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5524 Return the calculated cost of the expression, recursing manually in to
5525 operands where needed. */
5527 static int
5528 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5530 rtx op0, op1;
5531 const struct cpu_cost_table *extra_cost
5532 = aarch64_tune_params.insn_extra_cost;
5533 int cost = 0;
5534 bool compound_p = (outer == PLUS || outer == MINUS);
5535 machine_mode mode = GET_MODE (x);
5537 gcc_checking_assert (code == MULT);
5539 op0 = XEXP (x, 0);
5540 op1 = XEXP (x, 1);
5542 if (VECTOR_MODE_P (mode))
5543 mode = GET_MODE_INNER (mode);
5545 /* Integer multiply/fma. */
5546 if (GET_MODE_CLASS (mode) == MODE_INT)
5548 /* The multiply will be canonicalized as a shift; cost it as such. */
5549 if (aarch64_shift_p (GET_CODE (x))
5550 || (CONST_INT_P (op1)
5551 && exact_log2 (INTVAL (op1)) > 0))
5553 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5554 || GET_CODE (op0) == SIGN_EXTEND;
5555 if (speed)
5557 if (compound_p)
5559 if (REG_P (op1))
5560 /* ARITH + shift-by-register. */
5561 cost += extra_cost->alu.arith_shift_reg;
5562 else if (is_extend)
5563 /* ARITH + extended register. We don't have a cost field
5564 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5565 cost += extra_cost->alu.extend_arith;
5566 else
5567 /* ARITH + shift-by-immediate. */
5568 cost += extra_cost->alu.arith_shift;
5570 else
5571 /* LSL (immediate). */
5572 cost += extra_cost->alu.shift;
5575 /* Strip extends as we will have costed them in the case above. */
5576 if (is_extend)
5577 op0 = aarch64_strip_extend (op0);
5579 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5581 return cost;
5584 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5585 compound and let the below cases handle it. After all, MNEG is a
5586 special-case alias of MSUB. */
5587 if (GET_CODE (op0) == NEG)
5589 op0 = XEXP (op0, 0);
5590 compound_p = true;
5593 /* Integer multiplies or FMAs have zero/sign extending variants. */
5594 if ((GET_CODE (op0) == ZERO_EXTEND
5595 && GET_CODE (op1) == ZERO_EXTEND)
5596 || (GET_CODE (op0) == SIGN_EXTEND
5597 && GET_CODE (op1) == SIGN_EXTEND))
5599 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5600 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5602 if (speed)
5604 if (compound_p)
5605 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5606 cost += extra_cost->mult[0].extend_add;
5607 else
5608 /* MUL/SMULL/UMULL. */
5609 cost += extra_cost->mult[0].extend;
5612 return cost;
5615 /* This is either an integer multiply or a MADD. In both cases
5616 we want to recurse and cost the operands. */
5617 cost += rtx_cost (op0, mode, MULT, 0, speed);
5618 cost += rtx_cost (op1, mode, MULT, 1, speed);
5620 if (speed)
5622 if (compound_p)
5623 /* MADD/MSUB. */
5624 cost += extra_cost->mult[mode == DImode].add;
5625 else
5626 /* MUL. */
5627 cost += extra_cost->mult[mode == DImode].simple;
5630 return cost;
5632 else
5634 if (speed)
5636 /* Floating-point FMA/FMUL can also support negations of the
5637 operands, unless the rounding mode is upward or downward in
5638 which case FNMUL is different from FMUL with operand negation. */
5639 bool neg0 = GET_CODE (op0) == NEG;
5640 bool neg1 = GET_CODE (op1) == NEG;
5641 if (compound_p || !flag_rounding_math || (neg0 && neg1))
5643 if (neg0)
5644 op0 = XEXP (op0, 0);
5645 if (neg1)
5646 op1 = XEXP (op1, 0);
5649 if (compound_p)
5650 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5651 cost += extra_cost->fp[mode == DFmode].fma;
5652 else
5653 /* FMUL/FNMUL. */
5654 cost += extra_cost->fp[mode == DFmode].mult;
5657 cost += rtx_cost (op0, mode, MULT, 0, speed);
5658 cost += rtx_cost (op1, mode, MULT, 1, speed);
5659 return cost;
5663 static int
5664 aarch64_address_cost (rtx x,
5665 machine_mode mode,
5666 addr_space_t as ATTRIBUTE_UNUSED,
5667 bool speed)
5669 enum rtx_code c = GET_CODE (x);
5670 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5671 struct aarch64_address_info info;
5672 int cost = 0;
5673 info.shift = 0;
5675 if (!aarch64_classify_address (&info, x, mode, c, false))
5677 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5679 /* This is a CONST or SYMBOL ref which will be split
5680 in a different way depending on the code model in use.
5681 Cost it through the generic infrastructure. */
5682 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5683 /* Divide through by the cost of one instruction to
5684 bring it to the same units as the address costs. */
5685 cost_symbol_ref /= COSTS_N_INSNS (1);
5686 /* The cost is then the cost of preparing the address,
5687 followed by an immediate (possibly 0) offset. */
5688 return cost_symbol_ref + addr_cost->imm_offset;
5690 else
5692 /* This is most likely a jump table from a case
5693 statement. */
5694 return addr_cost->register_offset;
5698 switch (info.type)
5700 case ADDRESS_LO_SUM:
5701 case ADDRESS_SYMBOLIC:
5702 case ADDRESS_REG_IMM:
5703 cost += addr_cost->imm_offset;
5704 break;
5706 case ADDRESS_REG_WB:
5707 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5708 cost += addr_cost->pre_modify;
5709 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5710 cost += addr_cost->post_modify;
5711 else
5712 gcc_unreachable ();
5714 break;
5716 case ADDRESS_REG_REG:
5717 cost += addr_cost->register_offset;
5718 break;
5720 case ADDRESS_REG_SXTW:
5721 cost += addr_cost->register_sextend;
5722 break;
5724 case ADDRESS_REG_UXTW:
5725 cost += addr_cost->register_zextend;
5726 break;
5728 default:
5729 gcc_unreachable ();
5733 if (info.shift > 0)
5735 /* For the sake of calculating the cost of the shifted register
5736 component, we can treat same sized modes in the same way. */
5737 switch (GET_MODE_BITSIZE (mode))
5739 case 16:
5740 cost += addr_cost->addr_scale_costs.hi;
5741 break;
5743 case 32:
5744 cost += addr_cost->addr_scale_costs.si;
5745 break;
5747 case 64:
5748 cost += addr_cost->addr_scale_costs.di;
5749 break;
5751 /* We can't tell, or this is a 128-bit vector. */
5752 default:
5753 cost += addr_cost->addr_scale_costs.ti;
5754 break;
5758 return cost;
5761 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5762 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5763 to be taken. */
5766 aarch64_branch_cost (bool speed_p, bool predictable_p)
5768 /* When optimizing for speed, use the cost of unpredictable branches. */
5769 const struct cpu_branch_cost *branch_costs =
5770 aarch64_tune_params.branch_costs;
5772 if (!speed_p || predictable_p)
5773 return branch_costs->predictable;
5774 else
5775 return branch_costs->unpredictable;
5778 /* Return true if the RTX X in mode MODE is a zero or sign extract
5779 usable in an ADD or SUB (extended register) instruction. */
5780 static bool
5781 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5783 /* Catch add with a sign extract.
5784 This is add_<optab><mode>_multp2. */
5785 if (GET_CODE (x) == SIGN_EXTRACT
5786 || GET_CODE (x) == ZERO_EXTRACT)
5788 rtx op0 = XEXP (x, 0);
5789 rtx op1 = XEXP (x, 1);
5790 rtx op2 = XEXP (x, 2);
5792 if (GET_CODE (op0) == MULT
5793 && CONST_INT_P (op1)
5794 && op2 == const0_rtx
5795 && CONST_INT_P (XEXP (op0, 1))
5796 && aarch64_is_extend_from_extract (mode,
5797 XEXP (op0, 1),
5798 op1))
5800 return true;
5803 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5804 No shift. */
5805 else if (GET_CODE (x) == SIGN_EXTEND
5806 || GET_CODE (x) == ZERO_EXTEND)
5807 return REG_P (XEXP (x, 0));
5809 return false;
5812 static bool
5813 aarch64_frint_unspec_p (unsigned int u)
5815 switch (u)
5817 case UNSPEC_FRINTZ:
5818 case UNSPEC_FRINTP:
5819 case UNSPEC_FRINTM:
5820 case UNSPEC_FRINTA:
5821 case UNSPEC_FRINTN:
5822 case UNSPEC_FRINTX:
5823 case UNSPEC_FRINTI:
5824 return true;
5826 default:
5827 return false;
5831 /* Return true iff X is an rtx that will match an extr instruction
5832 i.e. as described in the *extr<mode>5_insn family of patterns.
5833 OP0 and OP1 will be set to the operands of the shifts involved
5834 on success and will be NULL_RTX otherwise. */
5836 static bool
5837 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5839 rtx op0, op1;
5840 machine_mode mode = GET_MODE (x);
5842 *res_op0 = NULL_RTX;
5843 *res_op1 = NULL_RTX;
5845 if (GET_CODE (x) != IOR)
5846 return false;
5848 op0 = XEXP (x, 0);
5849 op1 = XEXP (x, 1);
5851 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5852 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5854 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5855 if (GET_CODE (op1) == ASHIFT)
5856 std::swap (op0, op1);
5858 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5859 return false;
5861 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5862 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5864 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5865 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5867 *res_op0 = XEXP (op0, 0);
5868 *res_op1 = XEXP (op1, 0);
5869 return true;
5873 return false;
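 /* For instance, in DImode (ior (ashift X (const_int 48))
    (lshiftrt Y (const_int 16))) matches: the shift amounts sum to 64, so
    *res_op0 becomes X and *res_op1 becomes Y, corresponding to an
    "extr Xd, X, Y, #16" style instruction.  */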
5876 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5877 storing it in *COST. Result is true if the total cost of the operation
5878 has now been calculated. */
5879 static bool
5880 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5882 rtx inner;
5883 rtx comparator;
5884 enum rtx_code cmpcode;
5886 if (COMPARISON_P (op0))
5888 inner = XEXP (op0, 0);
5889 comparator = XEXP (op0, 1);
5890 cmpcode = GET_CODE (op0);
5892 else
5894 inner = op0;
5895 comparator = const0_rtx;
5896 cmpcode = NE;
5899 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5901 /* Conditional branch. */
5902 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5903 return true;
5904 else
5906 if (cmpcode == NE || cmpcode == EQ)
5908 if (comparator == const0_rtx)
5910 /* TBZ/TBNZ/CBZ/CBNZ. */
5911 if (GET_CODE (inner) == ZERO_EXTRACT)
5912 /* TBZ/TBNZ. */
5913 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
5914 ZERO_EXTRACT, 0, speed);
5915 else
5916 /* CBZ/CBNZ. */
5917 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
5919 return true;
5922 else if (cmpcode == LT || cmpcode == GE)
5924 /* TBZ/TBNZ. */
5925 if (comparator == const0_rtx)
5926 return true;
5930 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5932 /* CCMP. */
5933 if (GET_CODE (op1) == COMPARE)
5935 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
5936 if (XEXP (op1, 1) == const0_rtx)
5937 *cost += 1;
5938 if (speed)
5940 machine_mode mode = GET_MODE (XEXP (op1, 0));
5941 const struct cpu_cost_table *extra_cost
5942 = aarch64_tune_params.insn_extra_cost;
5944 if (GET_MODE_CLASS (mode) == MODE_INT)
5945 *cost += extra_cost->alu.arith;
5946 else
5947 *cost += extra_cost->fp[mode == DFmode].compare;
5949 return true;
5952 /* It's a conditional operation based on the status flags,
5953 so it must be some flavor of CSEL. */
5955 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5956 if (GET_CODE (op1) == NEG
5957 || GET_CODE (op1) == NOT
5958 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5959 op1 = XEXP (op1, 0);
5960 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
5962 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
5963 op1 = XEXP (op1, 0);
5964 op2 = XEXP (op2, 0);
5967 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
5968 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
5969 return true;
5972 /* We don't know what this is, cost all operands. */
5973 return false;
5976 /* Check whether X is a bitfield operation of the form shift + extend that
5977 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
5978 operand to which the bitfield operation is applied. Otherwise return
5979 NULL_RTX. */
5981 static rtx
5982 aarch64_extend_bitfield_pattern_p (rtx x)
5984 rtx_code outer_code = GET_CODE (x);
5985 machine_mode outer_mode = GET_MODE (x);
5987 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
5988 && outer_mode != SImode && outer_mode != DImode)
5989 return NULL_RTX;
5991 rtx inner = XEXP (x, 0);
5992 rtx_code inner_code = GET_CODE (inner);
5993 machine_mode inner_mode = GET_MODE (inner);
5994 rtx op = NULL_RTX;
5996 switch (inner_code)
5998 case ASHIFT:
5999 if (CONST_INT_P (XEXP (inner, 1))
6000 && (inner_mode == QImode || inner_mode == HImode))
6001 op = XEXP (inner, 0);
6002 break;
6003 case LSHIFTRT:
6004 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6005 && (inner_mode == QImode || inner_mode == HImode))
6006 op = XEXP (inner, 0);
6007 break;
6008 case ASHIFTRT:
6009 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6010 && (inner_mode == QImode || inner_mode == HImode))
6011 op = XEXP (inner, 0);
6012 break;
6013 default:
6014 break;
6017 return op;
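 /* E.g. (zero_extend:SI (lshiftrt:HI (reg:HI r) (const_int 3))) is
    recognised here and returns the inner register, since the combination
    maps onto a single UBFX of the low bits; the analogous sign-extending
    forms map onto SBFX/SBFIZ.  */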
6020 /* Calculate the cost of calculating X, storing it in *COST. Result
6021 is true if the total cost of the operation has now been calculated. */
6022 static bool
6023 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6024 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6026 rtx op0, op1, op2;
6027 const struct cpu_cost_table *extra_cost
6028 = aarch64_tune_params.insn_extra_cost;
6029 int code = GET_CODE (x);
6031 /* By default, assume that everything has equivalent cost to the
6032 cheapest instruction. Any additional costs are applied as a delta
6033 above this default. */
6034 *cost = COSTS_N_INSNS (1);
6036 switch (code)
6038 case SET:
6039 /* The cost depends entirely on the operands to SET. */
6040 *cost = 0;
6041 op0 = SET_DEST (x);
6042 op1 = SET_SRC (x);
6044 switch (GET_CODE (op0))
6046 case MEM:
6047 if (speed)
6049 rtx address = XEXP (op0, 0);
6050 if (VECTOR_MODE_P (mode))
6051 *cost += extra_cost->ldst.storev;
6052 else if (GET_MODE_CLASS (mode) == MODE_INT)
6053 *cost += extra_cost->ldst.store;
6054 else if (mode == SFmode)
6055 *cost += extra_cost->ldst.storef;
6056 else if (mode == DFmode)
6057 *cost += extra_cost->ldst.stored;
6059 *cost +=
6060 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6061 0, speed));
6064 *cost += rtx_cost (op1, mode, SET, 1, speed);
6065 return true;
6067 case SUBREG:
6068 if (! REG_P (SUBREG_REG (op0)))
6069 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6071 /* Fall through. */
6072 case REG:
6073 /* The cost is one per vector-register copied. */
6074 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6076 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6077 / GET_MODE_SIZE (V4SImode);
6078 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6080 /* const0_rtx is in general free, but we will use an
6081 instruction to set a register to 0. */
6082 else if (REG_P (op1) || op1 == const0_rtx)
6084 /* The cost is 1 per register copied. */
6085 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6086 / UNITS_PER_WORD;
6087 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6089 else
6090 /* Cost is just the cost of the RHS of the set. */
6091 *cost += rtx_cost (op1, mode, SET, 1, speed);
6092 return true;
6094 case ZERO_EXTRACT:
6095 case SIGN_EXTRACT:
6096 /* Bit-field insertion. Strip any redundant widening of
6097 the RHS to meet the width of the target. */
6098 if (GET_CODE (op1) == SUBREG)
6099 op1 = SUBREG_REG (op1);
6100 if ((GET_CODE (op1) == ZERO_EXTEND
6101 || GET_CODE (op1) == SIGN_EXTEND)
6102 && CONST_INT_P (XEXP (op0, 1))
6103 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6104 >= INTVAL (XEXP (op0, 1))))
6105 op1 = XEXP (op1, 0);
6107 if (CONST_INT_P (op1))
6109 /* MOV immediate is assumed to always be cheap. */
6110 *cost = COSTS_N_INSNS (1);
6112 else
6114 /* BFM. */
6115 if (speed)
6116 *cost += extra_cost->alu.bfi;
6117 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6120 return true;
6122 default:
6123 /* We can't make sense of this, assume default cost. */
6124 *cost = COSTS_N_INSNS (1);
6125 return false;
6127 return false;
6129 case CONST_INT:
6130 /* If an instruction can incorporate a constant within the
6131 instruction, the instruction's expression avoids calling
6132 rtx_cost() on the constant. If rtx_cost() is called on a
6133 constant, then it is usually because the constant must be
6134 moved into a register by one or more instructions.
6136 The exception is constant 0, which can be expressed
6137 as XZR/WZR and is therefore free. The exception to this is
6138 if we have (set (reg) (const0_rtx)) in which case we must cost
6139 the move. However, we can catch that when we cost the SET, so
6140 we don't need to consider that here. */
6141 if (x == const0_rtx)
6142 *cost = 0;
6143 else
6145 /* To an approximation, building any other constant is
6146 proportionally expensive to the number of instructions
6147 required to build that constant. This is true whether we
6148 are compiling for SPEED or otherwise. */
6149 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6150 (NULL_RTX, x, false, mode));
6152 return true;
6154 case CONST_DOUBLE:
6155 if (speed)
6157 /* mov[df,sf]_aarch64. */
6158 if (aarch64_float_const_representable_p (x))
6159 /* FMOV (scalar immediate). */
6160 *cost += extra_cost->fp[mode == DFmode].fpconst;
6161 else if (!aarch64_float_const_zero_rtx_p (x))
6163 /* This will be a load from memory. */
6164 if (mode == DFmode)
6165 *cost += extra_cost->ldst.loadd;
6166 else
6167 *cost += extra_cost->ldst.loadf;
6169 else
6170 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6171 or MOV v0.s[0], wzr - neither of which is modeled by the
6172 cost tables. Just use the default cost. */
6177 return true;
6179 case MEM:
6180 if (speed)
6182 /* For loads we want the base cost of a load, plus an
6183 approximation for the additional cost of the addressing
6184 mode. */
6185 rtx address = XEXP (x, 0);
6186 if (VECTOR_MODE_P (mode))
6187 *cost += extra_cost->ldst.loadv;
6188 else if (GET_MODE_CLASS (mode) == MODE_INT)
6189 *cost += extra_cost->ldst.load;
6190 else if (mode == SFmode)
6191 *cost += extra_cost->ldst.loadf;
6192 else if (mode == DFmode)
6193 *cost += extra_cost->ldst.loadd;
6195 *cost +=
6196 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6197 0, speed));
6200 return true;
6202 case NEG:
6203 op0 = XEXP (x, 0);
6205 if (VECTOR_MODE_P (mode))
6207 if (speed)
6209 /* FNEG. */
6210 *cost += extra_cost->vect.alu;
6212 return false;
6215 if (GET_MODE_CLASS (mode) == MODE_INT)
6217 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6218 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6220 /* CSETM. */
6221 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6222 return true;
6225 /* Cost this as SUB wzr, X. */
6226 op0 = CONST0_RTX (mode);
6227 op1 = XEXP (x, 0);
6228 goto cost_minus;
6231 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6233 /* Support (neg(fma...)) as a single instruction only if
6234 sign of zeros is unimportant. This matches the decision
6235 making in aarch64.md. */
6236 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6238 /* FNMADD. */
6239 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6240 return true;
6242 if (GET_CODE (op0) == MULT)
6244 /* FNMUL. */
6245 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6246 return true;
6248 if (speed)
6249 /* FNEG. */
6250 *cost += extra_cost->fp[mode == DFmode].neg;
6251 return false;
6254 return false;
6256 case CLRSB:
6257 case CLZ:
6258 if (speed)
6260 if (VECTOR_MODE_P (mode))
6261 *cost += extra_cost->vect.alu;
6262 else
6263 *cost += extra_cost->alu.clz;
6266 return false;
6268 case COMPARE:
6269 op0 = XEXP (x, 0);
6270 op1 = XEXP (x, 1);
6272 if (op1 == const0_rtx
6273 && GET_CODE (op0) == AND)
6275 x = op0;
6276 mode = GET_MODE (op0);
6277 goto cost_logic;
6280 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6282 /* TODO: A write to the CC flags possibly costs extra, this
6283 needs encoding in the cost tables. */
6285 mode = GET_MODE (op0);
6286 /* ANDS. */
6287 if (GET_CODE (op0) == AND)
6289 x = op0;
6290 goto cost_logic;
6293 if (GET_CODE (op0) == PLUS)
6295 /* ADDS (and CMN alias). */
6296 x = op0;
6297 goto cost_plus;
6300 if (GET_CODE (op0) == MINUS)
6302 /* SUBS. */
6303 x = op0;
6304 goto cost_minus;
6307 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6308 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6309 && CONST_INT_P (XEXP (op0, 2)))
6311 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6312 Handle it here directly rather than going to cost_logic
6313 since we know the immediate generated for the TST is valid
6314 so we can avoid creating an intermediate rtx for it only
6315 for costing purposes. */
6316 if (speed)
6317 *cost += extra_cost->alu.logical;
6319 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6320 ZERO_EXTRACT, 0, speed);
6321 return true;
6324 if (GET_CODE (op1) == NEG)
6326 /* CMN. */
6327 if (speed)
6328 *cost += extra_cost->alu.arith;
6330 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6331 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6332 return true;
6335 /* CMP.
6337 Compare can freely swap the order of operands, and
6338 canonicalization puts the more complex operation first.
6339 But the integer MINUS logic expects the shift/extend
6340 operation in op1. */
6341 if (! (REG_P (op0)
6342 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6344 op0 = XEXP (x, 1);
6345 op1 = XEXP (x, 0);
6347 goto cost_minus;
6350 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6352 /* FCMP. */
6353 if (speed)
6354 *cost += extra_cost->fp[mode == DFmode].compare;
6356 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6358 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6359 /* FCMP supports constant 0.0 for no extra cost. */
6360 return true;
6362 return false;
6365 if (VECTOR_MODE_P (mode))
6367 /* Vector compare. */
6368 if (speed)
6369 *cost += extra_cost->vect.alu;
6371 if (aarch64_float_const_zero_rtx_p (op1))
6373 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6374 cost. */
6375 return true;
6377 return false;
6379 return false;
6381 case MINUS:
6383 op0 = XEXP (x, 0);
6384 op1 = XEXP (x, 1);
6386 cost_minus:
6387 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6389 /* Detect valid immediates. */
6390 if ((GET_MODE_CLASS (mode) == MODE_INT
6391 || (GET_MODE_CLASS (mode) == MODE_CC
6392 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6393 && CONST_INT_P (op1)
6394 && aarch64_uimm12_shift (INTVAL (op1)))
6396 if (speed)
6397 /* SUB(S) (immediate). */
6398 *cost += extra_cost->alu.arith;
6399 return true;
6402 /* Look for SUB (extended register). */
6403 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6405 if (speed)
6406 *cost += extra_cost->alu.extend_arith;
6408 op1 = aarch64_strip_extend (op1);
6409 *cost += rtx_cost (op1, VOIDmode,
6410 (enum rtx_code) GET_CODE (op1), 0, speed);
6411 return true;
6414 rtx new_op1 = aarch64_strip_extend (op1);
6416 /* Cost this as an FMA-alike operation. */
6417 if ((GET_CODE (new_op1) == MULT
6418 || aarch64_shift_p (GET_CODE (new_op1)))
6419 && code != COMPARE)
6421 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6422 (enum rtx_code) code,
6423 speed);
6424 return true;
6427 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6429 if (speed)
6431 if (VECTOR_MODE_P (mode))
6433 /* Vector SUB. */
6434 *cost += extra_cost->vect.alu;
6436 else if (GET_MODE_CLASS (mode) == MODE_INT)
6438 /* SUB(S). */
6439 *cost += extra_cost->alu.arith;
6441 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6443 /* FSUB. */
6444 *cost += extra_cost->fp[mode == DFmode].addsub;
6447 return true;
6450 case PLUS:
6452 rtx new_op0;
6454 op0 = XEXP (x, 0);
6455 op1 = XEXP (x, 1);
6457 cost_plus:
6458 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6459 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6461 /* CSINC. */
6462 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6463 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6464 return true;
6467 if (GET_MODE_CLASS (mode) == MODE_INT
6468 && CONST_INT_P (op1)
6469 && aarch64_uimm12_shift (INTVAL (op1)))
6471 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6473 if (speed)
6474 /* ADD (immediate). */
6475 *cost += extra_cost->alu.arith;
6476 return true;
6479 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6481 /* Look for ADD (extended register). */
6482 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6484 if (speed)
6485 *cost += extra_cost->alu.extend_arith;
6487 op0 = aarch64_strip_extend (op0);
6488 *cost += rtx_cost (op0, VOIDmode,
6489 (enum rtx_code) GET_CODE (op0), 0, speed);
6490 return true;
6493 /* Strip any extend, leave shifts behind as we will
6494 cost them through mult_cost. */
6495 new_op0 = aarch64_strip_extend (op0);
6497 if (GET_CODE (new_op0) == MULT
6498 || aarch64_shift_p (GET_CODE (new_op0)))
6500 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6501 speed);
6502 return true;
6505 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6507 if (speed)
6509 if (VECTOR_MODE_P (mode))
6511 /* Vector ADD. */
6512 *cost += extra_cost->vect.alu;
6514 else if (GET_MODE_CLASS (mode) == MODE_INT)
6516 /* ADD. */
6517 *cost += extra_cost->alu.arith;
6519 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6521 /* FADD. */
6522 *cost += extra_cost->fp[mode == DFmode].addsub;
6525 return true;
6528 case BSWAP:
6529 *cost = COSTS_N_INSNS (1);
6531 if (speed)
6533 if (VECTOR_MODE_P (mode))
6534 *cost += extra_cost->vect.alu;
6535 else
6536 *cost += extra_cost->alu.rev;
6538 return false;
6540 case IOR:
6541 if (aarch_rev16_p (x))
6543 *cost = COSTS_N_INSNS (1);
6545 if (speed)
6547 if (VECTOR_MODE_P (mode))
6548 *cost += extra_cost->vect.alu;
6549 else
6550 *cost += extra_cost->alu.rev;
6552 return true;
6555 if (aarch64_extr_rtx_p (x, &op0, &op1))
6557 *cost += rtx_cost (op0, mode, IOR, 0, speed);
6558 *cost += rtx_cost (op1, mode, IOR, 1, speed);
6559 if (speed)
6560 *cost += extra_cost->alu.shift;
6562 return true;
6564 /* Fall through. */
6565 case XOR:
6566 case AND:
6567 cost_logic:
6568 op0 = XEXP (x, 0);
6569 op1 = XEXP (x, 1);
6571 if (VECTOR_MODE_P (mode))
6573 if (speed)
6574 *cost += extra_cost->vect.alu;
6575 return true;
6578 if (code == AND
6579 && GET_CODE (op0) == MULT
6580 && CONST_INT_P (XEXP (op0, 1))
6581 && CONST_INT_P (op1)
6582 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6583 INTVAL (op1)) != 0)
6585 /* This is a UBFM/SBFM. */
6586 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6587 if (speed)
6588 *cost += extra_cost->alu.bfx;
6589 return true;
6592 if (GET_MODE_CLASS (mode) == MODE_INT)
6594 /* We possibly get the immediate for free, this is not
6595 modelled. */
6596 if (CONST_INT_P (op1)
6597 && aarch64_bitmask_imm (INTVAL (op1), mode))
6599 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6601 if (speed)
6602 *cost += extra_cost->alu.logical;
6604 return true;
6606 else
6608 rtx new_op0 = op0;
6610 /* Handle ORN, EON, or BIC. */
6611 if (GET_CODE (op0) == NOT)
6612 op0 = XEXP (op0, 0);
6614 new_op0 = aarch64_strip_shift (op0);
6616 /* If we had a shift on op0 then this is a logical-shift-
6617 by-register/immediate operation. Otherwise, this is just
6618 a logical operation. */
6619 if (speed)
6621 if (new_op0 != op0)
6623 /* Shift by immediate. */
6624 if (CONST_INT_P (XEXP (op0, 1)))
6625 *cost += extra_cost->alu.log_shift;
6626 else
6627 *cost += extra_cost->alu.log_shift_reg;
6629 else
6630 *cost += extra_cost->alu.logical;
6633 /* In both cases we want to cost both operands. */
6634 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6635 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6637 return true;
6640 return false;
6642 case NOT:
6643 x = XEXP (x, 0);
6644 op0 = aarch64_strip_shift (x);
6646 if (VECTOR_MODE_P (mode))
6648 /* Vector NOT. */
6649 *cost += extra_cost->vect.alu;
6650 return false;
6653 /* MVN-shifted-reg. */
6654 if (op0 != x)
6656 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6658 if (speed)
6659 *cost += extra_cost->alu.log_shift;
6661 return true;
6663 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6664 Handle the second form here taking care that 'a' in the above can
6665 be a shift. */
6666 else if (GET_CODE (op0) == XOR)
6668 rtx newop0 = XEXP (op0, 0);
6669 rtx newop1 = XEXP (op0, 1);
6670 rtx op0_stripped = aarch64_strip_shift (newop0);
6672 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6673 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6675 if (speed)
6677 if (op0_stripped != newop0)
6678 *cost += extra_cost->alu.log_shift;
6679 else
6680 *cost += extra_cost->alu.logical;
6683 return true;
6685 /* MVN. */
6686 if (speed)
6687 *cost += extra_cost->alu.logical;
6689 return false;
6691 case ZERO_EXTEND:
6693 op0 = XEXP (x, 0);
6694 /* If a value is written in SI mode, then zero extended to DI
6695 mode, the operation will in general be free as a write to
6696 a 'w' register implicitly zeroes the upper bits of an 'x'
6697 register. However, if this is
6699 (set (reg) (zero_extend (reg)))
6701 we must cost the explicit register move. */
6702 if (mode == DImode
6703 && GET_MODE (op0) == SImode
6704 && outer == SET)
6706 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6708 if (!op_cost && speed)
6709 /* MOV. */
6710 *cost += extra_cost->alu.extend;
6711 else
6712 /* Free, the cost is that of the SI mode operation. */
6713 *cost = op_cost;
6715 return true;
6717 else if (MEM_P (op0))
6719 /* All loads can zero extend to any size for free. */
6720 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6721 return true;
6724 op0 = aarch64_extend_bitfield_pattern_p (x);
6725 if (op0)
6727 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
6728 if (speed)
6729 *cost += extra_cost->alu.bfx;
6730 return true;
6733 if (speed)
6735 if (VECTOR_MODE_P (mode))
6737 /* UMOV. */
6738 *cost += extra_cost->vect.alu;
6740 else
6742 /* UXTB/UXTH. */
6743 *cost += extra_cost->alu.extend;
6746 return false;
6748 case SIGN_EXTEND:
6749 if (MEM_P (XEXP (x, 0)))
6751 /* LDRSH. */
6752 if (speed)
6754 rtx address = XEXP (XEXP (x, 0), 0);
6755 *cost += extra_cost->ldst.load_sign_extend;
6757 *cost +=
6758 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6759 0, speed));
6761 return true;
6764 op0 = aarch64_extend_bitfield_pattern_p (x);
6765 if (op0)
6767 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
6768 if (speed)
6769 *cost += extra_cost->alu.bfx;
6770 return true;
6773 if (speed)
6775 if (VECTOR_MODE_P (mode))
6776 *cost += extra_cost->vect.alu;
6777 else
6778 *cost += extra_cost->alu.extend;
6780 return false;
6782 case ASHIFT:
6783 op0 = XEXP (x, 0);
6784 op1 = XEXP (x, 1);
6786 if (CONST_INT_P (op1))
6788 if (speed)
6790 if (VECTOR_MODE_P (mode))
6792 /* Vector shift (immediate). */
6793 *cost += extra_cost->vect.alu;
6795 else
6797 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6798 aliases. */
6799 *cost += extra_cost->alu.shift;
6803 /* We can incorporate zero/sign extend for free. */
6804 if (GET_CODE (op0) == ZERO_EXTEND
6805 || GET_CODE (op0) == SIGN_EXTEND)
6806 op0 = XEXP (op0, 0);
6808 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
6809 return true;
6811 else
6813 if (speed)
6815 if (VECTOR_MODE_P (mode))
6817 /* Vector shift (register). */
6818 *cost += extra_cost->vect.alu;
6820 else
6822 /* LSLV. */
6823 *cost += extra_cost->alu.shift_reg;
6826 return false; /* All arguments need to be in registers. */
6829 case ROTATE:
6830 case ROTATERT:
6831 case LSHIFTRT:
6832 case ASHIFTRT:
6833 op0 = XEXP (x, 0);
6834 op1 = XEXP (x, 1);
6836 if (CONST_INT_P (op1))
6838 /* ASR (immediate) and friends. */
6839 if (speed)
6841 if (VECTOR_MODE_P (mode))
6842 *cost += extra_cost->vect.alu;
6843 else
6844 *cost += extra_cost->alu.shift;
6847 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6848 return true;
6850 else
6853 /* ASR (register) and friends. */
6854 if (speed)
6856 if (VECTOR_MODE_P (mode))
6857 *cost += extra_cost->vect.alu;
6858 else
6859 *cost += extra_cost->alu.shift_reg;
6861 return false; /* All arguments need to be in registers. */
6864 case SYMBOL_REF:
6866 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
6867 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
6869 /* LDR. */
6870 if (speed)
6871 *cost += extra_cost->ldst.load;
6873 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6874 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6876 /* ADRP, followed by ADD. */
6877 *cost += COSTS_N_INSNS (1);
6878 if (speed)
6879 *cost += 2 * extra_cost->alu.arith;
6881 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6882 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6884 /* ADR. */
6885 if (speed)
6886 *cost += extra_cost->alu.arith;
6889 if (flag_pic)
6891 /* One extra load instruction, after accessing the GOT. */
6892 *cost += COSTS_N_INSNS (1);
6893 if (speed)
6894 *cost += extra_cost->ldst.load;
6896 return true;
6898 case HIGH:
6899 case LO_SUM:
6900 /* ADRP/ADD (immediate). */
6901 if (speed)
6902 *cost += extra_cost->alu.arith;
6903 return true;
6905 case ZERO_EXTRACT:
6906 case SIGN_EXTRACT:
6907 /* UBFX/SBFX. */
6908 if (speed)
6910 if (VECTOR_MODE_P (mode))
6911 *cost += extra_cost->vect.alu;
6912 else
6913 *cost += extra_cost->alu.bfx;
6916 /* We can trust that the immediates used will be correct (there
6917 are no by-register forms), so we need only cost op0. */
6918 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
6919 return true;
6921 case MULT:
6922 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6923 /* aarch64_rtx_mult_cost always handles recursion to its
6924 operands. */
6925 return true;
6927 case MOD:
6928 /* We can expand signed mod by power of 2 using a NEGS, two parallel
6929 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
6930 an unconditional negate. This case should only ever be reached through
6931 the set_smod_pow2_cheap check in expmed.c. */
6932 if (CONST_INT_P (XEXP (x, 1))
6933 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
6934 && (mode == SImode || mode == DImode))
6936 /* We expand to 4 instructions. Reset the baseline. */
6937 *cost = COSTS_N_INSNS (4);
6939 if (speed)
6940 *cost += 2 * extra_cost->alu.logical
6941 + 2 * extra_cost->alu.arith;
6943 return true;
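/* For example (an illustrative reading of the code above, not a separate
   cost table entry): a signed x % 8 in SImode is costed as 4 insns, the
   NEGS/AND/AND/CSNEG sequence described in the comment, plus two logical
   and two arithmetic extra costs when optimizing for speed.  */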
6946 /* Fall through. */
6947 case UMOD:
6948 if (speed)
6950 if (VECTOR_MODE_P (mode))
6951 *cost += extra_cost->vect.alu;
6952 else if (GET_MODE_CLASS (mode) == MODE_INT)
6953 *cost += (extra_cost->mult[mode == DImode].add
6954 + extra_cost->mult[mode == DImode].idiv);
6955 else if (mode == DFmode)
6956 *cost += (extra_cost->fp[1].mult
6957 + extra_cost->fp[1].div);
6958 else if (mode == SFmode)
6959 *cost += (extra_cost->fp[0].mult
6960 + extra_cost->fp[0].div);
6962 return false; /* All arguments need to be in registers. */
6964 case DIV:
6965 case UDIV:
6966 case SQRT:
6967 if (speed)
6969 if (VECTOR_MODE_P (mode))
6970 *cost += extra_cost->vect.alu;
6971 else if (GET_MODE_CLASS (mode) == MODE_INT)
6972 /* There is no integer SQRT, so only DIV and UDIV can get
6973 here. */
6974 *cost += extra_cost->mult[mode == DImode].idiv;
6975 else
6976 *cost += extra_cost->fp[mode == DFmode].div;
6978 return false; /* All arguments need to be in registers. */
6980 case IF_THEN_ELSE:
6981 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6982 XEXP (x, 2), cost, speed);
6984 case EQ:
6985 case NE:
6986 case GT:
6987 case GTU:
6988 case LT:
6989 case LTU:
6990 case GE:
6991 case GEU:
6992 case LE:
6993 case LEU:
6995 return false; /* All arguments must be in registers. */
6997 case FMA:
6998 op0 = XEXP (x, 0);
6999 op1 = XEXP (x, 1);
7000 op2 = XEXP (x, 2);
7002 if (speed)
7004 if (VECTOR_MODE_P (mode))
7005 *cost += extra_cost->vect.alu;
7006 else
7007 *cost += extra_cost->fp[mode == DFmode].fma;
7010 /* FMSUB, FNMADD, and FNMSUB are free. */
7011 if (GET_CODE (op0) == NEG)
7012 op0 = XEXP (op0, 0);
7014 if (GET_CODE (op2) == NEG)
7015 op2 = XEXP (op2, 0);
7017 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7018 and the by-element operand as operand 0. */
7019 if (GET_CODE (op1) == NEG)
7020 op1 = XEXP (op1, 0);
7022 /* Catch vector-by-element operations. The by-element operand can
7023 either be (vec_duplicate (vec_select (x))) or just
7024 (vec_select (x)), depending on whether we are multiplying by
7025 a vector or a scalar.
7027 Canonicalization is not very good in these cases: FMA4 will put the
7028 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7029 if (GET_CODE (op0) == VEC_DUPLICATE)
7030 op0 = XEXP (op0, 0);
7031 else if (GET_CODE (op1) == VEC_DUPLICATE)
7032 op1 = XEXP (op1, 0);
7034 if (GET_CODE (op0) == VEC_SELECT)
7035 op0 = XEXP (op0, 0);
7036 else if (GET_CODE (op1) == VEC_SELECT)
7037 op1 = XEXP (op1, 0);
7039 /* If the remaining parameters are not registers,
7040 get the cost to put them into registers. */
7041 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7042 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7043 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7044 return true;
7046 case FLOAT:
7047 case UNSIGNED_FLOAT:
7048 if (speed)
7049 *cost += extra_cost->fp[mode == DFmode].fromint;
7050 return false;
7052 case FLOAT_EXTEND:
7053 if (speed)
7055 if (VECTOR_MODE_P (mode))
7057 /* Vector widen. */
7058 *cost += extra_cost->vect.alu;
7060 else
7061 *cost += extra_cost->fp[mode == DFmode].widen;
7063 return false;
7065 case FLOAT_TRUNCATE:
7066 if (speed)
7068 if (VECTOR_MODE_P (mode))
7070 /* Vector conversion. */
7071 *cost += extra_cost->vect.alu;
7073 else
7074 *cost += extra_cost->fp[mode == DFmode].narrow;
7076 return false;
7078 case FIX:
7079 case UNSIGNED_FIX:
7080 x = XEXP (x, 0);
7081 /* Strip the rounding part. They will all be implemented
7082 by the fcvt* family of instructions anyway. */
7083 if (GET_CODE (x) == UNSPEC)
7085 unsigned int uns_code = XINT (x, 1);
7087 if (uns_code == UNSPEC_FRINTA
7088 || uns_code == UNSPEC_FRINTM
7089 || uns_code == UNSPEC_FRINTN
7090 || uns_code == UNSPEC_FRINTP
7091 || uns_code == UNSPEC_FRINTZ)
7092 x = XVECEXP (x, 0, 0);
7095 if (speed)
7097 if (VECTOR_MODE_P (mode))
7098 *cost += extra_cost->vect.alu;
7099 else
7100 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7103 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7104 fixed-point fcvt. */
7105 if (GET_CODE (x) == MULT
7106 && ((VECTOR_MODE_P (mode)
7107 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7108 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7110 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7111 0, speed);
7112 return true;
7115 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7116 return true;
7118 case ABS:
7119 if (VECTOR_MODE_P (mode))
7121 /* ABS (vector). */
7122 if (speed)
7123 *cost += extra_cost->vect.alu;
7125 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7127 op0 = XEXP (x, 0);
7129 /* FABD, which is analogous to FADD. */
7130 if (GET_CODE (op0) == MINUS)
7132 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7133 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7134 if (speed)
7135 *cost += extra_cost->fp[mode == DFmode].addsub;
7137 return true;
7139 /* Simple FABS is analogous to FNEG. */
7140 if (speed)
7141 *cost += extra_cost->fp[mode == DFmode].neg;
7143 else
7145 /* Integer ABS will either be split to
7146 two arithmetic instructions, or will be an ABS
7147 (scalar), which we don't model. */
7148 *cost = COSTS_N_INSNS (2);
7149 if (speed)
7150 *cost += 2 * extra_cost->alu.arith;
7152 return false;
7154 case SMAX:
7155 case SMIN:
7156 if (speed)
7158 if (VECTOR_MODE_P (mode))
7159 *cost += extra_cost->vect.alu;
7160 else
7162 /* FMAXNM/FMINNM/FMAX/FMIN.
7163 TODO: This may not be accurate for all implementations, but
7164 we do not model this in the cost tables. */
7165 *cost += extra_cost->fp[mode == DFmode].addsub;
7168 return false;
7170 case UNSPEC:
7171 /* The floating point round to integer frint* instructions. */
7172 if (aarch64_frint_unspec_p (XINT (x, 1)))
7174 if (speed)
7175 *cost += extra_cost->fp[mode == DFmode].roundint;
7177 return false;
7180 if (XINT (x, 1) == UNSPEC_RBIT)
7182 if (speed)
7183 *cost += extra_cost->alu.rev;
7185 return false;
7187 break;
7189 case TRUNCATE:
7191 /* Decompose <su>muldi3_highpart. */
7192 if (/* (truncate:DI */
7193 mode == DImode
7194 /* (lshiftrt:TI */
7195 && GET_MODE (XEXP (x, 0)) == TImode
7196 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7197 /* (mult:TI */
7198 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7199 /* (ANY_EXTEND:TI (reg:DI))
7200 (ANY_EXTEND:TI (reg:DI))) */
7201 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7202 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7203 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7204 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7205 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7206 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7207 /* (const_int 64) */
7208 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7209 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7211 /* UMULH/SMULH. */
7212 if (speed)
7213 *cost += extra_cost->mult[mode == DImode].extend;
7214 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7215 mode, MULT, 0, speed);
7216 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7217 mode, MULT, 1, speed);
7218 return true;
7221 /* Fall through. */
7222 default:
7223 break;
7226 if (dump_file && (dump_flags & TDF_DETAILS))
7227 fprintf (dump_file,
7228 "\nFailed to cost RTX. Assuming default cost.\n");
7230 return true;
7233 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7234 calculated for X. This cost is stored in *COST. Returns true
7235 if the total cost of X was calculated. */
7236 static bool
7237 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7238 int param, int *cost, bool speed)
7240 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7242 if (dump_file && (dump_flags & TDF_DETAILS))
7244 print_rtl_single (dump_file, x);
7245 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7246 speed ? "Hot" : "Cold",
7247 *cost, result ? "final" : "partial");
7250 return result;
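/* For illustration (format taken from the fprintf above): with a pass dump
   enabled at TDF_DETAILS, a fully-costed expression compiled for speed
   would be followed by a line such as

     Hot cost: 4 (final)

   printed after the RTL of the expression.  */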
7253 static int
7254 aarch64_register_move_cost (machine_mode mode,
7255 reg_class_t from_i, reg_class_t to_i)
7257 enum reg_class from = (enum reg_class) from_i;
7258 enum reg_class to = (enum reg_class) to_i;
7259 const struct cpu_regmove_cost *regmove_cost
7260 = aarch64_tune_params.regmove_cost;
7262 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7263 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7264 to = GENERAL_REGS;
7266 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7267 from = GENERAL_REGS;
7269 /* Moving between GPR and stack cost is the same as GP2GP. */
7270 if ((from == GENERAL_REGS && to == STACK_REG)
7271 || (to == GENERAL_REGS && from == STACK_REG))
7272 return regmove_cost->GP2GP;
7274 /* To/From the stack register, we move via the gprs. */
7275 if (to == STACK_REG || from == STACK_REG)
7276 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7277 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7279 if (GET_MODE_SIZE (mode) == 16)
7281 /* 128-bit operations on general registers require 2 instructions. */
7282 if (from == GENERAL_REGS && to == GENERAL_REGS)
7283 return regmove_cost->GP2GP * 2;
7284 else if (from == GENERAL_REGS)
7285 return regmove_cost->GP2FP * 2;
7286 else if (to == GENERAL_REGS)
7287 return regmove_cost->FP2GP * 2;
7289 /* When AdvSIMD instructions are disabled it is not possible to move
7290 a 128-bit value directly between Q registers. This is handled in
7291 secondary reload. A general register is used as a scratch to move
7292 the upper DI value and the lower DI value is moved directly,
7293 hence the cost is the sum of three moves. */
7294 if (! TARGET_SIMD)
7295 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7297 return regmove_cost->FP2FP;
7300 if (from == GENERAL_REGS && to == GENERAL_REGS)
7301 return regmove_cost->GP2GP;
7302 else if (from == GENERAL_REGS)
7303 return regmove_cost->GP2FP;
7304 else if (to == GENERAL_REGS)
7305 return regmove_cost->FP2GP;
7307 return regmove_cost->FP2FP;
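/* Worked example (a reading of the code above, not of any particular cost
   table): for a 128-bit mode, a GENERAL_REGS<->GENERAL_REGS move is costed
   as 2 * GP2GP, a move to or from the FP registers as 2 * GP2FP or
   2 * FP2GP, and without TARGET_SIMD an FP<->FP move is costed as
   GP2FP + FP2GP + FP2FP, since it must bounce through a general register.  */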
7310 static int
7311 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7312 reg_class_t rclass ATTRIBUTE_UNUSED,
7313 bool in ATTRIBUTE_UNUSED)
7315 return aarch64_tune_params.memmov_cost;
7318 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7319 to optimize 1.0/sqrt. */
7321 static bool
7322 use_rsqrt_p (void)
7324 return (!flag_trapping_math
7325 && flag_unsafe_math_optimizations
7326 && ((aarch64_tune_params.extra_tuning_flags
7327 & AARCH64_EXTRA_TUNE_APPROX_RSQRT)
7328 || flag_mrecip_low_precision_sqrt));
7331 /* Function to decide when to use the approximate reciprocal square root
7332 builtin. */
7334 static tree
7335 aarch64_builtin_reciprocal (tree fndecl)
7337 if (!use_rsqrt_p ())
7338 return NULL_TREE;
7339 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7342 typedef rtx (*rsqrte_type) (rtx, rtx);
7344 /* Select reciprocal square root initial estimate
7345 insn depending on machine mode. */
7347 rsqrte_type
7348 get_rsqrte_type (machine_mode mode)
7350 switch (mode)
7352 case DFmode: return gen_aarch64_rsqrte_df2;
7353 case SFmode: return gen_aarch64_rsqrte_sf2;
7354 case V2DFmode: return gen_aarch64_rsqrte_v2df2;
7355 case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
7356 case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
7357 default: gcc_unreachable ();
7361 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7363 /* Select reciprocal square root Newton-Raphson step
7364 insn depending on machine mode. */
7366 rsqrts_type
7367 get_rsqrts_type (machine_mode mode)
7369 switch (mode)
7371 case DFmode: return gen_aarch64_rsqrts_df3;
7372 case SFmode: return gen_aarch64_rsqrts_sf3;
7373 case V2DFmode: return gen_aarch64_rsqrts_v2df3;
7374 case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
7375 case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
7376 default: gcc_unreachable ();
7380 /* Emit instruction sequence to compute the reciprocal square root using the
7381 Newton-Raphson series. Iterate over the series twice for SF
7382 and thrice for DF. */
7384 void
7385 aarch64_emit_approx_rsqrt (rtx dst, rtx src)
7387 machine_mode mode = GET_MODE (src);
7388 gcc_assert (
7389 mode == SFmode || mode == V2SFmode || mode == V4SFmode
7390 || mode == DFmode || mode == V2DFmode);
7392 rtx xsrc = gen_reg_rtx (mode);
7393 emit_move_insn (xsrc, src);
7394 rtx x0 = gen_reg_rtx (mode);
7396 emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
7398 bool double_mode = (mode == DFmode || mode == V2DFmode);
7400 int iterations = double_mode ? 3 : 2;
7402 /* Optionally iterate over the series one less time than otherwise. */
7403 if (flag_mrecip_low_precision_sqrt)
7404 iterations--;
7406 for (int i = 0; i < iterations; ++i)
7408 rtx x1 = gen_reg_rtx (mode);
7409 rtx x2 = gen_reg_rtx (mode);
7410 rtx x3 = gen_reg_rtx (mode);
7411 emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
7413 emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
7415 emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
7416 x0 = x1;
7419 emit_move_insn (dst, x0);
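/* Illustrative sketch, not part of the original source: the loop above is
   the standard Newton-Raphson refinement for 1/sqrt(a), where the frsqrts
   step computes (3 - a*b) / 2, i.e.

     x_{n+1} = x_n * (3 - a * x_n * x_n) / 2

   The same recurrence written in plain C, assuming some rough initial
   estimate E0 of 1/sqrt(A) (the names below are illustrative only):  */

static double
rsqrt_newton_raphson_sketch (double a, double e0, int iterations)
{
  double e = e0;
  for (int i = 0; i < iterations; ++i)
    /* One refinement step: e *= (3 - a*e*e) / 2.  */
    e = e * (3.0 - a * e * e) / 2.0;
  return e;
}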
7422 /* Return the number of instructions that can be issued per cycle. */
7423 static int
7424 aarch64_sched_issue_rate (void)
7426 return aarch64_tune_params.issue_rate;
7429 static int
7430 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7432 int issue_rate = aarch64_sched_issue_rate ();
7434 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7438 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7439 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
7440 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
7442 static int
7443 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7444 int ready_index)
7446 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7450 /* Vectorizer cost model target hooks. */
7452 /* Implement targetm.vectorize.builtin_vectorization_cost. */
7453 static int
7454 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7455 tree vectype,
7456 int misalign ATTRIBUTE_UNUSED)
7458 unsigned elements;
7460 switch (type_of_cost)
7462 case scalar_stmt:
7463 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7465 case scalar_load:
7466 return aarch64_tune_params.vec_costs->scalar_load_cost;
7468 case scalar_store:
7469 return aarch64_tune_params.vec_costs->scalar_store_cost;
7471 case vector_stmt:
7472 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7474 case vector_load:
7475 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7477 case vector_store:
7478 return aarch64_tune_params.vec_costs->vec_store_cost;
7480 case vec_to_scalar:
7481 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7483 case scalar_to_vec:
7484 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7486 case unaligned_load:
7487 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7489 case unaligned_store:
7490 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7492 case cond_branch_taken:
7493 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7495 case cond_branch_not_taken:
7496 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7498 case vec_perm:
7499 return aarch64_tune_params.vec_costs->vec_permute_cost;
7501 case vec_promote_demote:
7502 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7504 case vec_construct:
7505 elements = TYPE_VECTOR_SUBPARTS (vectype);
7506 return elements / 2 + 1;
7508 default:
7509 gcc_unreachable ();
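/* Worked example for the vec_construct case above: a vector type with
   4 elements (e.g. V4SI) is costed as 4 / 2 + 1 = 3, i.e. roughly one
   statement per pair of elements plus one.  */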
7513 /* Implement targetm.vectorize.add_stmt_cost. */
7514 static unsigned
7515 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7516 struct _stmt_vec_info *stmt_info, int misalign,
7517 enum vect_cost_model_location where)
7519 unsigned *cost = (unsigned *) data;
7520 unsigned retval = 0;
7522 if (flag_vect_cost_model)
7524 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7525 int stmt_cost =
7526 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7528 /* Statements in an inner loop relative to the loop being
7529 vectorized are weighted more heavily. The value here is
7530 arbitrary and could potentially be improved with analysis. */
7531 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7532 count *= 50; /* FIXME */
7534 retval = (unsigned) (count * stmt_cost);
7535 cost[where] += retval;
7538 return retval;
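/* For example, with the weighting above a vect_body statement of cost 1
   that lies in an inner loop relative to the loop being vectorized is
   accumulated as 1 * 50 = 50 into the vect_body bucket.  */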
7541 static void initialize_aarch64_code_model (struct gcc_options *);
7543 /* Parse the TO_PARSE string and put the architecture struct that it
7544 selects into RES and the architectural features into ISA_FLAGS.
7545 Return an aarch64_parse_opt_result describing the parse result.
7546 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
7548 static enum aarch64_parse_opt_result
7549 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7550 unsigned long *isa_flags)
7552 char *ext;
7553 const struct processor *arch;
7554 char *str = (char *) alloca (strlen (to_parse) + 1);
7555 size_t len;
7557 strcpy (str, to_parse);
7559 ext = strchr (str, '+');
7561 if (ext != NULL)
7562 len = ext - str;
7563 else
7564 len = strlen (str);
7566 if (len == 0)
7567 return AARCH64_PARSE_MISSING_ARG;
7570 /* Loop through the list of supported ARCHes to find a match. */
7571 for (arch = all_architectures; arch->name != NULL; arch++)
7573 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7575 unsigned long isa_temp = arch->flags;
7577 if (ext != NULL)
7579 /* TO_PARSE string contains at least one extension. */
7580 enum aarch64_parse_opt_result ext_res
7581 = aarch64_parse_extension (ext, &isa_temp);
7583 if (ext_res != AARCH64_PARSE_OK)
7584 return ext_res;
7586 /* Extension parsing was successful. Confirm the result
7587 arch and ISA flags. */
7588 *res = arch;
7589 *isa_flags = isa_temp;
7590 return AARCH64_PARSE_OK;
7594 /* ARCH name not found in list. */
7595 return AARCH64_PARSE_INVALID_ARG;
7598 /* Parse the TO_PARSE string and put the cpu it selects into RES and the
7599 architectural feature flags into ISA_FLAGS. Return an aarch64_parse_opt_result
7600 describing the parse result. If there is an error parsing, RES and
7601 ISA_FLAGS are left unchanged. */
7603 static enum aarch64_parse_opt_result
7604 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7605 unsigned long *isa_flags)
7607 char *ext;
7608 const struct processor *cpu;
7609 char *str = (char *) alloca (strlen (to_parse) + 1);
7610 size_t len;
7612 strcpy (str, to_parse);
7614 ext = strchr (str, '+');
7616 if (ext != NULL)
7617 len = ext - str;
7618 else
7619 len = strlen (str);
7621 if (len == 0)
7622 return AARCH64_PARSE_MISSING_ARG;
7625 /* Loop through the list of supported CPUs to find a match. */
7626 for (cpu = all_cores; cpu->name != NULL; cpu++)
7628 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7630 unsigned long isa_temp = cpu->flags;
7633 if (ext != NULL)
7635 /* TO_PARSE string contains at least one extension. */
7636 enum aarch64_parse_opt_result ext_res
7637 = aarch64_parse_extension (ext, &isa_temp);
7639 if (ext_res != AARCH64_PARSE_OK)
7640 return ext_res;
7642 /* Extension parsing was successful. Confirm the result
7643 cpu and ISA flags. */
7644 *res = cpu;
7645 *isa_flags = isa_temp;
7646 return AARCH64_PARSE_OK;
7650 /* CPU name not found in list. */
7651 return AARCH64_PARSE_INVALID_ARG;
7654 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7655 Return an aarch64_parse_opt_result describing the parse result.
7656 If the parsing fails, RES does not change. */
7658 static enum aarch64_parse_opt_result
7659 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7661 const struct processor *cpu;
7662 char *str = (char *) alloca (strlen (to_parse) + 1);
7664 strcpy (str, to_parse);
7666 /* Loop through the list of supported CPUs to find a match. */
7667 for (cpu = all_cores; cpu->name != NULL; cpu++)
7669 if (strcmp (cpu->name, str) == 0)
7671 *res = cpu;
7672 return AARCH64_PARSE_OK;
7676 /* CPU name not found in list. */
7677 return AARCH64_PARSE_INVALID_ARG;
7680 /* Parse TOKEN, which has length LENGTH, to see if it is an option
7681 described in FLAG. If it is, return the index bit for that fusion type.
7682 If not, error (printing OPTION_NAME) and return zero. */
7684 static unsigned int
7685 aarch64_parse_one_option_token (const char *token,
7686 size_t length,
7687 const struct aarch64_flag_desc *flag,
7688 const char *option_name)
7690 for (; flag->name != NULL; flag++)
7692 if (length == strlen (flag->name)
7693 && !strncmp (flag->name, token, length))
7694 return flag->flag;
7697 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7698 return 0;
7701 /* Parse OPTION which is a comma-separated list of flags to enable.
7702 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7703 default state we inherit from the CPU tuning structures. OPTION_NAME
7704 gives the top-level option we are parsing in the -moverride string,
7705 for use in error messages. */
7707 static unsigned int
7708 aarch64_parse_boolean_options (const char *option,
7709 const struct aarch64_flag_desc *flags,
7710 unsigned int initial_state,
7711 const char *option_name)
7713 const char separator = '.';
7714 const char* specs = option;
7715 const char* ntoken = option;
7716 unsigned int found_flags = initial_state;
7718 while ((ntoken = strchr (specs, separator)))
7720 size_t token_length = ntoken - specs;
7721 unsigned token_ops = aarch64_parse_one_option_token (specs,
7722 token_length,
7723 flags,
7724 option_name);
7725 /* If we find "none" (or, for simplicity's sake, an error) anywhere
7726 in the token stream, reset the supported operations. So:
7728 adrp+add.cmp+branch.none.adrp+add
7730 would have the result of turning on only adrp+add fusion. */
7731 if (!token_ops)
7732 found_flags = 0;
7734 found_flags |= token_ops;
7735 specs = ++ntoken;
7738 /* We ended with a trailing separator; report the ill-formed string. */
7739 if (!(*specs))
7741 error ("%s string ill-formed\n", option_name);
7742 return 0;
7745 /* We still have one more token to parse. */
7746 size_t token_length = strlen (specs);
7747 unsigned token_ops = aarch64_parse_one_option_token (specs,
7748 token_length,
7749 flags,
7750 option_name);
7751 if (!token_ops)
7752 found_flags = 0;
7754 found_flags |= token_ops;
7755 return found_flags;
7758 /* Support for overriding instruction fusion. */
7760 static void
7761 aarch64_parse_fuse_string (const char *fuse_string,
7762 struct tune_params *tune)
7764 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
7765 aarch64_fusible_pairs,
7766 tune->fusible_ops,
7767 "fuse=");
7770 /* Support for overriding other tuning flags. */
7772 static void
7773 aarch64_parse_tune_string (const char *tune_string,
7774 struct tune_params *tune)
7776 tune->extra_tuning_flags
7777 = aarch64_parse_boolean_options (tune_string,
7778 aarch64_tuning_flags,
7779 tune->extra_tuning_flags,
7780 "tune=");
7783 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
7784 we understand. If it is, extract the option string and hand it off to
7785 the appropriate function. */
7787 void
7788 aarch64_parse_one_override_token (const char* token,
7789 size_t length,
7790 struct tune_params *tune)
7792 const struct aarch64_tuning_override_function *fn
7793 = aarch64_tuning_override_functions;
7795 const char *option_part = strchr (token, '=');
7796 if (!option_part)
7798 error ("tuning string missing in option (%s)", token);
7799 return;
7802 /* Get the length of the option name. */
7803 length = option_part - token;
7804 /* Skip the '=' to get to the option string. */
7805 option_part++;
7807 for (; fn->name != NULL; fn++)
7809 if (!strncmp (fn->name, token, length))
7811 fn->parse_override (option_part, tune);
7812 return;
7816 error ("unknown tuning option (%s)",token);
7817 return;
7820 /* Validate and clamp the TLS size according to the selected code model. */
7822 static void
7823 initialize_aarch64_tls_size (struct gcc_options *opts)
7825 if (aarch64_tls_size == 0)
7826 aarch64_tls_size = 24;
7828 switch (opts->x_aarch64_cmodel_var)
7830 case AARCH64_CMODEL_TINY:
7831 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
7832 needs two instructions to address, so we clamp the size to 24. */
7833 if (aarch64_tls_size > 24)
7834 aarch64_tls_size = 24;
7835 break;
7836 case AARCH64_CMODEL_SMALL:
7837 /* The maximum TLS size allowed under small is 4G. */
7838 if (aarch64_tls_size > 32)
7839 aarch64_tls_size = 32;
7840 break;
7841 case AARCH64_CMODEL_LARGE:
7842 /* The maximum TLS size allowed under large is 16E.
7843 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset now. */
7844 if (aarch64_tls_size > 48)
7845 aarch64_tls_size = 48;
7846 break;
7847 default:
7848 gcc_unreachable ();
7851 return;
7854 /* Parse STRING looking for options in the format:
7855 string :: option:string
7856 option :: name=substring
7857 name :: {a-z}
7858 substring :: defined by option. */
7860 static void
7861 aarch64_parse_override_string (const char* input_string,
7862 struct tune_params* tune)
7864 const char separator = ':';
7865 size_t string_length = strlen (input_string) + 1;
7866 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
7867 char *string = string_root;
7868 strncpy (string, input_string, string_length);
7869 string[string_length - 1] = '\0';
7871 char* ntoken = string;
7873 while ((ntoken = strchr (string, separator)))
7875 size_t token_length = ntoken - string;
7876 /* Make this substring look like a string. */
7877 *ntoken = '\0';
7878 aarch64_parse_one_override_token (string, token_length, tune);
7879 string = ++ntoken;
7882 /* One last option to parse. */
7883 aarch64_parse_one_override_token (string, strlen (string), tune);
7884 free (string_root);
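/* For illustration, an override string is a ':'-separated list of
   name=value options whose values are '.'-separated flag lists, e.g.
   (using the fusion pairs mentioned in the comments above):

     -moverride=fuse=adrp+add.cmp+branch

   which is split here into a single "fuse=..." token and handed to
   aarch64_parse_fuse_string via aarch64_parse_one_override_token.  */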
7888 static void
7889 aarch64_override_options_after_change_1 (struct gcc_options *opts)
7891 /* The logic here is that if we are disabling all frame pointer generation
7892 then we do not need to disable leaf frame pointer generation as a
7893 separate operation. But if we are *only* disabling leaf frame pointer
7894 generation then we set flag_omit_frame_pointer to true, but in
7895 aarch64_frame_pointer_required we return false only for leaf functions.
7897 PR 70044: We have to be careful about being called multiple times for the
7898 same function. Once we have decided to set flag_omit_frame_pointer just
7899 so that we can omit leaf frame pointers, we must then not interpret a
7900 second call as meaning that all frame pointer generation should be
7901 omitted. We do this by setting flag_omit_frame_pointer to a special,
7902 non-zero value. */
7903 if (opts->x_flag_omit_frame_pointer == 2)
7904 opts->x_flag_omit_frame_pointer = 0;
7906 if (opts->x_flag_omit_frame_pointer)
7907 opts->x_flag_omit_leaf_frame_pointer = false;
7908 else if (opts->x_flag_omit_leaf_frame_pointer)
7909 opts->x_flag_omit_frame_pointer = 2;
7911 /* If not optimizing for size, set the default
7912 alignment to what the target wants. */
7913 if (!opts->x_optimize_size)
7915 if (opts->x_align_loops <= 0)
7916 opts->x_align_loops = aarch64_tune_params.loop_align;
7917 if (opts->x_align_jumps <= 0)
7918 opts->x_align_jumps = aarch64_tune_params.jump_align;
7919 if (opts->x_align_functions <= 0)
7920 opts->x_align_functions = aarch64_tune_params.function_align;
7923 /* If nopcrelative_literal_loads is set on the command line, this
7924 implies that the user asked for PC relative literal loads. */
7925 if (opts->x_nopcrelative_literal_loads == 1)
7926 aarch64_nopcrelative_literal_loads = false;
7928 /* If it is not set on the command line, we default to no pc
7929 relative literal loads, unless the workaround for Cortex-A53
7930 erratum 843419 is in effect. */
7931 /* This is PR70113. When building the Linux kernel with
7932 CONFIG_ARM64_ERRATUM_843419, support for relocations
7933 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
7934 removed from the kernel to avoid loading objects with possibly
7935 offending sequences. With nopcrelative_literal_loads, we would
7936 generate such relocations, preventing the kernel build from
7937 succeeding. */
7938 if (opts->x_nopcrelative_literal_loads == 2
7939 && !TARGET_FIX_ERR_A53_843419)
7940 aarch64_nopcrelative_literal_loads = true;
7942 /* In the tiny memory model it makes no sense
7943 to disallow non PC relative literal pool loads
7944 as many other things will break anyway. */
7945 if (opts->x_nopcrelative_literal_loads
7946 && (aarch64_cmodel == AARCH64_CMODEL_TINY
7947 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
7948 aarch64_nopcrelative_literal_loads = false;
7951 /* 'Unpack' the internal tuning structs and update the options
7952 in OPTS. The caller must have set up selected_tune and selected_arch
7953 as all the other target-specific codegen decisions are
7954 derived from them. */
7956 void
7957 aarch64_override_options_internal (struct gcc_options *opts)
7959 aarch64_tune_flags = selected_tune->flags;
7960 aarch64_tune = selected_tune->sched_core;
7961 /* Make a copy of the tuning parameters attached to the core, which
7962 we may later overwrite. */
7963 aarch64_tune_params = *(selected_tune->tune);
7964 aarch64_architecture_version = selected_arch->architecture_version;
7966 if (opts->x_aarch64_override_tune_string)
7967 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
7968 &aarch64_tune_params);
7970 /* This target defaults to strict volatile bitfields. */
7971 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7972 opts->x_flag_strict_volatile_bitfields = 1;
7974 initialize_aarch64_code_model (opts);
7975 initialize_aarch64_tls_size (opts);
7977 int queue_depth = 0;
7978 switch (aarch64_tune_params.autoprefetcher_model)
7980 case tune_params::AUTOPREFETCHER_OFF:
7981 queue_depth = -1;
7982 break;
7983 case tune_params::AUTOPREFETCHER_WEAK:
7984 queue_depth = 0;
7985 break;
7986 case tune_params::AUTOPREFETCHER_STRONG:
7987 queue_depth = max_insn_queue_index + 1;
7988 break;
7989 default:
7990 gcc_unreachable ();
7993 /* We don't mind passing in global_options_set here as we don't use
7994 the *options_set structs anyway. */
7995 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
7996 queue_depth,
7997 opts->x_param_values,
7998 global_options_set.x_param_values);
8000 /* Set the L1 cache line size. */
8001 if (selected_cpu->tune->cache_line_size != 0)
8002 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8003 selected_cpu->tune->cache_line_size,
8004 opts->x_param_values,
8005 global_options_set.x_param_values);
8007 aarch64_override_options_after_change_1 (opts);
8010 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8011 specified in STR and throw errors if appropriate. Put the results, if
8012 they are valid, in RES and ISA_FLAGS. Return whether the option is
8013 valid. */
8015 static bool
8016 aarch64_validate_mcpu (const char *str, const struct processor **res,
8017 unsigned long *isa_flags)
8019 enum aarch64_parse_opt_result parse_res
8020 = aarch64_parse_cpu (str, res, isa_flags);
8022 if (parse_res == AARCH64_PARSE_OK)
8023 return true;
8025 switch (parse_res)
8027 case AARCH64_PARSE_MISSING_ARG:
8028 error ("missing cpu name in -mcpu=%qs", str);
8029 break;
8030 case AARCH64_PARSE_INVALID_ARG:
8031 error ("unknown value %qs for -mcpu", str);
8032 break;
8033 case AARCH64_PARSE_INVALID_FEATURE:
8034 error ("invalid feature modifier in -mcpu=%qs", str);
8035 break;
8036 default:
8037 gcc_unreachable ();
8040 return false;
8043 /* Validate a command-line -march option. Parse the arch and extensions
8044 (if any) specified in STR and throw errors if appropriate. Put the
8045 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8046 option is valid. */
8048 static bool
8049 aarch64_validate_march (const char *str, const struct processor **res,
8050 unsigned long *isa_flags)
8052 enum aarch64_parse_opt_result parse_res
8053 = aarch64_parse_arch (str, res, isa_flags);
8055 if (parse_res == AARCH64_PARSE_OK)
8056 return true;
8058 switch (parse_res)
8060 case AARCH64_PARSE_MISSING_ARG:
8061 error ("missing arch name in -march=%qs", str);
8062 break;
8063 case AARCH64_PARSE_INVALID_ARG:
8064 error ("unknown value %qs for -march", str);
8065 break;
8066 case AARCH64_PARSE_INVALID_FEATURE:
8067 error ("invalid feature modifier in -march=%qs", str);
8068 break;
8069 default:
8070 gcc_unreachable ();
8073 return false;
8076 /* Validate a command-line -mtune option. Parse the cpu
8077 specified in STR and throw errors if appropriate. Put the
8078 result, if it is valid, in RES. Return whether the option is
8079 valid. */
8081 static bool
8082 aarch64_validate_mtune (const char *str, const struct processor **res)
8084 enum aarch64_parse_opt_result parse_res
8085 = aarch64_parse_tune (str, res);
8087 if (parse_res == AARCH64_PARSE_OK)
8088 return true;
8090 switch (parse_res)
8092 case AARCH64_PARSE_MISSING_ARG:
8093 error ("missing cpu name in -mtune=%qs", str);
8094 break;
8095 case AARCH64_PARSE_INVALID_ARG:
8096 error ("unknown value %qs for -mtune", str);
8097 break;
8098 default:
8099 gcc_unreachable ();
8101 return false;
8104 /* Return the CPU corresponding to the enum CPU.
8105 If it doesn't specify a cpu, return the default. */
8107 static const struct processor *
8108 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8110 if (cpu != aarch64_none)
8111 return &all_cores[cpu];
8113 /* The & 0x3f is to extract the bottom 6 bits that encode the
8114 default cpu as selected by the --with-cpu GCC configure option
8115 in config.gcc.
8116 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8117 flags mechanism should be reworked to make it more sane. */
8118 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
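/* Note (inferred from the use here and from the ">> 6" in
   aarch64_override_options below): TARGET_CPU_DEFAULT appears to pack the
   default cpu ident in its bottom 6 bits and the configure-time ISA flags
   in the remaining bits.  */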
8121 /* Return the architecture corresponding to the enum ARCH.
8122 If it doesn't specify a valid architecture, return the default. */
8124 static const struct processor *
8125 aarch64_get_arch (enum aarch64_arch arch)
8127 if (arch != aarch64_no_arch)
8128 return &all_architectures[arch];
8130 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8132 return &all_architectures[cpu->arch];
8135 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8136 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8137 tuning structs. In particular it must set selected_tune and
8138 aarch64_isa_flags that define the available ISA features and tuning
8139 decisions. It must also set selected_arch as this will be used to
8140 output the .arch asm tags for each function. */
8142 static void
8143 aarch64_override_options (void)
8145 unsigned long cpu_isa = 0;
8146 unsigned long arch_isa = 0;
8147 aarch64_isa_flags = 0;
8149 bool valid_cpu = true;
8150 bool valid_tune = true;
8151 bool valid_arch = true;
8153 selected_cpu = NULL;
8154 selected_arch = NULL;
8155 selected_tune = NULL;
8157 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8158 If either of -march or -mtune is given, they override their
8159 respective component of -mcpu. */
8160 if (aarch64_cpu_string)
8161 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8162 &cpu_isa);
8164 if (aarch64_arch_string)
8165 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8166 &arch_isa);
8168 if (aarch64_tune_string)
8169 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8171 /* If the user did not specify a processor, choose the default
8172 one for them. This will be the CPU set during configuration using
8173 --with-cpu, otherwise it is "generic". */
8174 if (!selected_cpu)
8176 if (selected_arch)
8178 selected_cpu = &all_cores[selected_arch->ident];
8179 aarch64_isa_flags = arch_isa;
8180 explicit_arch = selected_arch->arch;
8182 else
8184 /* Get default configure-time CPU. */
8185 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8186 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8189 if (selected_tune)
8190 explicit_tune_core = selected_tune->ident;
8192 /* If both -mcpu and -march are specified check that they are architecturally
8193 compatible, warn if they're not and prefer the -march ISA flags. */
8194 else if (selected_arch)
8196 if (selected_arch->arch != selected_cpu->arch)
8198 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8199 all_architectures[selected_cpu->arch].name,
8200 selected_arch->name);
8202 aarch64_isa_flags = arch_isa;
8203 explicit_arch = selected_arch->arch;
8204 explicit_tune_core = selected_tune ? selected_tune->ident
8205 : selected_cpu->ident;
8207 else
8209 /* -mcpu but no -march. */
8210 aarch64_isa_flags = cpu_isa;
8211 explicit_tune_core = selected_tune ? selected_tune->ident
8212 : selected_cpu->ident;
8213 gcc_assert (selected_cpu);
8214 selected_arch = &all_architectures[selected_cpu->arch];
8215 explicit_arch = selected_arch->arch;
8218 /* Set the arch as well, as we will need it when outputting
8219 the .arch directive in assembly. */
8220 if (!selected_arch)
8222 gcc_assert (selected_cpu);
8223 selected_arch = &all_architectures[selected_cpu->arch];
8226 if (!selected_tune)
8227 selected_tune = selected_cpu;
8229 #ifndef HAVE_AS_MABI_OPTION
8230 /* The compiler may have been configured with 2.23.* binutils, which does
8231 not have support for ILP32. */
8232 if (TARGET_ILP32)
8233 error ("Assembler does not support -mabi=ilp32");
8234 #endif
8236 /* Make sure we properly set up the explicit options. */
8237 if ((aarch64_cpu_string && valid_cpu)
8238 || (aarch64_tune_string && valid_tune))
8239 gcc_assert (explicit_tune_core != aarch64_none);
8241 if ((aarch64_cpu_string && valid_cpu)
8242 || (aarch64_arch_string && valid_arch))
8243 gcc_assert (explicit_arch != aarch64_no_arch);
8245 aarch64_override_options_internal (&global_options);
8247 /* Save these options as the default ones in case we push and pop them later
8248 while processing functions with potential target attributes. */
8249 target_option_default_node = target_option_current_node
8250 = build_target_option_node (&global_options);
8252 aarch64_register_fma_steering ();
8256 /* Implement targetm.override_options_after_change. */
8258 static void
8259 aarch64_override_options_after_change (void)
8261 aarch64_override_options_after_change_1 (&global_options);
8264 static struct machine_function *
8265 aarch64_init_machine_status (void)
8267 struct machine_function *machine;
8268 machine = ggc_cleared_alloc<machine_function> ();
8269 return machine;
8272 void
8273 aarch64_init_expanders (void)
8275 init_machine_status = aarch64_init_machine_status;
8278 /* Select the code model to use, taking the PIC settings into account. */
8279 static void
8280 initialize_aarch64_code_model (struct gcc_options *opts)
8282 if (opts->x_flag_pic)
8284 switch (opts->x_aarch64_cmodel_var)
8286 case AARCH64_CMODEL_TINY:
8287 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8288 break;
8289 case AARCH64_CMODEL_SMALL:
8290 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8291 aarch64_cmodel = (flag_pic == 2
8292 ? AARCH64_CMODEL_SMALL_PIC
8293 : AARCH64_CMODEL_SMALL_SPIC);
8294 #else
8295 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8296 #endif
8297 break;
8298 case AARCH64_CMODEL_LARGE:
8299 sorry ("code model %qs with -f%s", "large",
8300 opts->x_flag_pic > 1 ? "PIC" : "pic");
8301 break;
8302 default:
8303 gcc_unreachable ();
8306 else
8307 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8310 /* Implement TARGET_OPTION_SAVE. */
8312 static void
8313 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8315 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8318 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8319 using the information saved in PTR. */
8321 static void
8322 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8324 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8325 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8326 opts->x_explicit_arch = ptr->x_explicit_arch;
8327 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8328 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8330 aarch64_override_options_internal (opts);
8333 /* Implement TARGET_OPTION_PRINT. */
8335 static void
8336 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8338 const struct processor *cpu
8339 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8340 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8341 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8342 std::string extension
8343 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
8345 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8346 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8347 arch->name, extension.c_str ());
8350 static GTY(()) tree aarch64_previous_fndecl;
8352 void
8353 aarch64_reset_previous_fndecl (void)
8355 aarch64_previous_fndecl = NULL;
8358 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
8359 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
8360 make sure optab availability predicates are recomputed when necessary. */
8362 void
8363 aarch64_save_restore_target_globals (tree new_tree)
8365 if (TREE_TARGET_GLOBALS (new_tree))
8366 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8367 else if (new_tree == target_option_default_node)
8368 restore_target_globals (&default_target_globals);
8369 else
8370 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
8373 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
8374 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8375 of the function, if such exists. This function may be called multiple
8376 times on a single function so use aarch64_previous_fndecl to avoid
8377 setting up identical state. */
8379 static void
8380 aarch64_set_current_function (tree fndecl)
8382 if (!fndecl || fndecl == aarch64_previous_fndecl)
8383 return;
8385 tree old_tree = (aarch64_previous_fndecl
8386 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8387 : NULL_TREE);
8389 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
8391 /* If current function has no attributes but the previous one did,
8392 use the default node. */
8393 if (!new_tree && old_tree)
8394 new_tree = target_option_default_node;
8396 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
8397 the default have been handled by aarch64_save_restore_target_globals from
8398 aarch64_pragma_target_parse. */
8399 if (old_tree == new_tree)
8400 return;
8402 aarch64_previous_fndecl = fndecl;
8404 /* First set the target options. */
8405 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
8407 aarch64_save_restore_target_globals (new_tree);
8410 /* Enum describing the various ways we can handle attributes.
8411 In many cases we can reuse the generic option handling machinery. */
8413 enum aarch64_attr_opt_type
8415 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
8416 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
8417 aarch64_attr_enum, /* Attribute sets an enum variable. */
8418 aarch64_attr_custom /* Attribute requires a custom handling function. */
8421 /* All the information needed to handle a target attribute.
8422 NAME is the name of the attribute.
8423 ATTR_TYPE specifies the type of behavior of the attribute as described
8424 in the definition of enum aarch64_attr_opt_type.
8425 ALLOW_NEG is true if the attribute supports a "no-" form.
8426 HANDLER is the function that takes the attribute string and whether
8427 it is a pragma or attribute and handles the option. It is needed only
8428 when the ATTR_TYPE is aarch64_attr_custom.
8429 OPT_NUM is the enum specifying the option that the attribute modifies.
8430 This is needed for attributes that mirror the behavior of a command-line
8431 option, that is, attributes whose ATTR_TYPE is aarch64_attr_mask,
8432 aarch64_attr_bool or aarch64_attr_enum. */
8434 struct aarch64_attribute_info
8436 const char *name;
8437 enum aarch64_attr_opt_type attr_type;
8438 bool allow_neg;
8439 bool (*handler) (const char *, const char *);
8440 enum opt_code opt_num;
8443 /* Handle the ARCH_STR argument to the arch= target attribute.
8444 PRAGMA_OR_ATTR is used in potential error messages. */
8446 static bool
8447 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8449 const struct processor *tmp_arch = NULL;
8450 enum aarch64_parse_opt_result parse_res
8451 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8453 if (parse_res == AARCH64_PARSE_OK)
8455 gcc_assert (tmp_arch);
8456 selected_arch = tmp_arch;
8457 explicit_arch = selected_arch->arch;
8458 return true;
8461 switch (parse_res)
8463 case AARCH64_PARSE_MISSING_ARG:
8464 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8465 break;
8466 case AARCH64_PARSE_INVALID_ARG:
8467 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8468 break;
8469 case AARCH64_PARSE_INVALID_FEATURE:
8470 error ("invalid feature modifier %qs for 'arch' target %s",
8471 str, pragma_or_attr);
8472 break;
8473 default:
8474 gcc_unreachable ();
8477 return false;
8480 /* Handle the argument CPU_STR to the cpu= target attribute.
8481 PRAGMA_OR_ATTR is used in potential error messages. */
8483 static bool
8484 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8486 const struct processor *tmp_cpu = NULL;
8487 enum aarch64_parse_opt_result parse_res
8488 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8490 if (parse_res == AARCH64_PARSE_OK)
8492 gcc_assert (tmp_cpu);
8493 selected_tune = tmp_cpu;
8494 explicit_tune_core = selected_tune->ident;
8496 selected_arch = &all_architectures[tmp_cpu->arch];
8497 explicit_arch = selected_arch->arch;
8498 return true;
8501 switch (parse_res)
8503 case AARCH64_PARSE_MISSING_ARG:
8504 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8505 break;
8506 case AARCH64_PARSE_INVALID_ARG:
8507 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8508 break;
8509 case AARCH64_PARSE_INVALID_FEATURE:
8510 error ("invalid feature modifier %qs for 'cpu' target %s",
8511 str, pragma_or_attr);
8512 break;
8513 default:
8514 gcc_unreachable ();
8517 return false;
8520 /* Handle the argument STR to the tune= target attribute.
8521 PRAGMA_OR_ATTR is used in potential error messages. */
8523 static bool
8524 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8526 const struct processor *tmp_tune = NULL;
8527 enum aarch64_parse_opt_result parse_res
8528 = aarch64_parse_tune (str, &tmp_tune);
8530 if (parse_res == AARCH64_PARSE_OK)
8532 gcc_assert (tmp_tune);
8533 selected_tune = tmp_tune;
8534 explicit_tune_core = selected_tune->ident;
8535 return true;
8538 switch (parse_res)
8540 case AARCH64_PARSE_INVALID_ARG:
8541 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8542 break;
8543 default:
8544 gcc_unreachable ();
8547 return false;
8550 /* Parse an architecture extensions target attribute string specified in STR.
8551 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8552 if successful. Update aarch64_isa_flags to reflect the ISA features
8553 modified.
8554 PRAGMA_OR_ATTR is used in potential error messages. */
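/* For illustration only (a sketch of the behavior, not part of the parsing
   machinery): "+nothing+simd" first clears every architectural feature bit
   and then turns on the feature bits that "+simd" implies, whereas
   "+fp+nosimd" adds the FP bits and then removes the SIMD bits on top of
   the current aarch64_isa_flags.  */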
8556 static bool
8557 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8559 enum aarch64_parse_opt_result parse_res;
8560 unsigned long isa_flags = aarch64_isa_flags;
8562 /* We allow "+nothing" at the beginning to clear out all architectural
8563 features if the user wants to handpick specific features. */
8564 if (strncmp ("+nothing", str, 8) == 0)
8566 isa_flags = 0;
8567 str += 8;
8570 parse_res = aarch64_parse_extension (str, &isa_flags);
8572 if (parse_res == AARCH64_PARSE_OK)
8574 aarch64_isa_flags = isa_flags;
8575 return true;
8578 switch (parse_res)
8580 case AARCH64_PARSE_MISSING_ARG:
8581 error ("missing feature modifier in target %s %qs",
8582 pragma_or_attr, str);
8583 break;
8585 case AARCH64_PARSE_INVALID_FEATURE:
8586 error ("invalid feature modifier in target %s %qs",
8587 pragma_or_attr, str);
8588 break;
8590 default:
8591 gcc_unreachable ();
8594 return false;
8597 /* The target attributes that we support. On top of these we also support just
8598 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8599 handled explicitly in aarch64_process_one_target_attr. */
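/* For illustration only: a sketch of how the attributes below can be spelled
   by users.  The declarations are hypothetical and are kept in an #if 0
   block so they are never compiled.  */
#if 0
__attribute__ ((target ("arch=armv8-a+crc")))            /* Custom handler.  */
void example_arch (void);
__attribute__ ((target ("no-omit-leaf-frame-pointer")))  /* Boolean, negated form.  */
void example_bool (void);
__attribute__ ((target ("cmodel=small")))                /* Enum-valued option.  */
void example_enum (void);
__attribute__ ((target ("+crc")))                        /* Bare ISA extension.  */
void example_isa (void);
#endif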
8601 static const struct aarch64_attribute_info aarch64_attributes[] =
8603 { "general-regs-only", aarch64_attr_mask, false, NULL,
8604 OPT_mgeneral_regs_only },
8605 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8606 OPT_mfix_cortex_a53_835769 },
8607 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
8608 OPT_mfix_cortex_a53_843419 },
8609 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8610 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8611 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8612 OPT_momit_leaf_frame_pointer },
8613 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8614 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8615 OPT_march_ },
8616 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8617 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8618 OPT_mtune_ },
8619 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
8622 /* Parse ARG_STR which contains the definition of one target attribute.
8623 Show appropriate errors if any or return true if the attribute is valid.
8624 PRAGMA_OR_ATTR holds the string to use in error messages about whether
8625 we're processing a target attribute or pragma. */
8627 static bool
8628 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8630 bool invert = false;
8632 size_t len = strlen (arg_str);
8634 if (len == 0)
8636 error ("malformed target %s", pragma_or_attr);
8637 return false;
8640 char *str_to_check = (char *) alloca (len + 1);
8641 strcpy (str_to_check, arg_str);
8643 /* Skip leading whitespace. */
8644 while (*str_to_check == ' ' || *str_to_check == '\t')
8645 str_to_check++;
8647 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8648 It is easier to detect and handle it explicitly here rather than going
8649 through the machinery for the rest of the target attributes in this
8650 function. */
8651 if (*str_to_check == '+')
8652 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8654 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
8656 invert = true;
8657 str_to_check += 3;
8659 char *arg = strchr (str_to_check, '=');
8661 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
8662 and point ARG to "foo". */
8663 if (arg)
8665 *arg = '\0';
8666 arg++;
8668 const struct aarch64_attribute_info *p_attr;
8669 bool found = false;
8670 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
8672 /* If the names don't match up, or the user has given an argument
8673 to an attribute that doesn't accept one, or didn't give an argument
8674 to an attribute that expects one, fail to match. */
8675 if (strcmp (str_to_check, p_attr->name) != 0)
8676 continue;
8678 found = true;
8679 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
8680 || p_attr->attr_type == aarch64_attr_enum;
8682 if (attr_need_arg_p ^ (arg != NULL))
8684 error ("target %s %qs does not accept an argument",
8685 pragma_or_attr, str_to_check);
8686 return false;
8689 /* If the name matches but the attribute does not allow "no-" versions
8690 then we can't match. */
8691 if (invert && !p_attr->allow_neg)
8693 error ("target %s %qs does not allow a negated form",
8694 pragma_or_attr, str_to_check);
8695 return false;
8698 switch (p_attr->attr_type)
8700 /* Has a custom handler registered.
8701 For example, cpu=, arch=, tune=. */
8702 case aarch64_attr_custom:
8703 gcc_assert (p_attr->handler);
8704 if (!p_attr->handler (arg, pragma_or_attr))
8705 return false;
8706 break;
8708 /* Either set or unset a boolean option. */
8709 case aarch64_attr_bool:
8711 struct cl_decoded_option decoded;
8713 generate_option (p_attr->opt_num, NULL, !invert,
8714 CL_TARGET, &decoded);
8715 aarch64_handle_option (&global_options, &global_options_set,
8716 &decoded, input_location);
8717 break;
8719 /* Set or unset a bit in the target_flags. aarch64_handle_option
8720 should know what mask to apply given the option number. */
8721 case aarch64_attr_mask:
8723 struct cl_decoded_option decoded;
8724 /* We only need to specify the option number.
8725 aarch64_handle_option will know which mask to apply. */
8726 decoded.opt_index = p_attr->opt_num;
8727 decoded.value = !invert;
8728 aarch64_handle_option (&global_options, &global_options_set,
8729 &decoded, input_location);
8730 break;
8732 /* Use the option setting machinery to set an option to an enum. */
8733 case aarch64_attr_enum:
8735 gcc_assert (arg);
8736 bool valid;
8737 int value;
8738 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
8739 &value, CL_TARGET);
8740 if (valid)
8742 set_option (&global_options, NULL, p_attr->opt_num, value,
8743 NULL, DK_UNSPECIFIED, input_location,
8744 global_dc);
8746 else
8748 error ("target %s %s=%s is not valid",
8749 pragma_or_attr, str_to_check, arg);
8751 break;
8753 default:
8754 gcc_unreachable ();
8758 /* If we reached here we either have found an attribute and validated
8759 it or didn't match any. If we matched an attribute but its arguments
8760 were malformed we will have returned false already. */
8761 return found;
8764 /* Count how many times the character C appears in
8765 NULL-terminated string STR. */
8767 static unsigned int
8768 num_occurences_in_str (char c, char *str)
8770 unsigned int res = 0;
8771 while (*str != '\0')
8773 if (*str == c)
8774 res++;
8776 str++;
8779 return res;
8782 /* Parse the tree in ARGS that contains the target attribute information
8783 and update the global target options space. PRAGMA_OR_ATTR is a string
8784 to be used in error messages, specifying whether this is processing
8785 a target attribute or a target pragma. */
8787 bool
8788 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
8790 if (TREE_CODE (args) == TREE_LIST)
8794 tree head = TREE_VALUE (args);
8795 if (head)
8797 if (!aarch64_process_target_attr (head, pragma_or_attr))
8798 return false;
8800 args = TREE_CHAIN (args);
8801 } while (args);
8803 return true;
8805 /* We expect to find a string to parse. */
8806 gcc_assert (TREE_CODE (args) == STRING_CST);
8808 size_t len = strlen (TREE_STRING_POINTER (args));
8809 char *str_to_check = (char *) alloca (len + 1);
8810 strcpy (str_to_check, TREE_STRING_POINTER (args));
8812 if (len == 0)
8814 error ("malformed target %s value", pragma_or_attr);
8815 return false;
8818 /* Used to catch empty entries between commas, e.g.
8819 attribute ((target ("attr1,,attr2"))). */
8820 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
8822 /* Handle multiple target attributes separated by ','. */
8823 char *token = strtok (str_to_check, ",");
8825 unsigned int num_attrs = 0;
8826 while (token)
8828 num_attrs++;
8829 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
8831 error ("target %s %qs is invalid", pragma_or_attr, token);
8832 return false;
8835 token = strtok (NULL, ",");
8838 if (num_attrs != num_commas + 1)
8840 error ("malformed target %s list %qs",
8841 pragma_or_attr, TREE_STRING_POINTER (args));
8842 return false;
8845 return true;
8848 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
8849 process attribute ((target ("..."))). */
8851 static bool
8852 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
8854 struct cl_target_option cur_target;
8855 bool ret;
8856 tree old_optimize;
8857 tree new_target, new_optimize;
8858 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
8860 /* If what we're processing is the current pragma string then the
8861 target option node is already stored in target_option_current_node
8862 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
8863 having to re-parse the string. This is especially useful to keep
8864 arm_neon.h compile times down since that header contains a lot
8865 of intrinsics enclosed in pragmas. */
8866 if (!existing_target && args == current_target_pragma)
8868 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
8869 return true;
8871 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
8873 old_optimize = build_optimization_node (&global_options);
8874 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
8876 /* If the function changed the optimization levels as well as setting
8877 target options, start with the optimizations specified. */
8878 if (func_optimize && func_optimize != old_optimize)
8879 cl_optimization_restore (&global_options,
8880 TREE_OPTIMIZATION (func_optimize));
8882 /* Save the current target options to restore at the end. */
8883 cl_target_option_save (&cur_target, &global_options);
8885 /* If fndecl already has some target attributes applied to it, unpack
8886 them so that we add this attribute on top of them, rather than
8887 overwriting them. */
8888 if (existing_target)
8890 struct cl_target_option *existing_options
8891 = TREE_TARGET_OPTION (existing_target);
8893 if (existing_options)
8894 cl_target_option_restore (&global_options, existing_options);
8896 else
8897 cl_target_option_restore (&global_options,
8898 TREE_TARGET_OPTION (target_option_current_node));
8901 ret = aarch64_process_target_attr (args, "attribute");
8903 /* Set up any additional state. */
8904 if (ret)
8906 aarch64_override_options_internal (&global_options);
8907 /* Initialize SIMD builtins if we haven't already.
8908 Set current_target_pragma to NULL for the duration so that
8909 the builtin initialization code doesn't try to tag the functions
8910 being built with the attributes specified by any current pragma, thus
8911 going into an infinite recursion. */
8912 if (TARGET_SIMD)
8914 tree saved_current_target_pragma = current_target_pragma;
8915 current_target_pragma = NULL;
8916 aarch64_init_simd_builtins ();
8917 current_target_pragma = saved_current_target_pragma;
8919 new_target = build_target_option_node (&global_options);
8921 else
8922 new_target = NULL;
8924 new_optimize = build_optimization_node (&global_options);
8926 if (fndecl && ret)
8928 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
8930 if (old_optimize != new_optimize)
8931 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
8934 cl_target_option_restore (&global_options, &cur_target);
8936 if (old_optimize != new_optimize)
8937 cl_optimization_restore (&global_options,
8938 TREE_OPTIMIZATION (old_optimize));
8939 return ret;
8942 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
8943 tri-bool options (yes, no, don't care) and the default value is
8944 DEF, determine whether to reject inlining. */
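/* For example (purely illustrative): with DONT_CARE == 2 and DEF == 1,
   a callee value of 2 always permits inlining, a callee value of 1 is
   accepted against any caller (it equals the default), and a callee
   value of 0 is accepted only when the caller is 0 or 2.  */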
8946 static bool
8947 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
8948 int dont_care, int def)
8950 /* If the callee doesn't care, always allow inlining. */
8951 if (callee == dont_care)
8952 return true;
8954 /* If the caller doesn't care, always allow inlining. */
8955 if (caller == dont_care)
8956 return true;
8958 /* Otherwise, allow inlining if the callee and caller values agree,
8959 or if the callee is using the default value. */
8960 return (callee == caller || callee == def);
8963 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
8964 to inline CALLEE into CALLER based on target-specific info.
8965 Make sure that the caller and callee have compatible architectural
8966 features. Then go through the other possible target attributes
8967 and see if they can block inlining. Try not to reject always_inline
8968 callees unless they are incompatible architecturally. */
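/* For instance (an illustrative sketch only): a callee compiled with
   target ("+crc") cannot be inlined into a caller whose ISA flags lack CRC,
   because the callee's ISA flags must be a subset of the caller's; and a
   callee built without -mstrict-align may be inlined into a -mstrict-align
   caller, but not the other way around.  */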
8970 static bool
8971 aarch64_can_inline_p (tree caller, tree callee)
8973 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
8974 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
8976 /* If callee has no option attributes, then it is ok to inline. */
8977 if (!callee_tree)
8978 return true;
8980 struct cl_target_option *caller_opts
8981 = TREE_TARGET_OPTION (caller_tree ? caller_tree
8982 : target_option_default_node);
8984 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
8987 /* Callee's ISA flags should be a subset of the caller's. */
8988 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
8989 != callee_opts->x_aarch64_isa_flags)
8990 return false;
8992 /* Allow non-strict-aligned functions to be inlined into strict-aligned
8993 ones, but not the reverse. */
8994 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
8995 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
8996 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
8997 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
8998 return false;
9000 bool always_inline = lookup_attribute ("always_inline",
9001 DECL_ATTRIBUTES (callee));
9003 /* If the architectural features match up and the callee is always_inline
9004 then the other attributes don't matter. */
9005 if (always_inline)
9006 return true;
9008 if (caller_opts->x_aarch64_cmodel_var
9009 != callee_opts->x_aarch64_cmodel_var)
9010 return false;
9012 if (caller_opts->x_aarch64_tls_dialect
9013 != callee_opts->x_aarch64_tls_dialect)
9014 return false;
9016 /* Honour explicit requests to work around errata. */
9017 if (!aarch64_tribools_ok_for_inlining_p (
9018 caller_opts->x_aarch64_fix_a53_err835769,
9019 callee_opts->x_aarch64_fix_a53_err835769,
9020 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9021 return false;
9023 if (!aarch64_tribools_ok_for_inlining_p (
9024 caller_opts->x_aarch64_fix_a53_err843419,
9025 callee_opts->x_aarch64_fix_a53_err843419,
9026 2, TARGET_FIX_ERR_A53_843419))
9027 return false;
9029 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9030 caller and callee and they don't match up, reject inlining. */
9031 if (!aarch64_tribools_ok_for_inlining_p (
9032 caller_opts->x_flag_omit_leaf_frame_pointer,
9033 callee_opts->x_flag_omit_leaf_frame_pointer,
9034 2, 1))
9035 return false;
9037 /* If the callee has specific tuning overrides, respect them. */
9038 if (callee_opts->x_aarch64_override_tune_string != NULL
9039 && caller_opts->x_aarch64_override_tune_string == NULL)
9040 return false;
9042 /* If the user specified tuning override strings for the
9043 caller and callee and they don't match up, reject inlining.
9044 We just do a string compare here, we don't analyze the meaning
9045 of the string, as it would be too costly for little gain. */
9046 if (callee_opts->x_aarch64_override_tune_string
9047 && caller_opts->x_aarch64_override_tune_string
9048 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9049 caller_opts->x_aarch64_override_tune_string) != 0))
9050 return false;
9052 return true;
9055 /* Return true if SYMBOL_REF X binds locally. */
9057 static bool
9058 aarch64_symbol_binds_local_p (const_rtx x)
9060 return (SYMBOL_REF_DECL (x)
9061 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9062 : SYMBOL_REF_LOCAL_P (x));
9065 /* Return true if SYMBOL_REF X is thread-local. */
9066 static bool
9067 aarch64_tls_symbol_p (rtx x)
9069 if (! TARGET_HAVE_TLS)
9070 return false;
9072 if (GET_CODE (x) != SYMBOL_REF)
9073 return false;
9075 return SYMBOL_REF_TLS_MODEL (x) != 0;
9078 /* Classify a TLS symbol into one of the TLS kinds. */
9079 enum aarch64_symbol_type
9080 aarch64_classify_tls_symbol (rtx x)
9082 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9084 switch (tls_kind)
9086 case TLS_MODEL_GLOBAL_DYNAMIC:
9087 case TLS_MODEL_LOCAL_DYNAMIC:
9088 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9090 case TLS_MODEL_INITIAL_EXEC:
9091 switch (aarch64_cmodel)
9093 case AARCH64_CMODEL_TINY:
9094 case AARCH64_CMODEL_TINY_PIC:
9095 return SYMBOL_TINY_TLSIE;
9096 default:
9097 return SYMBOL_SMALL_TLSIE;
9100 case TLS_MODEL_LOCAL_EXEC:
9101 if (aarch64_tls_size == 12)
9102 return SYMBOL_TLSLE12;
9103 else if (aarch64_tls_size == 24)
9104 return SYMBOL_TLSLE24;
9105 else if (aarch64_tls_size == 32)
9106 return SYMBOL_TLSLE32;
9107 else if (aarch64_tls_size == 48)
9108 return SYMBOL_TLSLE48;
9109 else
9110 gcc_unreachable ();
9112 case TLS_MODEL_EMULATED:
9113 case TLS_MODEL_NONE:
9114 return SYMBOL_FORCE_TO_MEM;
9116 default:
9117 gcc_unreachable ();
9121 /* Return the method that should be used to access SYMBOL_REF or
9122 LABEL_REF X. */
9124 enum aarch64_symbol_type
9125 aarch64_classify_symbol (rtx x, rtx offset)
9127 if (GET_CODE (x) == LABEL_REF)
9129 switch (aarch64_cmodel)
9131 case AARCH64_CMODEL_LARGE:
9132 return SYMBOL_FORCE_TO_MEM;
9134 case AARCH64_CMODEL_TINY_PIC:
9135 case AARCH64_CMODEL_TINY:
9136 return SYMBOL_TINY_ABSOLUTE;
9138 case AARCH64_CMODEL_SMALL_SPIC:
9139 case AARCH64_CMODEL_SMALL_PIC:
9140 case AARCH64_CMODEL_SMALL:
9141 return SYMBOL_SMALL_ABSOLUTE;
9143 default:
9144 gcc_unreachable ();
9148 if (GET_CODE (x) == SYMBOL_REF)
9150 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
9152 /* This is alright even in PIC code as the constant
9153 pool reference is always PC relative and within
9154 the same translation unit. */
9155 if (nopcrelative_literal_loads
9156 && CONSTANT_POOL_ADDRESS_P (x))
9157 return SYMBOL_SMALL_ABSOLUTE;
9158 else
9159 return SYMBOL_FORCE_TO_MEM;
9162 if (aarch64_tls_symbol_p (x))
9163 return aarch64_classify_tls_symbol (x);
9165 switch (aarch64_cmodel)
9167 case AARCH64_CMODEL_TINY:
9168 /* When we retrieve a symbol + offset address, we have to make sure
9169 the offset does not cause overflow of the final address. But
9170 we have no way of knowing the address of the symbol at compile time,
9171 so we can't accurately say whether the distance between the PC and
9172 symbol + offset is outside the addressable range of +/-1M in the
9173 TINY code model. So we rely on images not being greater than 1M
9174 and cap the offset at 1M; anything beyond 1M will have to be
9175 loaded using an alternative mechanism. */
9176 if (SYMBOL_REF_WEAK (x)
9177 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9178 return SYMBOL_FORCE_TO_MEM;
9179 return SYMBOL_TINY_ABSOLUTE;
9181 case AARCH64_CMODEL_SMALL:
9182 /* Same reasoning as the tiny code model, but the offset cap here is
9183 4G. */
9184 if (SYMBOL_REF_WEAK (x)
9185 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9186 HOST_WIDE_INT_C (4294967264)))
9187 return SYMBOL_FORCE_TO_MEM;
9188 return SYMBOL_SMALL_ABSOLUTE;
9190 case AARCH64_CMODEL_TINY_PIC:
9191 if (!aarch64_symbol_binds_local_p (x))
9192 return SYMBOL_TINY_GOT;
9193 return SYMBOL_TINY_ABSOLUTE;
9195 case AARCH64_CMODEL_SMALL_SPIC:
9196 case AARCH64_CMODEL_SMALL_PIC:
9197 if (!aarch64_symbol_binds_local_p (x))
9198 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9199 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9200 return SYMBOL_SMALL_ABSOLUTE;
9202 default:
9203 gcc_unreachable ();
9207 /* By default push everything into the constant pool. */
9208 return SYMBOL_FORCE_TO_MEM;
9211 bool
9212 aarch64_constant_address_p (rtx x)
9214 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9217 bool
9218 aarch64_legitimate_pic_operand_p (rtx x)
9220 if (GET_CODE (x) == SYMBOL_REF
9221 || (GET_CODE (x) == CONST
9222 && GET_CODE (XEXP (x, 0)) == PLUS
9223 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9224 return false;
9226 return true;
9229 /* Return true if X holds either a quarter-precision floating-point constant
9230 (i.e. one representable as an FMOV immediate) or floating-point +0.0. */
9231 static bool
9232 aarch64_valid_floating_const (machine_mode mode, rtx x)
9234 if (!CONST_DOUBLE_P (x))
9235 return false;
9237 if (aarch64_float_const_zero_rtx_p (x))
9238 return true;
9240 /* Only 0.0 (handled above) is supported for TFmode; any other constant must be SFmode or DFmode. */
9241 if (!(mode == SFmode || mode == DFmode))
9242 return false;
9244 return aarch64_float_const_representable_p (x);
9247 static bool
9248 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9250 /* Do not allow vector struct mode constants. We could support
9251 0 and -1 easily, but they need support in aarch64-simd.md. */
9252 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9253 return false;
9255 /* This could probably go away because
9256 we now decompose CONST_INTs according to expand_mov_immediate. */
9257 if ((GET_CODE (x) == CONST_VECTOR
9258 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9259 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9260 return !targetm.cannot_force_const_mem (mode, x);
9262 if (GET_CODE (x) == HIGH
9263 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9264 return true;
9266 return aarch64_constant_address_p (x);
9270 aarch64_load_tp (rtx target)
9272 if (!target
9273 || GET_MODE (target) != Pmode
9274 || !register_operand (target, Pmode))
9275 target = gen_reg_rtx (Pmode);
9277 /* Can return in any reg. */
9278 emit_insn (gen_aarch64_load_tp_hard (target));
9279 return target;
9282 /* On AAPCS systems, this is the "struct __va_list". */
9283 static GTY(()) tree va_list_type;
9285 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9286 Return the type to use as __builtin_va_list.
9288 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9290 struct __va_list
9292 void *__stack;
9293 void *__gr_top;
9294 void *__vr_top;
9295 int __gr_offs;
9296 int __vr_offs;
9297 }; */
9299 static tree
9300 aarch64_build_builtin_va_list (void)
9302 tree va_list_name;
9303 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9305 /* Create the type. */
9306 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9307 /* Give it the required name. */
9308 va_list_name = build_decl (BUILTINS_LOCATION,
9309 TYPE_DECL,
9310 get_identifier ("__va_list"),
9311 va_list_type);
9312 DECL_ARTIFICIAL (va_list_name) = 1;
9313 TYPE_NAME (va_list_type) = va_list_name;
9314 TYPE_STUB_DECL (va_list_type) = va_list_name;
9316 /* Create the fields. */
9317 f_stack = build_decl (BUILTINS_LOCATION,
9318 FIELD_DECL, get_identifier ("__stack"),
9319 ptr_type_node);
9320 f_grtop = build_decl (BUILTINS_LOCATION,
9321 FIELD_DECL, get_identifier ("__gr_top"),
9322 ptr_type_node);
9323 f_vrtop = build_decl (BUILTINS_LOCATION,
9324 FIELD_DECL, get_identifier ("__vr_top"),
9325 ptr_type_node);
9326 f_groff = build_decl (BUILTINS_LOCATION,
9327 FIELD_DECL, get_identifier ("__gr_offs"),
9328 integer_type_node);
9329 f_vroff = build_decl (BUILTINS_LOCATION,
9330 FIELD_DECL, get_identifier ("__vr_offs"),
9331 integer_type_node);
9333 /* Tell the tree-stdarg pass about our internal offset fields.
9334 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
9335 purposes, to identify whether the code is updating the va_list internal
9336 offset fields in an irregular way. */
9337 va_list_gpr_counter_field = f_groff;
9338 va_list_fpr_counter_field = f_vroff;
9340 DECL_ARTIFICIAL (f_stack) = 1;
9341 DECL_ARTIFICIAL (f_grtop) = 1;
9342 DECL_ARTIFICIAL (f_vrtop) = 1;
9343 DECL_ARTIFICIAL (f_groff) = 1;
9344 DECL_ARTIFICIAL (f_vroff) = 1;
9346 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9347 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9348 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9349 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9350 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9352 TYPE_FIELDS (va_list_type) = f_stack;
9353 DECL_CHAIN (f_stack) = f_grtop;
9354 DECL_CHAIN (f_grtop) = f_vrtop;
9355 DECL_CHAIN (f_vrtop) = f_groff;
9356 DECL_CHAIN (f_groff) = f_vroff;
9358 /* Compute its layout. */
9359 layout_type (va_list_type);
9361 return va_list_type;
9364 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
9365 static void
9366 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9368 const CUMULATIVE_ARGS *cum;
9369 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9370 tree stack, grtop, vrtop, groff, vroff;
9371 tree t;
9372 int gr_save_area_size = cfun->va_list_gpr_size;
9373 int vr_save_area_size = cfun->va_list_fpr_size;
9374 int vr_offset;
9376 cum = &crtl->args.info;
9377 if (cfun->va_list_gpr_size)
9378 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
9379 cfun->va_list_gpr_size);
9380 if (cfun->va_list_fpr_size)
9381 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
9382 * UNITS_PER_VREG, cfun->va_list_fpr_size);
9384 if (!TARGET_FLOAT)
9386 gcc_assert (cum->aapcs_nvrn == 0);
9387 vr_save_area_size = 0;
9390 f_stack = TYPE_FIELDS (va_list_type_node);
9391 f_grtop = DECL_CHAIN (f_stack);
9392 f_vrtop = DECL_CHAIN (f_grtop);
9393 f_groff = DECL_CHAIN (f_vrtop);
9394 f_vroff = DECL_CHAIN (f_groff);
9396 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9397 NULL_TREE);
9398 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9399 NULL_TREE);
9400 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9401 NULL_TREE);
9402 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9403 NULL_TREE);
9404 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9405 NULL_TREE);
9407 /* Emit code to initialize STACK, which points to the next varargs stack
9408 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
9409 by named arguments. STACK is 8-byte aligned. */
9410 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9411 if (cum->aapcs_stack_size > 0)
9412 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9413 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9414 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9416 /* Emit code to initialize GRTOP, the top of the GR save area.
9417 virtual_incoming_args_rtx should have been 16 byte aligned. */
9418 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9419 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9420 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9422 /* Emit code to initialize VRTOP, the top of the VR save area.
9423 This address is gr_save_area_bytes below GRTOP, rounded
9424 down to the next 16-byte boundary. */
9425 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9426 vr_offset = ROUND_UP (gr_save_area_size,
9427 STACK_BOUNDARY / BITS_PER_UNIT);
9429 if (vr_offset)
9430 t = fold_build_pointer_plus_hwi (t, -vr_offset);
9431 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9432 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9434 /* Emit code to initialize GROFF, the offset from GRTOP of the
9435 next GPR argument. */
9436 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9437 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9438 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9440 /* Likewise emit code to initialize VROFF, the offset from VRTOP
9441 of the next VR argument. */
9442 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9443 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9444 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
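/* A worked example (illustrative only, assuming the save areas are not
   limited by cfun->va_list_gpr_size/fpr_size): for void f (int n, ...)
   the single named argument consumes x0, so va_start records
   __gr_offs = -(8 - 1) * UNITS_PER_WORD = -56 and, when TARGET_FLOAT,
   __vr_offs = -8 * UNITS_PER_VREG = -128, with __gr_top and __vr_top
   pointing just past their respective register save areas.  */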
9447 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
9449 static tree
9450 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9451 gimple_seq *post_p ATTRIBUTE_UNUSED)
9453 tree addr;
9454 bool indirect_p;
9455 bool is_ha; /* is HFA or HVA. */
9456 bool dw_align; /* double-word align. */
9457 machine_mode ag_mode = VOIDmode;
9458 int nregs;
9459 machine_mode mode;
9461 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9462 tree stack, f_top, f_off, off, arg, roundup, on_stack;
9463 HOST_WIDE_INT size, rsize, adjust, align;
9464 tree t, u, cond1, cond2;
9466 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9467 if (indirect_p)
9468 type = build_pointer_type (type);
9470 mode = TYPE_MODE (type);
9472 f_stack = TYPE_FIELDS (va_list_type_node);
9473 f_grtop = DECL_CHAIN (f_stack);
9474 f_vrtop = DECL_CHAIN (f_grtop);
9475 f_groff = DECL_CHAIN (f_vrtop);
9476 f_vroff = DECL_CHAIN (f_groff);
9478 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9479 f_stack, NULL_TREE);
9480 size = int_size_in_bytes (type);
9481 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9483 dw_align = false;
9484 adjust = 0;
9485 if (aarch64_vfp_is_call_or_return_candidate (mode,
9486 type,
9487 &ag_mode,
9488 &nregs,
9489 &is_ha))
9491 /* TYPE passed in fp/simd registers. */
9492 if (!TARGET_FLOAT)
9493 aarch64_err_no_fpadvsimd (mode, "varargs");
9495 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9496 unshare_expr (valist), f_vrtop, NULL_TREE);
9497 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9498 unshare_expr (valist), f_vroff, NULL_TREE);
9500 rsize = nregs * UNITS_PER_VREG;
9502 if (is_ha)
9504 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9505 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9507 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9508 && size < UNITS_PER_VREG)
9510 adjust = UNITS_PER_VREG - size;
9513 else
9515 /* TYPE passed in general registers. */
9516 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9517 unshare_expr (valist), f_grtop, NULL_TREE);
9518 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9519 unshare_expr (valist), f_groff, NULL_TREE);
9520 rsize = ROUND_UP (size, UNITS_PER_WORD);
9521 nregs = rsize / UNITS_PER_WORD;
9523 if (align > 8)
9524 dw_align = true;
9526 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9527 && size < UNITS_PER_WORD)
9529 adjust = UNITS_PER_WORD - size;
9533 /* Get a local temporary for the field value. */
9534 off = get_initialized_tmp_var (f_off, pre_p, NULL);
9536 /* Emit code to branch if off >= 0. */
9537 t = build2 (GE_EXPR, boolean_type_node, off,
9538 build_int_cst (TREE_TYPE (off), 0));
9539 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9541 if (dw_align)
9543 /* Emit: offs = (offs + 15) & -16. */
9544 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9545 build_int_cst (TREE_TYPE (off), 15));
9546 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9547 build_int_cst (TREE_TYPE (off), -16));
9548 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9550 else
9551 roundup = NULL;
9553 /* Update ap.__[g|v]r_offs */
9554 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9555 build_int_cst (TREE_TYPE (off), rsize));
9556 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9558 /* String up. */
9559 if (roundup)
9560 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9562 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9563 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9564 build_int_cst (TREE_TYPE (f_off), 0));
9565 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9567 /* String up: make sure the assignment happens before the use. */
9568 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9569 COND_EXPR_ELSE (cond1) = t;
9571 /* Prepare the trees handling the argument that is passed on the stack;
9572 the top-level node will be stored in ON_STACK. */
9573 arg = get_initialized_tmp_var (stack, pre_p, NULL);
9574 if (align > 8)
9576 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9577 t = fold_convert (intDI_type_node, arg);
9578 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9579 build_int_cst (TREE_TYPE (t), 15));
9580 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9581 build_int_cst (TREE_TYPE (t), -16));
9582 t = fold_convert (TREE_TYPE (arg), t);
9583 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9585 else
9586 roundup = NULL;
9587 /* Advance ap.__stack */
9588 t = fold_convert (intDI_type_node, arg);
9589 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9590 build_int_cst (TREE_TYPE (t), size + 7));
9591 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9592 build_int_cst (TREE_TYPE (t), -8));
9593 t = fold_convert (TREE_TYPE (arg), t);
9594 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9595 /* String up roundup and advance. */
9596 if (roundup)
9597 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9598 /* String up with arg */
9599 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9600 /* Big-endianness related address adjustment. */
9601 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9602 && size < UNITS_PER_WORD)
9604 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9605 size_int (UNITS_PER_WORD - size));
9606 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9609 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9610 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9612 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9613 t = off;
9614 if (adjust)
9615 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9616 build_int_cst (TREE_TYPE (off), adjust));
9618 t = fold_convert (sizetype, t);
9619 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9621 if (is_ha)
9623 /* type ha; // treat as "struct {ftype field[n];}"
9624 ... [computing offs]
9625 for (i = 0; i < nregs; ++i, offs += 16)
9626 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9627 return ha; */
9628 int i;
9629 tree tmp_ha, field_t, field_ptr_t;
9631 /* Declare a local variable. */
9632 tmp_ha = create_tmp_var_raw (type, "ha");
9633 gimple_add_tmp_var (tmp_ha);
9635 /* Establish the base type. */
9636 switch (ag_mode)
9638 case SFmode:
9639 field_t = float_type_node;
9640 field_ptr_t = float_ptr_type_node;
9641 break;
9642 case DFmode:
9643 field_t = double_type_node;
9644 field_ptr_t = double_ptr_type_node;
9645 break;
9646 case TFmode:
9647 field_t = long_double_type_node;
9648 field_ptr_t = long_double_ptr_type_node;
9649 break;
9650 /* Half-precision and quad-precision types are not fully supported yet.
9651 Enable the following code once that support is complete; the correct
9652 type node for __fp16 * still needs to be found. */
9653 #if 0
9654 case HFmode:
9655 field_t = float_type_node;
9656 field_ptr_t = float_ptr_type_node;
9657 break;
9658 #endif
9659 case V2SImode:
9660 case V4SImode:
9662 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
9663 field_t = build_vector_type_for_mode (innertype, ag_mode);
9664 field_ptr_t = build_pointer_type (field_t);
9666 break;
9667 default:
9668 gcc_assert (0);
9671 /* *(field_ptr_t) &ha = *((field_ptr_t) vr_saved_area). */
9672 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
9673 addr = t;
9674 t = fold_convert (field_ptr_t, addr);
9675 t = build2 (MODIFY_EXPR, field_t,
9676 build1 (INDIRECT_REF, field_t, tmp_ha),
9677 build1 (INDIRECT_REF, field_t, t));
9679 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
9680 for (i = 1; i < nregs; ++i)
9682 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
9683 u = fold_convert (field_ptr_t, addr);
9684 u = build2 (MODIFY_EXPR, field_t,
9685 build2 (MEM_REF, field_t, tmp_ha,
9686 build_int_cst (field_ptr_t,
9687 (i *
9688 int_size_in_bytes (field_t)))),
9689 build1 (INDIRECT_REF, field_t, u));
9690 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
9693 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
9694 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
9697 COND_EXPR_ELSE (cond2) = t;
9698 addr = fold_convert (build_pointer_type (type), cond1);
9699 addr = build_va_arg_indirect_ref (addr);
9701 if (indirect_p)
9702 addr = build_va_arg_indirect_ref (addr);
9704 return addr;
9707 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
9709 static void
9710 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9711 tree type, int *pretend_size ATTRIBUTE_UNUSED,
9712 int no_rtl)
9714 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9715 CUMULATIVE_ARGS local_cum;
9716 int gr_saved = cfun->va_list_gpr_size;
9717 int vr_saved = cfun->va_list_fpr_size;
9719 /* The caller has advanced CUM up to, but not beyond, the last named
9720 argument. Advance a local copy of CUM past the last "real" named
9721 argument, to find out how many registers are left over. */
9722 local_cum = *cum;
9723 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
9725 /* Find out how many registers we need to save.
9726 Honor the tree-stdarg analysis results. */
9727 if (cfun->va_list_gpr_size)
9728 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
9729 cfun->va_list_gpr_size / UNITS_PER_WORD);
9730 if (cfun->va_list_fpr_size)
9731 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
9732 cfun->va_list_fpr_size / UNITS_PER_VREG);
9734 if (!TARGET_FLOAT)
9736 gcc_assert (local_cum.aapcs_nvrn == 0);
9737 vr_saved = 0;
9740 if (!no_rtl)
9742 if (gr_saved > 0)
9744 rtx ptr, mem;
9746 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
9747 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
9748 - gr_saved * UNITS_PER_WORD);
9749 mem = gen_frame_mem (BLKmode, ptr);
9750 set_mem_alias_set (mem, get_varargs_alias_set ());
9752 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
9753 mem, gr_saved);
9755 if (vr_saved > 0)
9757 /* We can't use move_block_from_reg, because it will use
9758 the wrong mode, storing D regs only. */
9759 machine_mode mode = TImode;
9760 int off, i, vr_start;
9762 /* Set OFF to the offset from virtual_incoming_args_rtx of
9763 the first vector register. The VR save area lies below
9764 the GR one, and is aligned to 16 bytes. */
9765 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
9766 STACK_BOUNDARY / BITS_PER_UNIT);
9767 off -= vr_saved * UNITS_PER_VREG;
9769 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
9770 for (i = 0; i < vr_saved; ++i)
9772 rtx ptr, mem;
9774 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
9775 mem = gen_frame_mem (mode, ptr);
9776 set_mem_alias_set (mem, get_varargs_alias_set ());
9777 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
9778 off += UNITS_PER_VREG;
9783 /* We don't save the size into *PRETEND_SIZE because we want to avoid
9784 the complication of having crtl->args.pretend_args_size change. */
9785 cfun->machine->frame.saved_varargs_size
9786 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
9787 STACK_BOUNDARY / BITS_PER_UNIT)
9788 + vr_saved * UNITS_PER_VREG);
9791 static void
9792 aarch64_conditional_register_usage (void)
9794 int i;
9795 if (!TARGET_FLOAT)
9797 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
9799 fixed_regs[i] = 1;
9800 call_used_regs[i] = 1;
9805 /* Walk down the type tree of TYPE counting consecutive base elements.
9806 If *MODEP is VOIDmode, then set it to the first valid floating point
9807 type. If a non-floating point type is found, or if a floating point
9808 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
9809 otherwise return the count in the sub-tree. */
9810 static int
9811 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
9813 machine_mode mode;
9814 HOST_WIDE_INT size;
9816 switch (TREE_CODE (type))
9818 case REAL_TYPE:
9819 mode = TYPE_MODE (type);
9820 if (mode != DFmode && mode != SFmode && mode != TFmode)
9821 return -1;
9823 if (*modep == VOIDmode)
9824 *modep = mode;
9826 if (*modep == mode)
9827 return 1;
9829 break;
9831 case COMPLEX_TYPE:
9832 mode = TYPE_MODE (TREE_TYPE (type));
9833 if (mode != DFmode && mode != SFmode && mode != TFmode)
9834 return -1;
9836 if (*modep == VOIDmode)
9837 *modep = mode;
9839 if (*modep == mode)
9840 return 2;
9842 break;
9844 case VECTOR_TYPE:
9845 /* Use V2SImode and V4SImode as representatives of all 64-bit
9846 and 128-bit vector types. */
9847 size = int_size_in_bytes (type);
9848 switch (size)
9850 case 8:
9851 mode = V2SImode;
9852 break;
9853 case 16:
9854 mode = V4SImode;
9855 break;
9856 default:
9857 return -1;
9860 if (*modep == VOIDmode)
9861 *modep = mode;
9863 /* Vector modes are considered to be opaque: two vectors are
9864 equivalent for the purposes of being homogeneous aggregates
9865 if they are the same size. */
9866 if (*modep == mode)
9867 return 1;
9869 break;
9871 case ARRAY_TYPE:
9873 int count;
9874 tree index = TYPE_DOMAIN (type);
9876 /* Can't handle incomplete types nor sizes that are not
9877 fixed. */
9878 if (!COMPLETE_TYPE_P (type)
9879 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
9880 return -1;
9882 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
9883 if (count == -1
9884 || !index
9885 || !TYPE_MAX_VALUE (index)
9886 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
9887 || !TYPE_MIN_VALUE (index)
9888 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
9889 || count < 0)
9890 return -1;
9892 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
9893 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
9895 /* There must be no padding. */
9896 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
9897 return -1;
9899 return count;
9902 case RECORD_TYPE:
9904 int count = 0;
9905 int sub_count;
9906 tree field;
9908 /* Can't handle incomplete types nor sizes that are not
9909 fixed. */
9910 if (!COMPLETE_TYPE_P (type)
9911 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
9912 return -1;
9914 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
9916 if (TREE_CODE (field) != FIELD_DECL)
9917 continue;
9919 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
9920 if (sub_count < 0)
9921 return -1;
9922 count += sub_count;
9925 /* There must be no padding. */
9926 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
9927 return -1;
9929 return count;
9932 case UNION_TYPE:
9933 case QUAL_UNION_TYPE:
9935 /* These aren't very interesting except in a degenerate case. */
9936 int count = 0;
9937 int sub_count;
9938 tree field;
9940 /* Can't handle incomplete types nor sizes that are not
9941 fixed. */
9942 if (!COMPLETE_TYPE_P (type)
9943 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
9944 return -1;
9946 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
9948 if (TREE_CODE (field) != FIELD_DECL)
9949 continue;
9951 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
9952 if (sub_count < 0)
9953 return -1;
9954 count = count > sub_count ? count : sub_count;
9957 /* There must be no padding. */
9958 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
9959 return -1;
9961 return count;
9964 default:
9965 break;
9968 return -1;
9971 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
9972 type as described in AAPCS64 \S 4.1.2.
9974 See the comment above aarch64_composite_type_p for the notes on MODE. */
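/* For example (illustrative only): the arm_neon.h types int32x2_t (8 bytes)
   and int32x4_t (16 bytes) are short vectors in this sense, while a 32-byte
   GNU vector type is not.  */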
9976 static bool
9977 aarch64_short_vector_p (const_tree type,
9978 machine_mode mode)
9980 HOST_WIDE_INT size = -1;
9982 if (type && TREE_CODE (type) == VECTOR_TYPE)
9983 size = int_size_in_bytes (type);
9984 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
9985 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
9986 size = GET_MODE_SIZE (mode);
9988 return (size == 8 || size == 16);
9991 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
9992 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
9993 array types. The C99 floating-point complex types are also considered
9994 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
9995 types, which are GCC extensions and out of the scope of AAPCS64, are
9996 treated as composite types here as well.
9998 Note that MODE itself is not sufficient in determining whether a type
9999 is such a composite type or not. This is because
10000 stor-layout.c:compute_record_mode may have already changed the MODE
10001 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10002 structure with only one field may have its MODE set to the mode of the
10003 field. Also an integer mode whose size matches the size of the
10004 RECORD_TYPE type may be used to substitute the original mode
10005 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10006 solely relied on. */
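/* For example (illustrative only): struct { float x; } may have its mode
   set to SFmode by compute_record_mode, yet it is still a composite type
   for AAPCS64 purposes, which is why the TYPE check below takes precedence
   over the mode checks.  */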
10008 static bool
10009 aarch64_composite_type_p (const_tree type,
10010 machine_mode mode)
10012 if (aarch64_short_vector_p (type, mode))
10013 return false;
10015 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10016 return true;
10018 if (mode == BLKmode
10019 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10020 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10021 return true;
10023 return false;
10026 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10027 shall be passed or returned in simd/fp register(s) (providing these
10028 parameter passing registers are available).
10030 Upon successful return, *COUNT returns the number of needed registers,
10031 *BASE_MODE returns the mode of the individual register, and when IS_HA
10032 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10033 floating-point aggregate or a homogeneous short-vector aggregate. */
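/* For example (illustrative only): struct { double x, y; } is a homogeneous
   floating-point aggregate with *COUNT == 2 and *BASE_MODE == DFmode,
   whereas struct { float a; double b; } mixes base types and is not a
   candidate.  */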
10035 static bool
10036 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10037 const_tree type,
10038 machine_mode *base_mode,
10039 int *count,
10040 bool *is_ha)
10042 machine_mode new_mode = VOIDmode;
10043 bool composite_p = aarch64_composite_type_p (type, mode);
10045 if (is_ha != NULL) *is_ha = false;
10047 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10048 || aarch64_short_vector_p (type, mode))
10050 *count = 1;
10051 new_mode = mode;
10053 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10055 if (is_ha != NULL) *is_ha = true;
10056 *count = 2;
10057 new_mode = GET_MODE_INNER (mode);
10059 else if (type && composite_p)
10061 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10063 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10065 if (is_ha != NULL) *is_ha = true;
10066 *count = ag_count;
10068 else
10069 return false;
10071 else
10072 return false;
10074 *base_mode = new_mode;
10075 return true;
10078 /* Implement TARGET_STRUCT_VALUE_RTX. */
10080 static rtx
10081 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10082 int incoming ATTRIBUTE_UNUSED)
10084 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10087 /* Implements target hook vector_mode_supported_p. */
10088 static bool
10089 aarch64_vector_mode_supported_p (machine_mode mode)
10091 if (TARGET_SIMD
10092 && (mode == V4SImode || mode == V8HImode
10093 || mode == V16QImode || mode == V2DImode
10094 || mode == V2SImode || mode == V4HImode
10095 || mode == V8QImode || mode == V2SFmode
10096 || mode == V4SFmode || mode == V2DFmode
10097 || mode == V4HFmode || mode == V8HFmode
10098 || mode == V1DFmode))
10099 return true;
10101 return false;
10104 /* Return appropriate SIMD container
10105 for MODE within a vector of WIDTH bits. */
10106 static machine_mode
10107 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10109 gcc_assert (width == 64 || width == 128);
10110 if (TARGET_SIMD)
10112 if (width == 128)
10113 switch (mode)
10115 case DFmode:
10116 return V2DFmode;
10117 case SFmode:
10118 return V4SFmode;
10119 case SImode:
10120 return V4SImode;
10121 case HImode:
10122 return V8HImode;
10123 case QImode:
10124 return V16QImode;
10125 case DImode:
10126 return V2DImode;
10127 default:
10128 break;
10130 else
10131 switch (mode)
10133 case SFmode:
10134 return V2SFmode;
10135 case SImode:
10136 return V2SImode;
10137 case HImode:
10138 return V4HImode;
10139 case QImode:
10140 return V8QImode;
10141 default:
10142 break;
10145 return word_mode;
10148 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10149 static machine_mode
10150 aarch64_preferred_simd_mode (machine_mode mode)
10152 return aarch64_simd_container_mode (mode, 128);
10155 /* Return the bitmask of possible vector sizes for the vectorizer
10156 to iterate over. */
10157 static unsigned int
10158 aarch64_autovectorize_vector_sizes (void)
10160 return (16 | 8);
10163 /* Implement TARGET_MANGLE_TYPE. */
10165 static const char *
10166 aarch64_mangle_type (const_tree type)
10168 /* The AArch64 ABI documents say that "__va_list" has to be
10169 mangled as if it were in the "std" namespace. */
10170 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10171 return "St9__va_list";
10173 /* Half-precision float. */
10174 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10175 return "Dh";
10177 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10178 builtin types. */
10179 if (TYPE_NAME (type) != NULL)
10180 return aarch64_mangle_builtin_type (type);
10182 /* Use the default mangling. */
10183 return NULL;
10187 /* Return true if the rtx_insn contains a MEM RTX somewhere
10188 in it. */
10190 static bool
10191 has_memory_op (rtx_insn *mem_insn)
10193 subrtx_iterator::array_type array;
10194 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10195 if (MEM_P (*iter))
10196 return true;
10198 return false;
10201 /* Find the first rtx_insn before insn that will generate an assembly
10202 instruction. */
10204 static rtx_insn *
10205 aarch64_prev_real_insn (rtx_insn *insn)
10207 if (!insn)
10208 return NULL;
10212 insn = prev_real_insn (insn);
10214 while (insn && recog_memoized (insn) < 0);
10216 return insn;
10219 static bool
10220 is_madd_op (enum attr_type t1)
10222 unsigned int i;
10223 /* A number of these may be AArch32 only. */
10224 enum attr_type mlatypes[] = {
10225 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10226 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10227 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10230 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10232 if (t1 == mlatypes[i])
10233 return true;
10236 return false;
10239 /* Check if there is a register dependency between a load and the insn
10240 for which we hold recog_data. */
10242 static bool
10243 dep_between_memop_and_curr (rtx memop)
10245 rtx load_reg;
10246 int opno;
10248 gcc_assert (GET_CODE (memop) == SET);
10250 if (!REG_P (SET_DEST (memop)))
10251 return false;
10253 load_reg = SET_DEST (memop);
10254 for (opno = 1; opno < recog_data.n_operands; opno++)
10256 rtx operand = recog_data.operand[opno];
10257 if (REG_P (operand)
10258 && reg_overlap_mentioned_p (load_reg, operand))
10259 return true;
10262 return false;
10266 /* When working around the Cortex-A53 erratum 835769,
10267 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10268 instruction and has a preceding memory instruction such that a NOP
10269 should be inserted between them. */
10271 bool
10272 aarch64_madd_needs_nop (rtx_insn* insn)
10274 enum attr_type attr_type;
10275 rtx_insn *prev;
10276 rtx body;
10278 if (!TARGET_FIX_ERR_A53_835769)
10279 return false;
10281 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10282 return false;
10284 attr_type = get_attr_type (insn);
10285 if (!is_madd_op (attr_type))
10286 return false;
10288 prev = aarch64_prev_real_insn (insn);
10289 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10290 Restore recog state to INSN to avoid state corruption. */
10291 extract_constrain_insn_cached (insn);
10293 if (!prev || !has_memory_op (prev))
10294 return false;
10296 body = single_set (prev);
10298 /* If the previous insn is a memory op and there is no dependency between
10299 it and the DImode madd, emit a NOP between them. If body is NULL then we
10300 have a complex memory operation, probably a load/store pair.
10301 Be conservative for now and emit a NOP. */
10302 if (GET_MODE (recog_data.operand[0]) == DImode
10303 && (!body || !dep_between_memop_and_curr (body)))
10304 return true;
10306 return false;
10311 /* Implement FINAL_PRESCAN_INSN. */
10313 void
10314 aarch64_final_prescan_insn (rtx_insn *insn)
10316 if (aarch64_madd_needs_nop (insn))
10317 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
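/* Illustrative example (register choice is arbitrary) of the effect of the
   workaround above when -mfix-cortex-a53-835769 is enabled:

       ldr  x1, [x2]
       nop                      // between mem op and mult-accumulate
       madd x0, x3, x4, x1
*/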
10321 /* Return the equivalent letter for size. */
10322 static char
10323 sizetochar (int size)
10325 switch (size)
10327 case 64: return 'd';
10328 case 32: return 's';
10329 case 16: return 'h';
10330 case 8 : return 'b';
10331 default: gcc_unreachable ();
10335 /* Return true iff x is a uniform vector of floating-point
10336 constants, and the constant can be represented in
10337 quarter-precision form. Note, as aarch64_float_const_representable_p
10338 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
10339 static bool
10340 aarch64_vect_float_const_representable_p (rtx x)
10342 rtx elt;
10343 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10344 && const_vec_duplicate_p (x, &elt)
10345 && aarch64_float_const_representable_p (elt));
10348 /* Return true for valid and false for invalid. */
10349 bool
10350 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10351 struct simd_immediate_info *info)
10353 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10354 matches = 1; \
10355 for (i = 0; i < idx; i += (STRIDE)) \
10356 if (!(TEST)) \
10357 matches = 0; \
10358 if (matches) \
10360 immtype = (CLASS); \
10361 elsize = (ELSIZE); \
10362 eshift = (SHIFT); \
10363 emvn = (NEG); \
10364 break; \
10367 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10368 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10369 unsigned char bytes[16];
10370 int immtype = -1, matches;
10371 unsigned int invmask = inverse ? 0xff : 0;
10372 int eshift, emvn;
10374 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10376 if (! (aarch64_simd_imm_zero_p (op, mode)
10377 || aarch64_vect_float_const_representable_p (op)))
10378 return false;
10380 if (info)
10382 info->value = CONST_VECTOR_ELT (op, 0);
10383 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10384 info->mvn = false;
10385 info->shift = 0;
10388 return true;
10391 /* Splat vector constant out into a byte vector. */
10392 for (i = 0; i < n_elts; i++)
10394 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
10395 it must be laid out in the vector register in reverse order. */
10396 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10397 unsigned HOST_WIDE_INT elpart;
10399 gcc_assert (CONST_INT_P (el));
10400 elpart = INTVAL (el);
10402 for (unsigned int byte = 0; byte < innersize; byte++)
10404 bytes[idx++] = (elpart & 0xff) ^ invmask;
10405 elpart >>= BITS_PER_UNIT;
10410 /* Sanity check. */
10411 gcc_assert (idx == GET_MODE_SIZE (mode));
10415 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10416 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10418 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10419 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10421 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10422 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10424 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10425 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10427 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10429 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10431 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10432 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10434 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10435 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10437 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10438 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10440 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10441 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10443 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10445 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10447 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10448 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10450 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10451 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10453 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10454 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10456 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10457 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10459 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10461 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10462 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10464 while (0);
10466 if (immtype == -1)
10467 return false;
10469 if (info)
10471 info->element_width = elsize;
10472 info->mvn = emvn != 0;
10473 info->shift = eshift;
10475 unsigned HOST_WIDE_INT imm = 0;
10477 if (immtype >= 12 && immtype <= 15)
10478 info->msl = true;
10480 /* Un-invert bytes of recognized vector, if necessary. */
10481 if (invmask != 0)
10482 for (i = 0; i < idx; i++)
10483 bytes[i] ^= invmask;
10485 if (immtype == 17)
10487 /* FIXME: Broken on 32-bit H_W_I hosts. */
10488 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10490 for (i = 0; i < 8; i++)
10491 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10492 << (i * BITS_PER_UNIT);
10495 info->value = GEN_INT (imm);
10497 else
10499 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10500 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10502 /* Construct 'abcdefgh' because the assembler cannot handle
10503 generic constants. */
10504 if (info->mvn)
10505 imm = ~imm;
10506 imm = (imm >> info->shift) & 0xff;
10507 info->value = GEN_INT (imm);
10511 return true;
10512 #undef CHECK
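/* Illustrative worked example for the CHECK table above: a V4SImode splat of
   0x4500 expands to the byte pattern 00 45 00 00 repeated four times, which
   matches the (4, 32, 1, ...) case.  The result is elsize == 32, eshift == 8
   and mvn == false, and the reported immediate is 0x45 to be emitted with an
   LSL #8 shift (see aarch64_output_simd_mov_immediate).  */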
10515 /* Check if immediate shift constants are within range. */
10516 bool
10517 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10519 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10520 if (left)
10521 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10522 else
10523 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
10526 /* Return true if X is a uniform vector where all elements
10527 are either the floating-point constant 0.0 or the
10528 integer constant 0. */
10529 bool
10530 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10532 return x == CONST0_RTX (mode);
10536 /* Return the bitmask CONST_INT to select the bits required by a zero extract
10537 operation of width WIDTH at bit position POS. */
10540 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
10542 gcc_assert (CONST_INT_P (width));
10543 gcc_assert (CONST_INT_P (pos));
10545 unsigned HOST_WIDE_INT mask
10546 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
10547 return GEN_INT (mask << UINTVAL (pos));
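/* Illustrative example: a zero_extract of WIDTH 8 at POS 16 yields the mask
   ((1 << 8) - 1) << 16 == 0xff0000, i.e. exactly the byte selected by the
   extract.  */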
10550 bool
10551 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10553 HOST_WIDE_INT imm = INTVAL (x);
10554 int i;
10556 for (i = 0; i < 8; i++)
10558 unsigned int byte = imm & 0xff;
10559 if (byte != 0xff && byte != 0)
10560 return false;
10561 imm >>= 8;
10564 return true;
10567 bool
10568 aarch64_mov_operand_p (rtx x, machine_mode mode)
10570 if (GET_CODE (x) == HIGH
10571 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10572 return true;
10574 if (CONST_INT_P (x))
10575 return true;
10577 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10578 return true;
10580 return aarch64_classify_symbolic_expression (x)
10581 == SYMBOL_TINY_ABSOLUTE;
10584 /* Return a const_int vector of VAL. */
10586 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10588 int nunits = GET_MODE_NUNITS (mode);
10589 rtvec v = rtvec_alloc (nunits);
10590 int i;
10592 for (i = 0; i < nunits; i++)
10593 RTVEC_ELT (v, i) = GEN_INT (val);
10595 return gen_rtx_CONST_VECTOR (mode, v);
10598 /* Check OP is a legal scalar immediate for the MOVI instruction. */
10600 bool
10601 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10603 machine_mode vmode;
10605 gcc_assert (!VECTOR_MODE_P (mode));
10606 vmode = aarch64_preferred_simd_mode (mode);
10607 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10608 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10611 /* Construct and return a PARALLEL RTX vector with elements numbering the
10612 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10613 the vector - from the perspective of the architecture. This does not
10614 line up with GCC's perspective on lane numbers, so we end up with
10615 different masks depending on our target endian-ness. The diagram
10616 below may help. We must draw the distinction when building masks
10617 which select one half of the vector. An instruction selecting
10618 architectural low-lanes for a big-endian target must be described using
10619 a mask selecting GCC high-lanes.
10621 Big-Endian Little-Endian
10623 GCC 0 1 2 3 3 2 1 0
10624 | x | x | x | x | | x | x | x | x |
10625 Architecture 3 2 1 0 3 2 1 0
10627 Low Mask: { 2, 3 } { 0, 1 }
10628 High Mask: { 0, 1 } { 2, 3 }
10632 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10634 int nunits = GET_MODE_NUNITS (mode);
10635 rtvec v = rtvec_alloc (nunits / 2);
10636 int high_base = nunits / 2;
10637 int low_base = 0;
10638 int base;
10639 rtx t1;
10640 int i;
10642 if (BYTES_BIG_ENDIAN)
10643 base = high ? low_base : high_base;
10644 else
10645 base = high ? high_base : low_base;
10647 for (i = 0; i < nunits / 2; i++)
10648 RTVEC_ELT (v, i) = GEN_INT (base + i);
10650 t1 = gen_rtx_PARALLEL (mode, v);
10651 return t1;
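/* Illustrative example: for V4SImode with HIGH == true this returns
   (parallel [2 3]) on little-endian but (parallel [0 1]) on big-endian,
   matching the "High Mask" row of the diagram above.  */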
10654 /* Check OP for validity as a PARALLEL RTX vector with elements
10655 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
10656 from the perspective of the architecture. See the diagram above
10657 aarch64_simd_vect_par_cnst_half for more details. */
10659 bool
10660 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
10661 bool high)
10663 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
10664 HOST_WIDE_INT count_op = XVECLEN (op, 0);
10665 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
10666 int i = 0;
10668 if (!VECTOR_MODE_P (mode))
10669 return false;
10671 if (count_op != count_ideal)
10672 return false;
10674 for (i = 0; i < count_ideal; i++)
10676 rtx elt_op = XVECEXP (op, 0, i);
10677 rtx elt_ideal = XVECEXP (ideal, 0, i);
10679 if (!CONST_INT_P (elt_op)
10680 || INTVAL (elt_ideal) != INTVAL (elt_op))
10681 return false;
10683 return true;
10686 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
10687 HIGH (exclusive). */
10688 void
10689 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
10690 const_tree exp)
10692 HOST_WIDE_INT lane;
10693 gcc_assert (CONST_INT_P (operand));
10694 lane = INTVAL (operand);
10696 if (lane < low || lane >= high)
10698 if (exp)
10699 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
10700 else
10701 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
10705 /* Return TRUE if OP is a valid vector addressing mode. */
10706 bool
10707 aarch64_simd_mem_operand_p (rtx op)
10709 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
10710 || REG_P (XEXP (op, 0)));
10713 /* Emit a register copy from operand to operand, taking care not to
10714 early-clobber source registers in the process.
10716 COUNT is the number of components into which the copy needs to be
10717 decomposed. */
10718 void
10719 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
10720 unsigned int count)
10722 unsigned int i;
10723 int rdest = REGNO (operands[0]);
10724 int rsrc = REGNO (operands[1]);
10726 if (!reg_overlap_mentioned_p (operands[0], operands[1])
10727 || rdest < rsrc)
10728 for (i = 0; i < count; i++)
10729 emit_move_insn (gen_rtx_REG (mode, rdest + i),
10730 gen_rtx_REG (mode, rsrc + i));
10731 else
10732 for (i = 0; i < count; i++)
10733 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
10734 gen_rtx_REG (mode, rsrc + count - i - 1));
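/* Illustrative note: the copy direction only matters when the source and
   destination ranges overlap.  Copying a two-register value from V1-V2 to
   V2-V3 must move V2 into V3 before V1 clobbers V2, which is why the
   overlapping rdest > rsrc case above iterates from the highest component
   downwards.  */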
10737 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
10738 one of VSTRUCT modes: OI, CI, or XI. */
10740 aarch64_simd_attr_length_rglist (enum machine_mode mode)
10742 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
10745 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
10746 alignment of a vector to 128 bits. */
10747 static HOST_WIDE_INT
10748 aarch64_simd_vector_alignment (const_tree type)
10750 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
10751 return MIN (align, 128);
10754 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
10755 static bool
10756 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
10758 if (is_packed)
10759 return false;
10761 /* We guarantee alignment for vectors up to 128-bits. */
10762 if (tree_int_cst_compare (TYPE_SIZE (type),
10763 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
10764 return false;
10766 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
10767 return true;
10770 /* If VALS is a vector constant that can be loaded into a register
10771 using DUP, generate instructions to do so and return an RTX to
10772 assign to the register. Otherwise return NULL_RTX. */
10773 static rtx
10774 aarch64_simd_dup_constant (rtx vals)
10776 machine_mode mode = GET_MODE (vals);
10777 machine_mode inner_mode = GET_MODE_INNER (mode);
10778 rtx x;
10780 if (!const_vec_duplicate_p (vals, &x))
10781 return NULL_RTX;
10783 /* We can load this constant by using DUP and a constant in a
10784 single general-purpose register. This will be cheaper than a vector
10785 load. */
10786 x = copy_to_mode_reg (inner_mode, x);
10787 return gen_rtx_VEC_DUPLICATE (mode, x);
10791 /* Generate code to load VALS, which is a PARALLEL containing only
10792 constants (for vec_init) or CONST_VECTOR, efficiently into a
10793 register. Returns an RTX to copy into the register, or NULL_RTX
10794 for a PARALLEL that can not be converted into a CONST_VECTOR. */
10795 static rtx
10796 aarch64_simd_make_constant (rtx vals)
10798 machine_mode mode = GET_MODE (vals);
10799 rtx const_dup;
10800 rtx const_vec = NULL_RTX;
10801 int n_elts = GET_MODE_NUNITS (mode);
10802 int n_const = 0;
10803 int i;
10805 if (GET_CODE (vals) == CONST_VECTOR)
10806 const_vec = vals;
10807 else if (GET_CODE (vals) == PARALLEL)
10809 /* A CONST_VECTOR must contain only CONST_INTs and
10810 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
10811 Only store valid constants in a CONST_VECTOR. */
10812 for (i = 0; i < n_elts; ++i)
10814 rtx x = XVECEXP (vals, 0, i);
10815 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
10816 n_const++;
10818 if (n_const == n_elts)
10819 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
10821 else
10822 gcc_unreachable ();
10824 if (const_vec != NULL_RTX
10825 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
10826 /* Load using MOVI/MVNI. */
10827 return const_vec;
10828 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
10829 /* Loaded using DUP. */
10830 return const_dup;
10831 else if (const_vec != NULL_RTX)
10832 /* Load from constant pool. We can not take advantage of single-cycle
10833 LD1 because we need a PC-relative addressing mode. */
10834 return const_vec;
10835 else
10836 /* A PARALLEL containing something not valid inside CONST_VECTOR.
10837 We can not construct an initializer. */
10838 return NULL_RTX;
10841 /* Expand a vector initialisation sequence, such that TARGET is
10842 initialised to contain VALS. */
10844 void
10845 aarch64_expand_vector_init (rtx target, rtx vals)
10847 machine_mode mode = GET_MODE (target);
10848 machine_mode inner_mode = GET_MODE_INNER (mode);
10849 /* The number of vector elements. */
10850 int n_elts = GET_MODE_NUNITS (mode);
10851 /* The number of vector elements which are not constant. */
10852 int n_var = 0;
10853 rtx any_const = NULL_RTX;
10854 /* The first element of vals. */
10855 rtx v0 = XVECEXP (vals, 0, 0);
10856 bool all_same = true;
10858 /* Count the number of variable elements to initialise. */
10859 for (int i = 0; i < n_elts; ++i)
10861 rtx x = XVECEXP (vals, 0, i);
10862 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
10863 ++n_var;
10864 else
10865 any_const = x;
10867 all_same &= rtx_equal_p (x, v0);
10870 /* No variable elements, hand off to aarch64_simd_make_constant which knows
10871 how best to handle this. */
10872 if (n_var == 0)
10874 rtx constant = aarch64_simd_make_constant (vals);
10875 if (constant != NULL_RTX)
10877 emit_move_insn (target, constant);
10878 return;
10882 /* Splat a single non-constant element if we can. */
10883 if (all_same)
10885 rtx x = copy_to_mode_reg (inner_mode, v0);
10886 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
10887 return;
10890 /* Initialise a vector which is part-variable. We want to first try
10891 to build those lanes which are constant in the most efficient way we
10892 can. */
10893 if (n_var != n_elts)
10895 rtx copy = copy_rtx (vals);
10897 /* Load constant part of vector. We really don't care what goes into the
10898 parts we will overwrite, but we're more likely to be able to load the
10899 constant efficiently if it has fewer, larger, repeating parts
10900 (see aarch64_simd_valid_immediate). */
10901 for (int i = 0; i < n_elts; i++)
10903 rtx x = XVECEXP (vals, 0, i);
10904 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
10905 continue;
10906 rtx subst = any_const;
10907 for (int bit = n_elts / 2; bit > 0; bit /= 2)
10909 /* Look in the copied vector, as more elements are const. */
10910 rtx test = XVECEXP (copy, 0, i ^ bit);
10911 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
10913 subst = test;
10914 break;
10917 XVECEXP (copy, 0, i) = subst;
10919 aarch64_expand_vector_init (target, copy);
10922 /* Insert the variable lanes directly. */
10924 enum insn_code icode = optab_handler (vec_set_optab, mode);
10925 gcc_assert (icode != CODE_FOR_nothing);
10927 for (int i = 0; i < n_elts; i++)
10929 rtx x = XVECEXP (vals, 0, i);
10930 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
10931 continue;
10932 x = copy_to_mode_reg (inner_mode, x);
10933 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
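/* Illustrative example: initialising a V4SImode vector { x, 1, 2, 3 } with a
   variable x first materialises the constant vector { 2, 1, 2, 3 } (the
   i ^ bit search above lets lane 0 borrow the constant from lane 2), and
   then overwrites lane 0 with x through the vec_set pattern.  */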
10937 static unsigned HOST_WIDE_INT
10938 aarch64_shift_truncation_mask (machine_mode mode)
10940 return
10941 (!SHIFT_COUNT_TRUNCATED
10942 || aarch64_vector_mode_supported_p (mode)
10943 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
10946 /* Select a format to encode pointers in exception handling data. */
10948 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
10950 int type;
10951 switch (aarch64_cmodel)
10953 case AARCH64_CMODEL_TINY:
10954 case AARCH64_CMODEL_TINY_PIC:
10955 case AARCH64_CMODEL_SMALL:
10956 case AARCH64_CMODEL_SMALL_PIC:
10957 case AARCH64_CMODEL_SMALL_SPIC:
10958 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
10959 for everything. */
10960 type = DW_EH_PE_sdata4;
10961 break;
10962 default:
10963 /* No assumptions here. 8-byte relocs required. */
10964 type = DW_EH_PE_sdata8;
10965 break;
10967 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
10970 /* The last .arch and .tune assembly strings that we printed. */
10971 static std::string aarch64_last_printed_arch_string;
10972 static std::string aarch64_last_printed_tune_string;
10974 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
10975 by the function fndecl. */
10977 void
10978 aarch64_declare_function_name (FILE *stream, const char* name,
10979 tree fndecl)
10981 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10983 struct cl_target_option *targ_options;
10984 if (target_parts)
10985 targ_options = TREE_TARGET_OPTION (target_parts);
10986 else
10987 targ_options = TREE_TARGET_OPTION (target_option_current_node);
10988 gcc_assert (targ_options);
10990 const struct processor *this_arch
10991 = aarch64_get_arch (targ_options->x_explicit_arch);
10993 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
10994 std::string extension
10995 = aarch64_get_extension_string_for_isa_flags (isa_flags,
10996 this_arch->flags);
10997 /* Only update the assembler .arch string if it is distinct from the last
10998 such string we printed. */
10999 std::string to_print = this_arch->name + extension;
11000 if (to_print != aarch64_last_printed_arch_string)
11002 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11003 aarch64_last_printed_arch_string = to_print;
11006 /* Print the cpu name we're tuning for in the comments; it might be
11007 useful to readers of the generated asm. Do it only when it changes
11008 from function to function and verbose assembly is requested. */
11009 const struct processor *this_tune
11010 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11012 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11014 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11015 this_tune->name);
11016 aarch64_last_printed_tune_string = this_tune->name;
11019 /* Don't forget the type directive for ELF. */
11020 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11021 ASM_OUTPUT_LABEL (stream, name);
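/* Illustrative example: a function compiled with something like
     __attribute__ ((target ("arch=armv8-a+crc")))
   causes a fresh "\t.arch armv8-a+crc" directive to be printed before its
   label, while subsequent functions that resolve to the same architecture
   string print nothing thanks to the aarch64_last_printed_arch_string
   cache.  */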
11024 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11026 static void
11027 aarch64_start_file (void)
11029 struct cl_target_option *default_options
11030 = TREE_TARGET_OPTION (target_option_default_node);
11032 const struct processor *default_arch
11033 = aarch64_get_arch (default_options->x_explicit_arch);
11034 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11035 std::string extension
11036 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11037 default_arch->flags);
11039 aarch64_last_printed_arch_string = default_arch->name + extension;
11040 aarch64_last_printed_tune_string = "";
11041 asm_fprintf (asm_out_file, "\t.arch %s\n",
11042 aarch64_last_printed_arch_string.c_str ());
11044 default_file_start ();
11047 /* Emit load exclusive. */
11049 static void
11050 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11051 rtx mem, rtx model_rtx)
11053 rtx (*gen) (rtx, rtx, rtx);
11055 switch (mode)
11057 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11058 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11059 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11060 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11061 default:
11062 gcc_unreachable ();
11065 emit_insn (gen (rval, mem, model_rtx));
11068 /* Emit store exclusive. */
11070 static void
11071 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11072 rtx rval, rtx mem, rtx model_rtx)
11074 rtx (*gen) (rtx, rtx, rtx, rtx);
11076 switch (mode)
11078 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11079 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11080 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11081 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11082 default:
11083 gcc_unreachable ();
11086 emit_insn (gen (bval, rval, mem, model_rtx));
11089 /* Mark the previous jump instruction as unlikely. */
11091 static void
11092 aarch64_emit_unlikely_jump (rtx insn)
11094 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11096 insn = emit_jump_insn (insn);
11097 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
11100 /* Expand a compare and swap pattern. */
11102 void
11103 aarch64_expand_compare_and_swap (rtx operands[])
11105 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11106 machine_mode mode, cmp_mode;
11107 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11108 int idx;
11109 gen_cas_fn gen;
11110 const gen_cas_fn split_cas[] =
11112 gen_aarch64_compare_and_swapqi,
11113 gen_aarch64_compare_and_swaphi,
11114 gen_aarch64_compare_and_swapsi,
11115 gen_aarch64_compare_and_swapdi
11117 const gen_cas_fn atomic_cas[] =
11119 gen_aarch64_compare_and_swapqi_lse,
11120 gen_aarch64_compare_and_swaphi_lse,
11121 gen_aarch64_compare_and_swapsi_lse,
11122 gen_aarch64_compare_and_swapdi_lse
11125 bval = operands[0];
11126 rval = operands[1];
11127 mem = operands[2];
11128 oldval = operands[3];
11129 newval = operands[4];
11130 is_weak = operands[5];
11131 mod_s = operands[6];
11132 mod_f = operands[7];
11133 mode = GET_MODE (mem);
11134 cmp_mode = mode;
11136 /* Normally the succ memory model must be stronger than fail, but in the
11137 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11138 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11140 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11141 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11142 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11144 switch (mode)
11146 case QImode:
11147 case HImode:
11148 /* For short modes, we're going to perform the comparison in SImode,
11149 so do the zero-extension now. */
11150 cmp_mode = SImode;
11151 rval = gen_reg_rtx (SImode);
11152 oldval = convert_modes (SImode, mode, oldval, true);
11153 /* Fall through. */
11155 case SImode:
11156 case DImode:
11157 /* Force the value into a register if needed. */
11158 if (!aarch64_plus_operand (oldval, mode))
11159 oldval = force_reg (cmp_mode, oldval);
11160 break;
11162 default:
11163 gcc_unreachable ();
11166 switch (mode)
11168 case QImode: idx = 0; break;
11169 case HImode: idx = 1; break;
11170 case SImode: idx = 2; break;
11171 case DImode: idx = 3; break;
11172 default:
11173 gcc_unreachable ();
11175 if (TARGET_LSE)
11176 gen = atomic_cas[idx];
11177 else
11178 gen = split_cas[idx];
11180 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11182 if (mode == QImode || mode == HImode)
11183 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11185 x = gen_rtx_REG (CCmode, CC_REGNUM);
11186 x = gen_rtx_EQ (SImode, x, const0_rtx);
11187 emit_insn (gen_rtx_SET (bval, x));
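/* Illustrative note: a user-level call such as
     __atomic_compare_exchange_n (&v, &expected, desired, 0,
                                  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
   reaches this expander.  With +lse it selects the single-instruction
   aarch64_compare_and_swap<mode>_lse patterns; otherwise the chosen pattern
   is later decomposed by aarch64_split_compare_and_swap below.  */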
11190 /* Test whether the target supports using an atomic load-operate instruction
11191 for operation CODE. Returns FALSE if the operation isn't supported by the
11192 architecture. */
11196 bool
11197 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11199 if (!TARGET_LSE)
11200 return false;
11202 switch (code)
11204 case SET:
11205 case AND:
11206 case IOR:
11207 case XOR:
11208 case MINUS:
11209 case PLUS:
11210 return true;
11211 default:
11212 return false;
11216 /* Emit a barrier appropriate for memory model MODEL at the end of a
11217 sequence implementing an atomic operation. */
11219 static void
11220 aarch64_emit_post_barrier (enum memmodel model)
11222 const enum memmodel base_model = memmodel_base (model);
11224 if (is_mm_sync (model)
11225 && (base_model == MEMMODEL_ACQUIRE
11226 || base_model == MEMMODEL_ACQ_REL
11227 || base_model == MEMMODEL_SEQ_CST))
11229 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11233 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11234 for the data in memory. EXPECTED is the value expected to be in memory.
11235 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11236 is the memory ordering to use. */
11238 void
11239 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11240 rtx expected, rtx desired,
11241 rtx model)
11243 rtx (*gen) (rtx, rtx, rtx, rtx);
11244 machine_mode mode;
11246 mode = GET_MODE (mem);
11248 switch (mode)
11250 case QImode: gen = gen_aarch64_atomic_casqi; break;
11251 case HImode: gen = gen_aarch64_atomic_cashi; break;
11252 case SImode: gen = gen_aarch64_atomic_cassi; break;
11253 case DImode: gen = gen_aarch64_atomic_casdi; break;
11254 default:
11255 gcc_unreachable ();
11258 /* Move the expected value into the CAS destination register. */
11259 emit_insn (gen_rtx_SET (rval, expected));
11261 /* Emit the CAS. */
11262 emit_insn (gen (rval, mem, desired, model));
11264 /* Compare the expected value with the value loaded by the CAS, to establish
11265 whether the swap was made. */
11266 aarch64_gen_compare_reg (EQ, rval, expected);
11269 /* Split a compare and swap pattern. */
11271 void
11272 aarch64_split_compare_and_swap (rtx operands[])
11274 rtx rval, mem, oldval, newval, scratch;
11275 machine_mode mode;
11276 bool is_weak;
11277 rtx_code_label *label1, *label2;
11278 rtx x, cond;
11279 enum memmodel model;
11280 rtx model_rtx;
11282 rval = operands[0];
11283 mem = operands[1];
11284 oldval = operands[2];
11285 newval = operands[3];
11286 is_weak = (operands[4] != const0_rtx);
11287 model_rtx = operands[5];
11288 scratch = operands[7];
11289 mode = GET_MODE (mem);
11290 model = memmodel_from_int (INTVAL (model_rtx));
11292 label1 = NULL;
11293 if (!is_weak)
11295 label1 = gen_label_rtx ();
11296 emit_label (label1);
11298 label2 = gen_label_rtx ();
11300 /* The initial load can be relaxed for a __sync operation since a final
11301 barrier will be emitted to stop code hoisting. */
11302 if (is_mm_sync (model))
11303 aarch64_emit_load_exclusive (mode, rval, mem,
11304 GEN_INT (MEMMODEL_RELAXED));
11305 else
11306 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11308 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11309 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11310 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11311 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11312 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11314 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11316 if (!is_weak)
11318 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11319 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11320 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11321 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11323 else
11325 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11326 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11327 emit_insn (gen_rtx_SET (cond, x));
11330 emit_label (label2);
11332 /* Emit any final barrier needed for a __sync operation. */
11333 if (is_mm_sync (model))
11334 aarch64_emit_post_barrier (model);
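/* Illustrative sketch (register allocation is arbitrary) of the sequence the
   split above produces for a strong SEQ_CST SImode compare-and-swap without
   LSE:

     .Lretry:
       ldaxr  w0, [x1]          // load-acquire exclusive of the current value
       cmp    w0, w2            // compare against the expected value
       bne    .Ldone            // mismatch: fail
       stlxr  w3, w4, [x1]      // store-release exclusive of the new value
       cbnz   w3, .Lretry       // lost the exclusive monitor: retry
     .Ldone:
*/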
11337 /* Emit a BIC instruction. */
11339 static void
11340 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11342 rtx shift_rtx = GEN_INT (shift);
11343 rtx (*gen) (rtx, rtx, rtx, rtx);
11345 switch (mode)
11347 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11348 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11349 default:
11350 gcc_unreachable ();
11353 emit_insn (gen (dst, s2, shift_rtx, s1));
11356 /* Emit an atomic swap. */
11358 static void
11359 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11360 rtx mem, rtx model)
11362 rtx (*gen) (rtx, rtx, rtx, rtx);
11364 switch (mode)
11366 case QImode: gen = gen_aarch64_atomic_swpqi; break;
11367 case HImode: gen = gen_aarch64_atomic_swphi; break;
11368 case SImode: gen = gen_aarch64_atomic_swpsi; break;
11369 case DImode: gen = gen_aarch64_atomic_swpdi; break;
11370 default:
11371 gcc_unreachable ();
11374 emit_insn (gen (dst, mem, value, model));
11377 /* Operations supported by aarch64_emit_atomic_load_op. */
11379 enum aarch64_atomic_load_op_code
11381 AARCH64_LDOP_PLUS, /* A + B */
11382 AARCH64_LDOP_XOR, /* A ^ B */
11383 AARCH64_LDOP_OR, /* A | B */
11384 AARCH64_LDOP_BIC /* A & ~B */
11387 /* Emit an atomic load-operate. */
11389 static void
11390 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11391 machine_mode mode, rtx dst, rtx src,
11392 rtx mem, rtx model)
11394 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11395 const aarch64_atomic_load_op_fn plus[] =
11397 gen_aarch64_atomic_loadaddqi,
11398 gen_aarch64_atomic_loadaddhi,
11399 gen_aarch64_atomic_loadaddsi,
11400 gen_aarch64_atomic_loadadddi
11402 const aarch64_atomic_load_op_fn eor[] =
11404 gen_aarch64_atomic_loadeorqi,
11405 gen_aarch64_atomic_loadeorhi,
11406 gen_aarch64_atomic_loadeorsi,
11407 gen_aarch64_atomic_loadeordi
11409 const aarch64_atomic_load_op_fn ior[] =
11411 gen_aarch64_atomic_loadsetqi,
11412 gen_aarch64_atomic_loadsethi,
11413 gen_aarch64_atomic_loadsetsi,
11414 gen_aarch64_atomic_loadsetdi
11416 const aarch64_atomic_load_op_fn bic[] =
11418 gen_aarch64_atomic_loadclrqi,
11419 gen_aarch64_atomic_loadclrhi,
11420 gen_aarch64_atomic_loadclrsi,
11421 gen_aarch64_atomic_loadclrdi
11423 aarch64_atomic_load_op_fn gen;
11424 int idx = 0;
11426 switch (mode)
11428 case QImode: idx = 0; break;
11429 case HImode: idx = 1; break;
11430 case SImode: idx = 2; break;
11431 case DImode: idx = 3; break;
11432 default:
11433 gcc_unreachable ();
11436 switch (code)
11438 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11439 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11440 case AARCH64_LDOP_OR: gen = ior[idx]; break;
11441 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11442 default:
11443 gcc_unreachable ();
11446 emit_insn (gen (dst, mem, src, model));
11449 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
11450 location to store the data read from memory. OUT_RESULT is the location to
11451 store the result of the operation. MEM is the memory location to read and
11452 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
11453 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
11454 be NULL. */
11456 void
11457 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11458 rtx mem, rtx value, rtx model_rtx)
11460 machine_mode mode = GET_MODE (mem);
11461 machine_mode wmode = (mode == DImode ? DImode : SImode);
11462 const bool short_mode = (mode < SImode);
11463 aarch64_atomic_load_op_code ldop_code;
11464 rtx src;
11465 rtx x;
11467 if (out_data)
11468 out_data = gen_lowpart (mode, out_data);
11470 if (out_result)
11471 out_result = gen_lowpart (mode, out_result);
11473 /* Make sure the value is in a register, putting it into a destination
11474 register if it needs to be manipulated. */
11475 if (!register_operand (value, mode)
11476 || code == AND || code == MINUS)
11478 src = out_result ? out_result : out_data;
11479 emit_move_insn (src, gen_lowpart (mode, value));
11481 else
11482 src = value;
11483 gcc_assert (register_operand (src, mode));
11485 /* Preprocess the data for the operation as necessary. If the operation is
11486 a SET then emit a swap instruction and finish. */
11487 switch (code)
11489 case SET:
11490 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11491 return;
11493 case MINUS:
11494 /* Negate the value and treat it as a PLUS. */
11496 rtx neg_src;
11498 /* Resize the value if necessary. */
11499 if (short_mode)
11500 src = gen_lowpart (wmode, src);
11502 neg_src = gen_rtx_NEG (wmode, src);
11503 emit_insn (gen_rtx_SET (src, neg_src));
11505 if (short_mode)
11506 src = gen_lowpart (mode, src);
11508 /* Fall-through. */
11509 case PLUS:
11510 ldop_code = AARCH64_LDOP_PLUS;
11511 break;
11513 case IOR:
11514 ldop_code = AARCH64_LDOP_OR;
11515 break;
11517 case XOR:
11518 ldop_code = AARCH64_LDOP_XOR;
11519 break;
11521 case AND:
11523 rtx not_src;
11525 /* Resize the value if necessary. */
11526 if (short_mode)
11527 src = gen_lowpart (wmode, src);
11529 not_src = gen_rtx_NOT (wmode, src);
11530 emit_insn (gen_rtx_SET (src, not_src));
11532 if (short_mode)
11533 src = gen_lowpart (mode, src);
11535 ldop_code = AARCH64_LDOP_BIC;
11536 break;
11538 default:
11539 /* The operation can't be done with atomic instructions. */
11540 gcc_unreachable ();
11543 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11545 /* If necessary, calculate the data in memory after the update by redoing the
11546 operation from values in registers. */
11547 if (!out_result)
11548 return;
11550 if (short_mode)
11552 src = gen_lowpart (wmode, src);
11553 out_data = gen_lowpart (wmode, out_data);
11554 out_result = gen_lowpart (wmode, out_result);
11557 x = NULL_RTX;
11559 switch (code)
11561 case MINUS:
11562 case PLUS:
11563 x = gen_rtx_PLUS (wmode, out_data, src);
11564 break;
11565 case IOR:
11566 x = gen_rtx_IOR (wmode, out_data, src);
11567 break;
11568 case XOR:
11569 x = gen_rtx_XOR (wmode, out_data, src);
11570 break;
11571 case AND:
11572 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11573 return;
11574 default:
11575 gcc_unreachable ();
11578 emit_set_insn (out_result, x);
11580 return;
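/* Illustrative note: with LSE, __atomic_fetch_add maps directly to LDADD,
   __atomic_fetch_sub is handled above by negating the value and reusing
   LDADD, and __atomic_fetch_and becomes LDCLR on the complemented value,
   since the architecture provides an atomic "clear bits" rather than a plain
   atomic AND.  */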
11583 /* Split an atomic operation. */
11585 void
11586 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11587 rtx value, rtx model_rtx, rtx cond)
11589 machine_mode mode = GET_MODE (mem);
11590 machine_mode wmode = (mode == DImode ? DImode : SImode);
11591 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11592 const bool is_sync = is_mm_sync (model);
11593 rtx_code_label *label;
11594 rtx x;
11596 /* Split the atomic operation into a sequence. */
11597 label = gen_label_rtx ();
11598 emit_label (label);
11600 if (new_out)
11601 new_out = gen_lowpart (wmode, new_out);
11602 if (old_out)
11603 old_out = gen_lowpart (wmode, old_out);
11604 else
11605 old_out = new_out;
11606 value = simplify_gen_subreg (wmode, value, mode, 0);
11608 /* The initial load can be relaxed for a __sync operation since a final
11609 barrier will be emitted to stop code hoisting. */
11610 if (is_sync)
11611 aarch64_emit_load_exclusive (mode, old_out, mem,
11612 GEN_INT (MEMMODEL_RELAXED));
11613 else
11614 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
11616 switch (code)
11618 case SET:
11619 new_out = value;
11620 break;
11622 case NOT:
11623 x = gen_rtx_AND (wmode, old_out, value);
11624 emit_insn (gen_rtx_SET (new_out, x));
11625 x = gen_rtx_NOT (wmode, new_out);
11626 emit_insn (gen_rtx_SET (new_out, x));
11627 break;
11629 case MINUS:
11630 if (CONST_INT_P (value))
11632 value = GEN_INT (-INTVAL (value));
11633 code = PLUS;
11635 /* Fall through. */
11637 default:
11638 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
11639 emit_insn (gen_rtx_SET (new_out, x));
11640 break;
11643 aarch64_emit_store_exclusive (mode, cond, mem,
11644 gen_lowpart (mode, new_out), model_rtx);
11646 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11647 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11648 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
11649 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11651 /* Emit any final barrier needed for a __sync operation. */
11652 if (is_sync)
11653 aarch64_emit_post_barrier (model);
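/* Illustrative sketch (register allocation is arbitrary) of the loop the
   split above produces for a relaxed __atomic_fetch_add on an int:

     .Lretry:
       ldxr   w0, [x2]          // old value
       add    w1, w0, w3        // new value
       stxr   w4, w1, [x2]      // try to store it back
       cbnz   w4, .Lretry       // exclusive store failed: retry
*/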
11656 static void
11657 aarch64_init_libfuncs (void)
11659 /* Half-precision float operations. The compiler handles all operations
11660 with NULL libfuncs by converting to SFmode. */
11662 /* Conversions. */
11663 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
11664 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
11666 /* Arithmetic. */
11667 set_optab_libfunc (add_optab, HFmode, NULL);
11668 set_optab_libfunc (sdiv_optab, HFmode, NULL);
11669 set_optab_libfunc (smul_optab, HFmode, NULL);
11670 set_optab_libfunc (neg_optab, HFmode, NULL);
11671 set_optab_libfunc (sub_optab, HFmode, NULL);
11673 /* Comparisons. */
11674 set_optab_libfunc (eq_optab, HFmode, NULL);
11675 set_optab_libfunc (ne_optab, HFmode, NULL);
11676 set_optab_libfunc (lt_optab, HFmode, NULL);
11677 set_optab_libfunc (le_optab, HFmode, NULL);
11678 set_optab_libfunc (ge_optab, HFmode, NULL);
11679 set_optab_libfunc (gt_optab, HFmode, NULL);
11680 set_optab_libfunc (unord_optab, HFmode, NULL);
11683 /* Target hook for c_mode_for_suffix. */
11684 static machine_mode
11685 aarch64_c_mode_for_suffix (char suffix)
11687 if (suffix == 'q')
11688 return TFmode;
11690 return VOIDmode;
11693 /* We can only represent floating point constants which will fit in
11694 "quarter-precision" values. These values are characterised by
11695 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
11698 (-1)^s * (n/16) * 2^r
11700 Where:
11701 's' is the sign bit.
11702 'n' is an integer in the range 16 <= n <= 31.
11703 'r' is an integer in the range -3 <= r <= 4. */
11705 /* Return true iff X can be represented by a quarter-precision
11706 floating point immediate operand. Note, we cannot represent 0.0. */
11707 bool
11708 aarch64_float_const_representable_p (rtx x)
11710 /* This represents our current view of how many bits
11711 make up the mantissa. */
11712 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
11713 int exponent;
11714 unsigned HOST_WIDE_INT mantissa, mask;
11715 REAL_VALUE_TYPE r, m;
11716 bool fail;
11718 if (!CONST_DOUBLE_P (x))
11719 return false;
11721 /* We don't support HFmode constants yet. */
11722 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
11723 return false;
11725 r = *CONST_DOUBLE_REAL_VALUE (x);
11727 /* We cannot represent infinities, NaNs or +/-zero. We won't
11728 know if we have +zero until we analyse the mantissa, but we
11729 can reject the other invalid values. */
11730 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
11731 || REAL_VALUE_MINUS_ZERO (r))
11732 return false;
11734 /* Extract exponent. */
11735 r = real_value_abs (&r);
11736 exponent = REAL_EXP (&r);
11738 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
11739 highest (sign) bit, with a fixed binary point at bit point_pos.
11740 m1 holds the low part of the mantissa, m2 the high part.
11741 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
11742 bits for the mantissa, this can fail (low bits will be lost). */
11743 real_ldexp (&m, &r, point_pos - exponent);
11744 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
11746 /* If the low part of the mantissa has bits set we cannot represent
11747 the value. */
11748 if (w.elt (0) != 0)
11749 return false;
11750 /* We have rejected the lower HOST_WIDE_INT, so update our
11751 understanding of how many bits lie in the mantissa and
11752 look only at the high HOST_WIDE_INT. */
11753 mantissa = w.elt (1);
11754 point_pos -= HOST_BITS_PER_WIDE_INT;
11756 /* We can only represent values with a mantissa of the form 1.xxxx. */
11757 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
11758 if ((mantissa & mask) != 0)
11759 return false;
11761 /* Having filtered unrepresentable values, we may now remove all
11762 but the highest 5 bits. */
11763 mantissa >>= point_pos - 5;
11765 /* We cannot represent the value 0.0, so reject it. This is handled
11766 elsewhere. */
11767 if (mantissa == 0)
11768 return false;
11770 /* Then, as bit 4 is always set, we can mask it off, leaving
11771 the mantissa in the range [0, 15]. */
11772 mantissa &= ~(1 << 4);
11773 gcc_assert (mantissa <= 15);
11775 /* GCC internally does not use IEEE754-like encoding (where normalized
11776 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
11777 Our mantissa values are shifted 4 places to the left relative to
11778 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
11779 by 5 places to correct for GCC's representation. */
11780 exponent = 5 - exponent;
11782 return (exponent >= 0 && exponent <= 7);
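/* Illustrative worked examples of the encoding above: 0.5 is
   (-1)^0 * (16/16) * 2^-1 and 31.0 is (-1)^0 * (31/16) * 2^4, so both are
   representable and usable as FMOV (immediate) operands, whereas 0.1 has no
   such decomposition and 0.0 is rejected explicitly.  */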
11785 char*
11786 aarch64_output_simd_mov_immediate (rtx const_vector,
11787 machine_mode mode,
11788 unsigned width)
11790 bool is_valid;
11791 static char templ[40];
11792 const char *mnemonic;
11793 const char *shift_op;
11794 unsigned int lane_count = 0;
11795 char element_char;
11797 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
11799 /* This will return true to show const_vector is legal for use as an
11800 AdvSIMD MOVI (or, implicitly, MVNI) instruction immediate. It will
11801 also update INFO to show how the immediate should be generated. */
11802 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
11803 gcc_assert (is_valid);
11805 element_char = sizetochar (info.element_width);
11806 lane_count = width / info.element_width;
11808 mode = GET_MODE_INNER (mode);
11809 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11811 gcc_assert (info.shift == 0 && ! info.mvn);
11812 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
11813 move immediate path. */
11814 if (aarch64_float_const_zero_rtx_p (info.value))
11815 info.value = GEN_INT (0);
11816 else
11818 const unsigned int buf_size = 20;
11819 char float_buf[buf_size] = {'\0'};
11820 real_to_decimal_for_mode (float_buf,
11821 CONST_DOUBLE_REAL_VALUE (info.value),
11822 buf_size, buf_size, 1, mode);
11824 if (lane_count == 1)
11825 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
11826 else
11827 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
11828 lane_count, element_char, float_buf);
11829 return templ;
11833 mnemonic = info.mvn ? "mvni" : "movi";
11834 shift_op = info.msl ? "msl" : "lsl";
11836 gcc_assert (CONST_INT_P (info.value));
11837 if (lane_count == 1)
11838 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
11839 mnemonic, UINTVAL (info.value));
11840 else if (info.shift)
11841 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
11842 ", %s %d", mnemonic, lane_count, element_char,
11843 UINTVAL (info.value), shift_op, info.shift);
11844 else
11845 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
11846 mnemonic, lane_count, element_char, UINTVAL (info.value));
11847 return templ;
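/* Illustrative example outputs (with operand 0 rendered as v0): a V4SImode
   splat of 0x4500 prints as
       movi  v0.4s, 0x45, lsl 8
   while a V2DFmode splat of 1.0 takes the floating-point path and prints
   roughly as
       fmov  v0.2d, 1.0e+0
   where the exact decimal rendering comes from real_to_decimal_for_mode.  */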
11850 char*
11851 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
11852 machine_mode mode)
11854 machine_mode vmode;
11856 gcc_assert (!VECTOR_MODE_P (mode));
11857 vmode = aarch64_simd_container_mode (mode, 64);
11858 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
11859 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
11862 /* Split operands into moves from op[1] + op[2] into op[0]. */
11864 void
11865 aarch64_split_combinev16qi (rtx operands[3])
11867 unsigned int dest = REGNO (operands[0]);
11868 unsigned int src1 = REGNO (operands[1]);
11869 unsigned int src2 = REGNO (operands[2]);
11870 machine_mode halfmode = GET_MODE (operands[1]);
11871 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
11872 rtx destlo, desthi;
11874 gcc_assert (halfmode == V16QImode);
11876 if (src1 == dest && src2 == dest + halfregs)
11878 /* No-op move. Can't split to nothing; emit something. */
11879 emit_note (NOTE_INSN_DELETED);
11880 return;
11883 /* Preserve register attributes for variable tracking. */
11884 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
11885 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
11886 GET_MODE_SIZE (halfmode));
11888 /* Special case of reversed high/low parts. */
11889 if (reg_overlap_mentioned_p (operands[2], destlo)
11890 && reg_overlap_mentioned_p (operands[1], desthi))
11892 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
11893 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
11894 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
11896 else if (!reg_overlap_mentioned_p (operands[2], destlo))
11898 /* Try to avoid unnecessary moves if part of the result
11899 is in the right place already. */
11900 if (src1 != dest)
11901 emit_move_insn (destlo, operands[1]);
11902 if (src2 != dest + halfregs)
11903 emit_move_insn (desthi, operands[2]);
11905 else
11907 if (src2 != dest + halfregs)
11908 emit_move_insn (desthi, operands[2]);
11909 if (src1 != dest)
11910 emit_move_insn (destlo, operands[1]);
11914 /* vec_perm support. */
11916 #define MAX_VECT_LEN 16
11918 struct expand_vec_perm_d
11920 rtx target, op0, op1;
11921 unsigned char perm[MAX_VECT_LEN];
11922 machine_mode vmode;
11923 unsigned char nelt;
11924 bool one_vector_p;
11925 bool testing_p;
11928 /* Generate a variable permutation. */
11930 static void
11931 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
11933 machine_mode vmode = GET_MODE (target);
11934 bool one_vector_p = rtx_equal_p (op0, op1);
11936 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
11937 gcc_checking_assert (GET_MODE (op0) == vmode);
11938 gcc_checking_assert (GET_MODE (op1) == vmode);
11939 gcc_checking_assert (GET_MODE (sel) == vmode);
11940 gcc_checking_assert (TARGET_SIMD);
11942 if (one_vector_p)
11944 if (vmode == V8QImode)
11946 /* Expand the argument to a V16QI mode by duplicating it. */
11947 rtx pair = gen_reg_rtx (V16QImode);
11948 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
11949 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
11951 else
11953 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
11956 else
11958 rtx pair;
11960 if (vmode == V8QImode)
11962 pair = gen_reg_rtx (V16QImode);
11963 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
11964 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
11966 else
11968 pair = gen_reg_rtx (OImode);
11969 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
11970 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
11975 void
11976 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
11978 machine_mode vmode = GET_MODE (target);
11979 unsigned int nelt = GET_MODE_NUNITS (vmode);
11980 bool one_vector_p = rtx_equal_p (op0, op1);
11981 rtx mask;
11983 /* The TBL instruction does not use a modulo index, so we must take care
11984 of that ourselves. */
11985 mask = aarch64_simd_gen_const_vector_dup (vmode,
11986 one_vector_p ? nelt - 1 : 2 * nelt - 1);
11987 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
11989 /* For big-endian, we also need to reverse the index within the vector
11990 (but not which vector). */
11991 if (BYTES_BIG_ENDIAN)
11993 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
11994 if (!one_vector_p)
11995 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
11996 sel = expand_simple_binop (vmode, XOR, sel, mask,
11997 NULL, 0, OPTAB_LIB_WIDEN);
11999 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
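/* Illustrative note: for a two-input V16QImode permute the selector is first
   ANDed with 31 so out-of-range indices wrap as the TBL semantics require;
   on big-endian each index is additionally XORed with 15 to translate GCC
   lane numbering into the byte order TBL expects.  */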
12002 /* Recognize patterns suitable for the TRN instructions. */
12003 static bool
12004 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12006 unsigned int i, odd, mask, nelt = d->nelt;
12007 rtx out, in0, in1, x;
12008 rtx (*gen) (rtx, rtx, rtx);
12009 machine_mode vmode = d->vmode;
12011 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12012 return false;
12014 /* Note that these are little-endian tests.
12015 We correct for big-endian later. */
12016 if (d->perm[0] == 0)
12017 odd = 0;
12018 else if (d->perm[0] == 1)
12019 odd = 1;
12020 else
12021 return false;
12022 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12024 for (i = 0; i < nelt; i += 2)
12026 if (d->perm[i] != i + odd)
12027 return false;
12028 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12029 return false;
12032 /* Success! */
12033 if (d->testing_p)
12034 return true;
12036 in0 = d->op0;
12037 in1 = d->op1;
12038 if (BYTES_BIG_ENDIAN)
12040 x = in0, in0 = in1, in1 = x;
12041 odd = !odd;
12043 out = d->target;
12045 if (odd)
12047 switch (vmode)
12049 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12050 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12051 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12052 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12053 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12054 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12055 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12056 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12057 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12058 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12059 default:
12060 return false;
12063 else
12065 switch (vmode)
12067 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12068 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12069 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12070 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12071 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12072 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12073 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12074 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12075 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12076 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12077 default:
12078 return false;
12082 emit_insn (gen (out, in0, in1));
12083 return true;
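/* Illustrative example: for V4SImode the little-endian selector
   { 0, 4, 2, 6 } passes the checks above with odd == 0 and is emitted as
   trn1, while { 1, 5, 3, 7 } is emitted as trn2.  */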
12086 /* Recognize patterns suitable for the UZP instructions. */
12087 static bool
12088 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12090 unsigned int i, odd, mask, nelt = d->nelt;
12091 rtx out, in0, in1, x;
12092 rtx (*gen) (rtx, rtx, rtx);
12093 machine_mode vmode = d->vmode;
12095 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12096 return false;
12098 /* Note that these are little-endian tests.
12099 We correct for big-endian later. */
12100 if (d->perm[0] == 0)
12101 odd = 0;
12102 else if (d->perm[0] == 1)
12103 odd = 1;
12104 else
12105 return false;
12106 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12108 for (i = 0; i < nelt; i++)
12110 unsigned elt = (i * 2 + odd) & mask;
12111 if (d->perm[i] != elt)
12112 return false;
12115 /* Success! */
12116 if (d->testing_p)
12117 return true;
12119 in0 = d->op0;
12120 in1 = d->op1;
12121 if (BYTES_BIG_ENDIAN)
12123 x = in0, in0 = in1, in1 = x;
12124 odd = !odd;
12126 out = d->target;
12128 if (odd)
12130 switch (vmode)
12132 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12133 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12134 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12135 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12136 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12137 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12138 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12139 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12140 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12141 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12142 default:
12143 return false;
12146 else
12148 switch (vmode)
12150 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12151 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12152 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12153 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12154 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12155 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12156 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12157 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12158 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12159 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12160 default:
12161 return false;
12165 emit_insn (gen (out, in0, in1));
12166 return true;
12169 /* Recognize patterns suitable for the ZIP instructions. */
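/* For example (illustrative, little-endian): with V4SImode and two input
   vectors, ZIP1 corresponds to the index vector {0, 4, 1, 5} (the low
   halves of the inputs interleaved) and ZIP2 to {2, 6, 3, 7} (the high
   halves interleaved).  */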
12170 static bool
12171 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12173 unsigned int i, high, mask, nelt = d->nelt;
12174 rtx out, in0, in1, x;
12175 rtx (*gen) (rtx, rtx, rtx);
12176 machine_mode vmode = d->vmode;
12178 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12179 return false;
12181 /* Note that these are little-endian tests.
12182 We correct for big-endian later. */
12183 high = nelt / 2;
12184 if (d->perm[0] == high)
12185 /* Do Nothing. */
12187 else if (d->perm[0] == 0)
12188 high = 0;
12189 else
12190 return false;
12191 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12193 for (i = 0; i < nelt / 2; i++)
12195 unsigned elt = (i + high) & mask;
12196 if (d->perm[i * 2] != elt)
12197 return false;
12198 elt = (elt + nelt) & mask;
12199 if (d->perm[i * 2 + 1] != elt)
12200 return false;
12203 /* Success! */
12204 if (d->testing_p)
12205 return true;
12207 in0 = d->op0;
12208 in1 = d->op1;
12209 if (BYTES_BIG_ENDIAN)
12211 x = in0, in0 = in1, in1 = x;
12212 high = !high;
12214 out = d->target;
12216 if (high)
12218 switch (vmode)
12220 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12221 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12222 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12223 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12224 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12225 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12226 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12227 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12228 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12229 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12230 default:
12231 return false;
12234 else
12236 switch (vmode)
12238 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12239 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12240 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12241 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12242 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12243 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12244 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12245 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12246 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12247 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12248 default:
12249 return false;
12253 emit_insn (gen (out, in0, in1));
12254 return true;
12257 /* Recognize patterns for the EXT insn. */
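/* For example (illustrative): with V4SImode, the index vector {1, 2, 3, 4}
   selects the three trailing elements of the first input followed by the
   first element of the second, and is matched below as EXT with an element
   offset of 1.  */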
12259 static bool
12260 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12262 unsigned int i, nelt = d->nelt;
12263 rtx (*gen) (rtx, rtx, rtx, rtx);
12264 rtx offset;
12266 unsigned int location = d->perm[0]; /* Always < nelt. */
12268 /* Check if the extracted indices are increasing by one. */
12269 for (i = 1; i < nelt; i++)
12271 unsigned int required = location + i;
12272 if (d->one_vector_p)
12274 /* We'll pass the same vector in twice, so allow indices to wrap. */
12275 required &= (nelt - 1);
12277 if (d->perm[i] != required)
12278 return false;
12281 switch (d->vmode)
12283 case V16QImode: gen = gen_aarch64_extv16qi; break;
12284 case V8QImode: gen = gen_aarch64_extv8qi; break;
12285 case V4HImode: gen = gen_aarch64_extv4hi; break;
12286 case V8HImode: gen = gen_aarch64_extv8hi; break;
12287 case V2SImode: gen = gen_aarch64_extv2si; break;
12288 case V4SImode: gen = gen_aarch64_extv4si; break;
12289 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12290 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12291 case V2DImode: gen = gen_aarch64_extv2di; break;
12292 case V2DFmode: gen = gen_aarch64_extv2df; break;
12293 default:
12294 return false;
12297 /* Success! */
12298 if (d->testing_p)
12299 return true;
12301 /* The case where (location == 0) is a no-op for both big- and little-endian,
12302 and is removed by the mid-end at optimization levels -O1 and higher. */
12304 if (BYTES_BIG_ENDIAN && (location != 0))
12306 /* After setup, we want the high elements of the first vector (stored
12307 at the LSB end of the register), and the low elements of the second
12308 vector (stored at the MSB end of the register). So swap. */
12309 std::swap (d->op0, d->op1);
12310 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12311 location = nelt - location;
12314 offset = GEN_INT (location);
12315 emit_insn (gen (d->target, d->op0, d->op1, offset));
12316 return true;
12319 /* Recognize patterns for the REV insns. */
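/* For example (illustrative): with V8HImode, the index vector
   {3, 2, 1, 0, 7, 6, 5, 4} reverses the elements within each 64-bit chunk;
   it is matched below with diff == 3 and emitted as REV64.  */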
12321 static bool
12322 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12324 unsigned int i, j, diff, nelt = d->nelt;
12325 rtx (*gen) (rtx, rtx);
12327 if (!d->one_vector_p)
12328 return false;
12330 diff = d->perm[0];
12331 switch (diff)
12333 case 7:
12334 switch (d->vmode)
12336 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12337 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12338 default:
12339 return false;
12341 break;
12342 case 3:
12343 switch (d->vmode)
12345 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12346 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
12347 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
12348 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
12349 default:
12350 return false;
12352 break;
12353 case 1:
12354 switch (d->vmode)
12356 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12357 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
12358 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
12359 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
12360 case V4SImode: gen = gen_aarch64_rev64v4si; break;
12361 case V2SImode: gen = gen_aarch64_rev64v2si; break;
12362 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
12363 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
12364 default:
12365 return false;
12367 break;
12368 default:
12369 return false;
12372 for (i = 0; i < nelt ; i += diff + 1)
12373 for (j = 0; j <= diff; j += 1)
12375 /* This is guaranteed to hold, as the value of diff
12376 is 7, 3 or 1 and there are enough elements in the
12377 vector to cover each group. A permutation mask whose
12378 first element (diff) is anything other than these values
12379 implies that something is wrong by the time we get here. */
12380 gcc_assert (i + j < nelt);
12381 if (d->perm[i + j] != i + diff - j)
12382 return false;
12385 /* Success! */
12386 if (d->testing_p)
12387 return true;
12389 emit_insn (gen (d->target, d->op0));
12390 return true;
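/* Recognize patterns suitable for the DUP (element) instruction: a splat of
   a single input lane, e.g. the index vector {2, 2, 2, 2} for V4SImode
   (illustrative).  */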
12393 static bool
12394 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12396 rtx (*gen) (rtx, rtx, rtx);
12397 rtx out = d->target;
12398 rtx in0;
12399 machine_mode vmode = d->vmode;
12400 unsigned int i, elt, nelt = d->nelt;
12401 rtx lane;
12403 elt = d->perm[0];
12404 for (i = 1; i < nelt; i++)
12406 if (elt != d->perm[i])
12407 return false;
12410 /* The generic preparation in aarch64_expand_vec_perm_const_1
12411 swaps the operand order and the permute indices if it finds
12412 d->perm[0] to be in the second operand. Thus, we can always
12413 use d->op0 and need not do any extra arithmetic to get the
12414 correct lane number. */
12415 in0 = d->op0;
12416 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
12418 switch (vmode)
12420 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12421 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12422 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12423 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12424 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12425 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12426 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12427 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12428 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12429 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12430 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12431 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12432 default:
12433 return false;
12436 emit_insn (gen (out, in0, lane));
12437 return true;
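/* Fall back to a generic table-based permute: for V8QImode or V16QImode
   only, the permutation indices are materialised as a constant byte vector
   and used as the selector for a TBL-based expansion via
   aarch64_expand_vec_perm_1.  */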
12440 static bool
12441 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12443 rtx rperm[MAX_VECT_LEN], sel;
12444 machine_mode vmode = d->vmode;
12445 unsigned int i, nelt = d->nelt;
12447 if (d->testing_p)
12448 return true;
12450 /* Generic code will try constant permutation twice: once with the
12451 original mode and again with the elements lowered to QImode.
12452 So wait and don't do the selector expansion ourselves. */
12453 if (vmode != V8QImode && vmode != V16QImode)
12454 return false;
12456 for (i = 0; i < nelt; ++i)
12458 int nunits = GET_MODE_NUNITS (vmode);
12460 /* If big-endian and two vectors, we end up with a weird mixed-endian
12461 mode on NEON. Reverse the index within each word but not the word
12462 itself. */
12463 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12464 : d->perm[i]);
12466 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12467 sel = force_reg (vmode, sel);
12469 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12470 return true;
12473 static bool
12474 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12476 /* The pattern matching functions above are written to look for a small
12477 number to begin the sequence (0, 1, N/2). If we begin with an index
12478 from the second operand, we can swap the operands. */
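/* For example (illustrative): with V4SImode, the index vector {5, 1, 7, 3}
   starts with an index from the second operand; XOR-ing each index with
   nelt gives {1, 5, 3, 7}, which with the operands swapped is matched as
   TRN2 below.  */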
12479 if (d->perm[0] >= d->nelt)
12481 unsigned i, nelt = d->nelt;
12483 gcc_assert (nelt == (nelt & -nelt));
12484 for (i = 0; i < nelt; ++i)
12485 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
12487 std::swap (d->op0, d->op1);
12490 if (TARGET_SIMD)
12492 if (aarch64_evpc_rev (d))
12493 return true;
12494 else if (aarch64_evpc_ext (d))
12495 return true;
12496 else if (aarch64_evpc_dup (d))
12497 return true;
12498 else if (aarch64_evpc_zip (d))
12499 return true;
12500 else if (aarch64_evpc_uzp (d))
12501 return true;
12502 else if (aarch64_evpc_trn (d))
12503 return true;
12504 return aarch64_evpc_tbl (d);
12506 return false;
12509 /* Expand a vec_perm_const pattern. */
12511 bool
12512 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12514 struct expand_vec_perm_d d;
12515 int i, nelt, which;
12517 d.target = target;
12518 d.op0 = op0;
12519 d.op1 = op1;
12521 d.vmode = GET_MODE (target);
12522 gcc_assert (VECTOR_MODE_P (d.vmode));
12523 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12524 d.testing_p = false;
12526 for (i = which = 0; i < nelt; ++i)
12528 rtx e = XVECEXP (sel, 0, i);
12529 int ei = INTVAL (e) & (2 * nelt - 1);
12530 which |= (ei < nelt ? 1 : 2);
12531 d.perm[i] = ei;
12534 switch (which)
12536 default:
12537 gcc_unreachable ();
12539 case 3:
12540 d.one_vector_p = false;
12541 if (!rtx_equal_p (op0, op1))
12542 break;
12544 /* The elements of PERM do not suggest that only the first operand
12545 is used, but both operands are identical. Allow easier matching
12546 of the permutation by folding the permutation into the single
12547 input vector. */
12548 /* Fall Through. */
12549 case 2:
12550 for (i = 0; i < nelt; ++i)
12551 d.perm[i] &= nelt - 1;
12552 d.op0 = op1;
12553 d.one_vector_p = true;
12554 break;
12556 case 1:
12557 d.op1 = op0;
12558 d.one_vector_p = true;
12559 break;
12562 return aarch64_expand_vec_perm_const_1 (&d);
12565 static bool
12566 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12567 const unsigned char *sel)
12569 struct expand_vec_perm_d d;
12570 unsigned int i, nelt, which;
12571 bool ret;
12573 d.vmode = vmode;
12574 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12575 d.testing_p = true;
12576 memcpy (d.perm, sel, nelt);
12578 /* Calculate whether all elements are in one vector. */
12579 for (i = which = 0; i < nelt; ++i)
12581 unsigned char e = d.perm[i];
12582 gcc_assert (e < 2 * nelt);
12583 which |= (e < nelt ? 1 : 2);
12586 /* If all elements are from the second vector, reindex as if from the
12587 first vector. */
12588 if (which == 2)
12589 for (i = 0; i < nelt; ++i)
12590 d.perm[i] -= nelt;
12592 /* Check whether the mask can be applied to a single vector. */
12593 d.one_vector_p = (which != 3);
12595 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
12596 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
12597 if (!d.one_vector_p)
12598 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
12600 start_sequence ();
12601 ret = aarch64_expand_vec_perm_const_1 (&d);
12602 end_sequence ();
12604 return ret;
12608 rtx aarch64_reverse_mask (enum machine_mode mode)
12610 /* We have to reverse each vector because we don't have
12611 a permuted load that can reverse-load according to ABI rules. */
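/* For example (illustrative): for V4SImode this builds the byte selector
   {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}, reversing the
   bytes within each 32-bit element.  */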
12612 rtx mask;
12613 rtvec v = rtvec_alloc (16);
12614 int i, j;
12615 int nunits = GET_MODE_NUNITS (mode);
12616 int usize = GET_MODE_UNIT_SIZE (mode);
12618 gcc_assert (BYTES_BIG_ENDIAN);
12619 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
12621 for (i = 0; i < nunits; i++)
12622 for (j = 0; j < usize; j++)
12623 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
12624 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
12625 return force_reg (V16QImode, mask);
12628 /* Implement MODES_TIEABLE_P. */
12630 bool
12631 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
12633 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
12634 return true;
12636 /* We specifically want to allow elements of "structure" modes to
12637 be tieable to the structure. This more general condition allows
12638 other rarer situations too. */
12639 if (TARGET_SIMD
12640 && aarch64_vector_mode_p (mode1)
12641 && aarch64_vector_mode_p (mode2))
12642 return true;
12644 return false;
12647 /* Return a new RTX holding the result of moving POINTER forward by
12648 AMOUNT bytes. */
12650 static rtx
12651 aarch64_move_pointer (rtx pointer, int amount)
12653 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
12655 return adjust_automodify_address (pointer, GET_MODE (pointer),
12656 next, amount);
12659 /* Return a new RTX holding the result of moving POINTER forward by the
12660 size of the mode it points to. */
12662 static rtx
12663 aarch64_progress_pointer (rtx pointer)
12665 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
12667 return aarch64_move_pointer (pointer, amount);
12670 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
12671 MODE bytes. */
12673 static void
12674 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
12675 machine_mode mode)
12677 rtx reg = gen_reg_rtx (mode);
12679 /* "Cast" the pointers to the correct mode. */
12680 *src = adjust_address (*src, mode, 0);
12681 *dst = adjust_address (*dst, mode, 0);
12682 /* Emit the memcpy. */
12683 emit_move_insn (reg, *src);
12684 emit_move_insn (*dst, reg);
12685 /* Move the pointers forward. */
12686 *src = aarch64_progress_pointer (*src);
12687 *dst = aarch64_progress_pointer (*dst);
12690 /* Expand movmem, as if from a __builtin_memcpy. Return true if
12691 we succeed, otherwise return false. */
12693 bool
12694 aarch64_expand_movmem (rtx *operands)
12696 unsigned int n;
12697 rtx dst = operands[0];
12698 rtx src = operands[1];
12699 rtx base;
12700 bool speed_p = !optimize_function_for_size_p (cfun);
12702 /* When optimizing for size, give a better estimate of the length of a
12703 memcpy call, but use the default otherwise. */
12704 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
12706 /* We can't do anything smart if the amount to copy is not constant. */
12707 if (!CONST_INT_P (operands[2]))
12708 return false;
12710 n = UINTVAL (operands[2]);
12712 /* Try to keep the number of instructions low. For cases below 16 bytes we
12713 need to make at most two moves. For cases above 16 bytes it will be one
12714 move for each 16 byte chunk, then at most two additional moves. */
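/* For example (illustrative): a 35-byte copy is estimated as
   35/16 + 2 == 4 moves; it is expanded below as two 16-byte (TImode)
   chunks followed by one overlapping 4-byte (SImode) copy covering the
   trailing 3 bytes.  */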
12715 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
12716 return false;
12718 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
12719 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
12721 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
12722 src = adjust_automodify_address (src, VOIDmode, base, 0);
12724 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
12725 1-byte chunk. */
12726 if (n < 4)
12728 if (n >= 2)
12730 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12731 n -= 2;
12734 if (n == 1)
12735 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12737 return true;
12740 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
12741 4-byte chunk, partially overlapping with the previously copied chunk. */
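/* For example (illustrative): a 6-byte copy is done as one SImode copy of
   bytes 0-3 followed by a second SImode copy of bytes 2-5, overlapping the
   first copy by two bytes.  */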
12742 if (n < 8)
12744 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12745 n -= 4;
12746 if (n > 0)
12748 int move = n - 4;
12750 src = aarch64_move_pointer (src, move);
12751 dst = aarch64_move_pointer (dst, move);
12752 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12754 return true;
12757 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
12758 them, then (if applicable) an 8-byte chunk. */
12759 while (n >= 8)
12761 if (n / 16)
12763 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
12764 n -= 16;
12766 else
12768 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
12769 n -= 8;
12773 /* Finish the final bytes of the copy. We can always do this in one
12774 instruction. We either copy the exact amount we need, or partially
12775 overlap with the previous chunk we copied and copy 8-bytes. */
12776 if (n == 0)
12777 return true;
12778 else if (n == 1)
12779 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12780 else if (n == 2)
12781 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12782 else if (n == 4)
12783 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12784 else
12786 if (n == 3)
12788 src = aarch64_move_pointer (src, -1);
12789 dst = aarch64_move_pointer (dst, -1);
12790 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12792 else
12794 int move = n - 8;
12796 src = aarch64_move_pointer (src, move);
12797 dst = aarch64_move_pointer (dst, move);
12798 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
12802 return true;
12805 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
12807 static unsigned HOST_WIDE_INT
12808 aarch64_asan_shadow_offset (void)
12810 return (HOST_WIDE_INT_1 << 36);
12813 static bool
12814 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
12815 unsigned int align,
12816 enum by_pieces_operation op,
12817 bool speed_p)
12819 /* STORE_BY_PIECES can be used when copying a constant string, but
12820 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
12821 For now we always fail this and let the move_by_pieces code copy
12822 the string from read-only memory. */
12823 if (op == STORE_BY_PIECES)
12824 return false;
12826 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
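/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first comparison of a
   conditional-compare (CCMP) chain: PREP_SEQ receives the insns that
   prepare the operands, GEN_SEQ the compare itself, and the returned rtx
   describes the resulting condition on the CC register, or NULL_RTX if
   the comparison cannot be handled.  */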
12829 static rtx
12830 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
12831 int code, tree treeop0, tree treeop1)
12833 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
12834 rtx op0, op1;
12835 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
12836 insn_code icode;
12837 struct expand_operand ops[4];
12839 start_sequence ();
12840 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
12842 op_mode = GET_MODE (op0);
12843 if (op_mode == VOIDmode)
12844 op_mode = GET_MODE (op1);
12846 switch (op_mode)
12848 case QImode:
12849 case HImode:
12850 case SImode:
12851 cmp_mode = SImode;
12852 icode = CODE_FOR_cmpsi;
12853 break;
12855 case DImode:
12856 cmp_mode = DImode;
12857 icode = CODE_FOR_cmpdi;
12858 break;
12860 case SFmode:
12861 cmp_mode = SFmode;
12862 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
12863 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
12864 break;
12866 case DFmode:
12867 cmp_mode = DFmode;
12868 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
12869 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
12870 break;
12872 default:
12873 end_sequence ();
12874 return NULL_RTX;
12877 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
12878 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
12879 if (!op0 || !op1)
12881 end_sequence ();
12882 return NULL_RTX;
12884 *prep_seq = get_insns ();
12885 end_sequence ();
12887 create_fixed_operand (&ops[0], op0);
12888 create_fixed_operand (&ops[1], op1);
12890 start_sequence ();
12891 if (!maybe_expand_insn (icode, 2, ops))
12893 end_sequence ();
12894 return NULL_RTX;
12896 *gen_seq = get_insns ();
12897 end_sequence ();
12899 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
12900 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
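/* Implement TARGET_GEN_CCMP_NEXT.  Expand a subsequent comparison of a
   CCMP chain, combining it with PREV, the result of the previous
   comparison, according to BIT_CODE (AND or IOR).  Returns the rtx
   describing the combined condition, or NULL_RTX on failure.  */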
12903 static rtx
12904 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
12905 tree treeop0, tree treeop1, int bit_code)
12907 rtx op0, op1, target;
12908 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
12909 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
12910 insn_code icode;
12911 struct expand_operand ops[6];
12912 int aarch64_cond;
12914 push_to_sequence ((rtx_insn*) *prep_seq);
12915 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
12917 op_mode = GET_MODE (op0);
12918 if (op_mode == VOIDmode)
12919 op_mode = GET_MODE (op1);
12921 switch (op_mode)
12923 case QImode:
12924 case HImode:
12925 case SImode:
12926 cmp_mode = SImode;
12927 icode = CODE_FOR_ccmpsi;
12928 break;
12930 case DImode:
12931 cmp_mode = DImode;
12932 icode = CODE_FOR_ccmpdi;
12933 break;
12935 case SFmode:
12936 cmp_mode = SFmode;
12937 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
12938 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
12939 break;
12941 case DFmode:
12942 cmp_mode = DFmode;
12943 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
12944 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
12945 break;
12947 default:
12948 end_sequence ();
12949 return NULL_RTX;
12952 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
12953 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
12954 if (!op0 || !op1)
12956 end_sequence ();
12957 return NULL_RTX;
12959 *prep_seq = get_insns ();
12960 end_sequence ();
12962 target = gen_rtx_REG (cc_mode, CC_REGNUM);
12963 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
12965 if (bit_code != AND)
12967 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
12968 GET_MODE (XEXP (prev, 0))),
12969 VOIDmode, XEXP (prev, 0), const0_rtx);
12970 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
12973 create_fixed_operand (&ops[0], XEXP (prev, 0));
12974 create_fixed_operand (&ops[1], target);
12975 create_fixed_operand (&ops[2], op0);
12976 create_fixed_operand (&ops[3], op1);
12977 create_fixed_operand (&ops[4], prev);
12978 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
12980 push_to_sequence ((rtx_insn*) *gen_seq);
12981 if (!maybe_expand_insn (icode, 6, ops))
12983 end_sequence ();
12984 return NULL_RTX;
12987 *gen_seq = get_insns ();
12988 end_sequence ();
12990 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
12993 #undef TARGET_GEN_CCMP_FIRST
12994 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
12996 #undef TARGET_GEN_CCMP_NEXT
12997 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
12999 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13000 instruction fusion of some sort. */
13002 static bool
13003 aarch64_macro_fusion_p (void)
13005 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13009 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13010 should be kept together during scheduling. */
13012 static bool
13013 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13015 rtx set_dest;
13016 rtx prev_set = single_set (prev);
13017 rtx curr_set = single_set (curr);
13018 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13019 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13021 if (!aarch64_macro_fusion_p ())
13022 return false;
13024 if (simple_sets_p
13025 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
13027 /* We are trying to match:
13028 prev (mov) == (set (reg r0) (const_int imm16))
13029 curr (movk) == (set (zero_extract (reg r0)
13030 (const_int 16)
13031 (const_int 16))
13032 (const_int imm16_1)) */
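/* In assembly terms (illustrative), this is a "mov w0, #imm16" immediately
   followed by a "movk w0, #imm16_1, lsl #16" writing the same register.  */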
13034 set_dest = SET_DEST (curr_set);
13036 if (GET_CODE (set_dest) == ZERO_EXTRACT
13037 && CONST_INT_P (SET_SRC (curr_set))
13038 && CONST_INT_P (SET_SRC (prev_set))
13039 && CONST_INT_P (XEXP (set_dest, 2))
13040 && INTVAL (XEXP (set_dest, 2)) == 16
13041 && REG_P (XEXP (set_dest, 0))
13042 && REG_P (SET_DEST (prev_set))
13043 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13045 return true;
13049 if (simple_sets_p
13050 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
13053 /* We're trying to match:
13054 prev (adrp) == (set (reg r1)
13055 (high (symbol_ref ("SYM"))))
13056 curr (add) == (set (reg r0)
13057 (lo_sum (reg r1)
13058 (symbol_ref ("SYM"))))
13059 Note that r0 need not necessarily be the same as r1, especially
13060 during pre-regalloc scheduling. */
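/* In assembly terms (illustrative): "adrp x1, SYM" immediately followed by
   "add x0, x1, :lo12:SYM".  */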
13062 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13063 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13065 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13066 && REG_P (XEXP (SET_SRC (curr_set), 0))
13067 && REGNO (XEXP (SET_SRC (curr_set), 0))
13068 == REGNO (SET_DEST (prev_set))
13069 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13070 XEXP (SET_SRC (curr_set), 1)))
13071 return true;
13075 if (simple_sets_p
13076 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
13079 /* We're trying to match:
13080 prev (movk) == (set (zero_extract (reg r0)
13081 (const_int 16)
13082 (const_int 32))
13083 (const_int imm16_1))
13084 curr (movk) == (set (zero_extract (reg r0)
13085 (const_int 16)
13086 (const_int 48))
13087 (const_int imm16_2)) */
13089 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13090 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13091 && REG_P (XEXP (SET_DEST (prev_set), 0))
13092 && REG_P (XEXP (SET_DEST (curr_set), 0))
13093 && REGNO (XEXP (SET_DEST (prev_set), 0))
13094 == REGNO (XEXP (SET_DEST (curr_set), 0))
13095 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13096 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13097 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13098 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13099 && CONST_INT_P (SET_SRC (prev_set))
13100 && CONST_INT_P (SET_SRC (curr_set)))
13101 return true;
13104 if (simple_sets_p
13105 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
13107 /* We're trying to match:
13108 prev (adrp) == (set (reg r0)
13109 (high (symbol_ref ("SYM"))))
13110 curr (ldr) == (set (reg r1)
13111 (mem (lo_sum (reg r0)
13112 (symbol_ref ("SYM")))))
13114 curr (ldr) == (set (reg r1)
13115 (zero_extend (mem
13116 (lo_sum (reg r0)
13117 (symbol_ref ("SYM")))))) */
13118 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13119 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13121 rtx curr_src = SET_SRC (curr_set);
13123 if (GET_CODE (curr_src) == ZERO_EXTEND)
13124 curr_src = XEXP (curr_src, 0);
13126 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13127 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13128 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13129 == REGNO (SET_DEST (prev_set))
13130 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13131 XEXP (SET_SRC (prev_set), 0)))
13132 return true;
13136 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_AES_AESMC)
13137 && aarch_crypto_can_dual_issue (prev, curr))
13138 return true;
13140 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
13141 && any_condjump_p (curr))
13143 enum attr_type prev_type = get_attr_type (prev);
13145 /* FIXME: this misses some instructions that ThunderX considers simple
13146 arithmetic; in particular, simple shifts are missed here. */
13147 if (prev_type == TYPE_ALUS_SREG
13148 || prev_type == TYPE_ALUS_IMM
13149 || prev_type == TYPE_LOGICS_REG
13150 || prev_type == TYPE_LOGICS_IMM)
13151 return true;
13154 return false;
13157 /* Return true iff the instruction fusion described by OP is enabled. */
13159 bool
13160 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13162 return (aarch64_tune_params.fusible_ops & op) != 0;
13165 /* If MEM is in the form [base+offset], extract the two parts of the
13166 address into BASE and OFFSET; otherwise return false
13167 after clearing BASE and OFFSET. */
13169 bool
13170 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13172 rtx addr;
13174 gcc_assert (MEM_P (mem));
13176 addr = XEXP (mem, 0);
13178 if (REG_P (addr))
13180 *base = addr;
13181 *offset = const0_rtx;
13182 return true;
13185 if (GET_CODE (addr) == PLUS
13186 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13188 *base = XEXP (addr, 0);
13189 *offset = XEXP (addr, 1);
13190 return true;
13193 *base = NULL_RTX;
13194 *offset = NULL_RTX;
13196 return false;
13199 /* Types for scheduling fusion. */
13200 enum sched_fusion_type
13202 SCHED_FUSION_NONE = 0,
13203 SCHED_FUSION_LD_SIGN_EXTEND,
13204 SCHED_FUSION_LD_ZERO_EXTEND,
13205 SCHED_FUSION_LD,
13206 SCHED_FUSION_ST,
13207 SCHED_FUSION_NUM
13210 /* If INSN is a load or store whose address is in the form [base+offset],
13211 extract the two parts into BASE and OFFSET. Return the scheduling
13212 fusion type of this INSN. */
13214 static enum sched_fusion_type
13215 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13217 rtx x, dest, src;
13218 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13220 gcc_assert (INSN_P (insn));
13221 x = PATTERN (insn);
13222 if (GET_CODE (x) != SET)
13223 return SCHED_FUSION_NONE;
13225 src = SET_SRC (x);
13226 dest = SET_DEST (x);
13228 machine_mode dest_mode = GET_MODE (dest);
13230 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13231 return SCHED_FUSION_NONE;
13233 if (GET_CODE (src) == SIGN_EXTEND)
13235 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13236 src = XEXP (src, 0);
13237 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13238 return SCHED_FUSION_NONE;
13240 else if (GET_CODE (src) == ZERO_EXTEND)
13242 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13243 src = XEXP (src, 0);
13244 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13245 return SCHED_FUSION_NONE;
13248 if (GET_CODE (src) == MEM && REG_P (dest))
13249 extract_base_offset_in_addr (src, base, offset);
13250 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13252 fusion = SCHED_FUSION_ST;
13253 extract_base_offset_in_addr (dest, base, offset);
13255 else
13256 return SCHED_FUSION_NONE;
13258 if (*base == NULL_RTX || *offset == NULL_RTX)
13259 fusion = SCHED_FUSION_NONE;
13261 return fusion;
13264 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13266 Currently we only support fusing ldr and str instructions, so FUSION_PRI
13267 and PRI are only calculated for these instructions. For other instructions,
13268 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
13269 other instruction types can be added by returning different priorities.
13271 It's important that irrelevant instructions get the largest FUSION_PRI. */
13273 static void
13274 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13275 int *fusion_pri, int *pri)
13277 int tmp, off_val;
13278 rtx base, offset;
13279 enum sched_fusion_type fusion;
13281 gcc_assert (INSN_P (insn));
13283 tmp = max_pri - 1;
13284 fusion = fusion_load_store (insn, &base, &offset);
13285 if (fusion == SCHED_FUSION_NONE)
13287 *pri = tmp;
13288 *fusion_pri = tmp;
13289 return;
13292 /* Set FUSION_PRI according to fusion type and base register. */
13293 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13295 /* Calculate PRI. */
13296 tmp /= 2;
13298 /* INSN with smaller offset goes first. */
13299 off_val = (int)(INTVAL (offset));
13300 if (off_val >= 0)
13301 tmp -= (off_val & 0xfffff);
13302 else
13303 tmp += ((- off_val) & 0xfffff);
13305 *pri = tmp;
13306 return;
13309 /* Given OPERANDS of consecutive load/store, check if we can merge
13310 them into ldp/stp. LOAD is true if they are load instructions.
13311 MODE is the mode of memory operands. */
13313 bool
13314 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13315 enum machine_mode mode)
13317 HOST_WIDE_INT offval_1, offval_2, msize;
13318 enum reg_class rclass_1, rclass_2;
13319 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13321 if (load)
13323 mem_1 = operands[1];
13324 mem_2 = operands[3];
13325 reg_1 = operands[0];
13326 reg_2 = operands[2];
13327 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13328 if (REGNO (reg_1) == REGNO (reg_2))
13329 return false;
13331 else
13333 mem_1 = operands[0];
13334 mem_2 = operands[2];
13335 reg_1 = operands[1];
13336 reg_2 = operands[3];
13339 /* The mems cannot be volatile. */
13340 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13341 return false;
13343 /* Check if the addresses are in the form of [base+offset]. */
13344 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13345 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13346 return false;
13347 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13348 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13349 return false;
13351 /* Check if the bases are the same. */
13352 if (!rtx_equal_p (base_1, base_2))
13353 return false;
13355 offval_1 = INTVAL (offset_1);
13356 offval_2 = INTVAL (offset_2);
13357 msize = GET_MODE_SIZE (mode);
13358 /* Check if the offsets are consecutive. */
13359 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13360 return false;
13362 /* Check if the addresses are clobbered by load. */
13363 if (load)
13365 if (reg_mentioned_p (reg_1, mem_1))
13366 return false;
13368 /* In increasing order, the last load can clobber the address. */
13369 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13370 return false;
13373 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13374 rclass_1 = FP_REGS;
13375 else
13376 rclass_1 = GENERAL_REGS;
13378 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13379 rclass_2 = FP_REGS;
13380 else
13381 rclass_2 = GENERAL_REGS;
13383 /* Check if the registers are of the same class. */
13384 if (rclass_1 != rclass_2)
13385 return false;
13387 return true;
13390 /* Given OPERANDS of consecutive load/store, check if we can merge
13391 them into ldp/stp by adjusting the offset. LOAD is true if they
13392 are load instructions. MODE is the mode of memory operands.
13394 Given the following consecutive stores:
13396 str w1, [xb, 0x100]
13397 str w1, [xb, 0x104]
13398 str w1, [xb, 0x108]
13399 str w1, [xb, 0x10c]
13401 Though the offsets are out of the range supported by stp, we can
13402 still pair them after adjusting the offset, like:
13404 add scratch, xb, 0x100
13405 stp w1, w1, [scratch]
13406 stp w1, w1, [scratch, 0x8]
13408 The peephole patterns detecting this opportunity should guarantee
13409 the scratch register is available. */
13411 bool
13412 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13413 enum machine_mode mode)
13415 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13416 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13417 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13418 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13420 if (load)
13422 reg_1 = operands[0];
13423 mem_1 = operands[1];
13424 reg_2 = operands[2];
13425 mem_2 = operands[3];
13426 reg_3 = operands[4];
13427 mem_3 = operands[5];
13428 reg_4 = operands[6];
13429 mem_4 = operands[7];
13430 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13431 && REG_P (reg_3) && REG_P (reg_4));
13432 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13433 return false;
13435 else
13437 mem_1 = operands[0];
13438 reg_1 = operands[1];
13439 mem_2 = operands[2];
13440 reg_2 = operands[3];
13441 mem_3 = operands[4];
13442 reg_3 = operands[5];
13443 mem_4 = operands[6];
13444 reg_4 = operands[7];
13446 /* Skip if the memory operand is by itself valid for ldp/stp. */
13447 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13448 return false;
13450 /* The mems cannot be volatile. */
13451 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13452 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
13453 return false;
13455 /* Check if the addresses are in the form of [base+offset]. */
13456 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13457 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13458 return false;
13459 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13460 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13461 return false;
13462 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13463 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13464 return false;
13465 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13466 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13467 return false;
13469 /* Check if the bases are the same. */
13470 if (!rtx_equal_p (base_1, base_2)
13471 || !rtx_equal_p (base_2, base_3)
13472 || !rtx_equal_p (base_3, base_4))
13473 return false;
13475 offval_1 = INTVAL (offset_1);
13476 offval_2 = INTVAL (offset_2);
13477 offval_3 = INTVAL (offset_3);
13478 offval_4 = INTVAL (offset_4);
13479 msize = GET_MODE_SIZE (mode);
13480 /* Check if the offsets are consecutive. */
13481 if ((offval_1 != (offval_2 + msize)
13482 || offval_1 != (offval_3 + msize * 2)
13483 || offval_1 != (offval_4 + msize * 3))
13484 && (offval_4 != (offval_3 + msize)
13485 || offval_4 != (offval_2 + msize * 2)
13486 || offval_4 != (offval_1 + msize * 3)))
13487 return false;
13489 /* Check if the addresses are clobbered by load. */
13490 if (load)
13492 if (reg_mentioned_p (reg_1, mem_1)
13493 || reg_mentioned_p (reg_2, mem_2)
13494 || reg_mentioned_p (reg_3, mem_3))
13495 return false;
13497 /* In increasing order, the last load can clobber the address. */
13498 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13499 return false;
13502 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13503 rclass_1 = FP_REGS;
13504 else
13505 rclass_1 = GENERAL_REGS;
13507 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13508 rclass_2 = FP_REGS;
13509 else
13510 rclass_2 = GENERAL_REGS;
13512 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13513 rclass_3 = FP_REGS;
13514 else
13515 rclass_3 = GENERAL_REGS;
13517 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13518 rclass_4 = FP_REGS;
13519 else
13520 rclass_4 = GENERAL_REGS;
13522 /* Check if the registers are of the same class. */
13523 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
13524 return false;
13526 return true;
13529 /* Given OPERANDS of consecutive load/store, this function pairs them
13530 into ldp/stp after adjusting the offset. It depends on the fact
13531 that addresses of load/store instructions are in increasing order.
13532 MODE is the mode of the memory operands. CODE is the rtl operator
13533 that should be applied to all memory operands; it is SIGN_EXTEND,
13534 ZERO_EXTEND or UNKNOWN. */
13536 bool
13537 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
13538 enum machine_mode mode, RTX_CODE code)
13540 rtx base, offset, t1, t2;
13541 rtx mem_1, mem_2, mem_3, mem_4;
13542 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
13544 if (load)
13546 mem_1 = operands[1];
13547 mem_2 = operands[3];
13548 mem_3 = operands[5];
13549 mem_4 = operands[7];
13551 else
13553 mem_1 = operands[0];
13554 mem_2 = operands[2];
13555 mem_3 = operands[4];
13556 mem_4 = operands[6];
13557 gcc_assert (code == UNKNOWN);
13560 extract_base_offset_in_addr (mem_1, &base, &offset);
13561 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
13563 /* Adjust the offset so that it fits in an ldp/stp instruction. */
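/* For example (illustrative): for SImode accesses (msize == 4,
   stp_off_limit == 0x100) starting at offset 0x104, this computes
   new_off == 4 and adj_off == 0x100, so the scratch register is set to
   base + 0x100 and the two ldp/stp instructions use offsets 4 and 12
   from it.  */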
13564 msize = GET_MODE_SIZE (mode);
13565 stp_off_limit = msize * 0x40;
13566 off_val = INTVAL (offset);
13567 abs_off = (off_val < 0) ? -off_val : off_val;
13568 new_off = abs_off % stp_off_limit;
13569 adj_off = abs_off - new_off;
13571 /* Further adjust to make sure all offsets are OK. */
13572 if ((new_off + msize * 2) >= stp_off_limit)
13574 adj_off += stp_off_limit;
13575 new_off -= stp_off_limit;
13578 /* Make sure the adjustment can be done with ADD/SUB instructions. */
13579 if (adj_off >= 0x1000)
13580 return false;
13582 if (off_val < 0)
13584 adj_off = -adj_off;
13585 new_off = -new_off;
13588 /* Create new memory references. */
13589 mem_1 = change_address (mem_1, VOIDmode,
13590 plus_constant (DImode, operands[8], new_off));
13592 /* Check if the adjusted address is OK for ldp/stp. */
13593 if (!aarch64_mem_pair_operand (mem_1, mode))
13594 return false;
13596 msize = GET_MODE_SIZE (mode);
13597 mem_2 = change_address (mem_2, VOIDmode,
13598 plus_constant (DImode,
13599 operands[8],
13600 new_off + msize));
13601 mem_3 = change_address (mem_3, VOIDmode,
13602 plus_constant (DImode,
13603 operands[8],
13604 new_off + msize * 2));
13605 mem_4 = change_address (mem_4, VOIDmode,
13606 plus_constant (DImode,
13607 operands[8],
13608 new_off + msize * 3));
13610 if (code == ZERO_EXTEND)
13612 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
13613 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
13614 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
13615 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
13617 else if (code == SIGN_EXTEND)
13619 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
13620 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
13621 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
13622 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
13625 if (load)
13627 operands[1] = mem_1;
13628 operands[3] = mem_2;
13629 operands[5] = mem_3;
13630 operands[7] = mem_4;
13632 else
13634 operands[0] = mem_1;
13635 operands[2] = mem_2;
13636 operands[4] = mem_3;
13637 operands[6] = mem_4;
13640 /* Emit adjusting instruction. */
13641 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
13642 /* Emit ldp/stp instructions. */
13643 t1 = gen_rtx_SET (operands[0], operands[1]);
13644 t2 = gen_rtx_SET (operands[2], operands[3]);
13645 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13646 t1 = gen_rtx_SET (operands[4], operands[5]);
13647 t2 = gen_rtx_SET (operands[6], operands[7]);
13648 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13649 return true;
13652 /* Return true if a pseudo register should be created and used to hold
13653 the GOT address for PIC code. */
13655 bool
13656 aarch64_use_pseudo_pic_reg (void)
13658 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
13661 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
13663 static int
13664 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
13666 switch (XINT (x, 1))
13668 case UNSPEC_GOTSMALLPIC:
13669 case UNSPEC_GOTSMALLPIC28K:
13670 case UNSPEC_GOTTINYPIC:
13671 return 0;
13672 default:
13673 break;
13676 return default_unspec_may_trap_p (x, flags);
13680 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
13681 return the log2 of that value. Otherwise return -1. */
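/* For example (illustrative): 8.0 yields 3, while 0.5, -4.0 and 3.0 all
   yield -1.  */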
13684 int aarch64_fpconst_pow_of_2 (rtx x)
13686 const REAL_VALUE_TYPE *r;
13688 if (!CONST_DOUBLE_P (x))
13689 return -1;
13691 r = CONST_DOUBLE_REAL_VALUE (x);
13693 if (REAL_VALUE_NEGATIVE (*r)
13694 || REAL_VALUE_ISNAN (*r)
13695 || REAL_VALUE_ISINF (*r)
13696 || !real_isinteger (r, DFmode))
13697 return -1;
13699 return exact_log2 (real_to_integer (r));
13702 /* If X is a vector of equal CONST_DOUBLE values and that value is
13703 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
13706 int aarch64_vec_fpconst_pow_of_2 (rtx x)
13708 if (GET_CODE (x) != CONST_VECTOR)
13709 return -1;
13711 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
13712 return -1;
13714 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
13715 if (firstval <= 0)
13716 return -1;
13718 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
13719 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
13720 return -1;
13722 return firstval;
13725 /* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float. */
13726 static tree
13727 aarch64_promoted_type (const_tree t)
13729 if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
13730 return float_type_node;
13731 return NULL_TREE;
13734 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
13736 static bool
13737 aarch64_optab_supported_p (int op, machine_mode, machine_mode,
13738 optimization_type opt_type)
13740 switch (op)
13742 case rsqrt_optab:
13743 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
13745 default:
13746 return true;
13750 #undef TARGET_ADDRESS_COST
13751 #define TARGET_ADDRESS_COST aarch64_address_cost
13753 /* This hook determines whether unnamed bitfields affect the alignment
13754 of the containing structure. The hook returns true if the structure
13755 should inherit the alignment requirements of an unnamed bitfield's
13756 type. */
13757 #undef TARGET_ALIGN_ANON_BITFIELD
13758 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
13760 #undef TARGET_ASM_ALIGNED_DI_OP
13761 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
13763 #undef TARGET_ASM_ALIGNED_HI_OP
13764 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
13766 #undef TARGET_ASM_ALIGNED_SI_OP
13767 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
13769 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
13770 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
13771 hook_bool_const_tree_hwi_hwi_const_tree_true
13773 #undef TARGET_ASM_FILE_START
13774 #define TARGET_ASM_FILE_START aarch64_start_file
13776 #undef TARGET_ASM_OUTPUT_MI_THUNK
13777 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
13779 #undef TARGET_ASM_SELECT_RTX_SECTION
13780 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
13782 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
13783 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
13785 #undef TARGET_BUILD_BUILTIN_VA_LIST
13786 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
13788 #undef TARGET_CALLEE_COPIES
13789 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
13791 #undef TARGET_CAN_ELIMINATE
13792 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
13794 #undef TARGET_CAN_INLINE_P
13795 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
13797 #undef TARGET_CANNOT_FORCE_CONST_MEM
13798 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
13800 #undef TARGET_CASE_VALUES_THRESHOLD
13801 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
13803 #undef TARGET_CONDITIONAL_REGISTER_USAGE
13804 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
13806 /* Only the least significant bit is used for initialization guard
13807 variables. */
13808 #undef TARGET_CXX_GUARD_MASK_BIT
13809 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
13811 #undef TARGET_C_MODE_FOR_SUFFIX
13812 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
13814 #ifdef TARGET_BIG_ENDIAN_DEFAULT
13815 #undef TARGET_DEFAULT_TARGET_FLAGS
13816 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
13817 #endif
13819 #undef TARGET_CLASS_MAX_NREGS
13820 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
13822 #undef TARGET_BUILTIN_DECL
13823 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
13825 #undef TARGET_BUILTIN_RECIPROCAL
13826 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
13828 #undef TARGET_EXPAND_BUILTIN
13829 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
13831 #undef TARGET_EXPAND_BUILTIN_VA_START
13832 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
13834 #undef TARGET_FOLD_BUILTIN
13835 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
13837 #undef TARGET_FUNCTION_ARG
13838 #define TARGET_FUNCTION_ARG aarch64_function_arg
13840 #undef TARGET_FUNCTION_ARG_ADVANCE
13841 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
13843 #undef TARGET_FUNCTION_ARG_BOUNDARY
13844 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
13846 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
13847 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
13849 #undef TARGET_FUNCTION_VALUE
13850 #define TARGET_FUNCTION_VALUE aarch64_function_value
13852 #undef TARGET_FUNCTION_VALUE_REGNO_P
13853 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
13855 #undef TARGET_FRAME_POINTER_REQUIRED
13856 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
13858 #undef TARGET_GIMPLE_FOLD_BUILTIN
13859 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
13861 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
13862 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
13864 #undef TARGET_INIT_BUILTINS
13865 #define TARGET_INIT_BUILTINS aarch64_init_builtins
13867 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
13868 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
13869 aarch64_ira_change_pseudo_allocno_class
13871 #undef TARGET_LEGITIMATE_ADDRESS_P
13872 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
13874 #undef TARGET_LEGITIMATE_CONSTANT_P
13875 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
13877 #undef TARGET_LIBGCC_CMP_RETURN_MODE
13878 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
13880 #undef TARGET_LRA_P
13881 #define TARGET_LRA_P hook_bool_void_true
13883 #undef TARGET_MANGLE_TYPE
13884 #define TARGET_MANGLE_TYPE aarch64_mangle_type
13886 #undef TARGET_MEMORY_MOVE_COST
13887 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
13889 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
13890 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
13892 #undef TARGET_MUST_PASS_IN_STACK
13893 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
13895 /* This target hook should return true if accesses to volatile bitfields
13896 should use the narrowest mode possible. It should return false if these
13897 accesses should use the bitfield container type. */
13898 #undef TARGET_NARROW_VOLATILE_BITFIELD
13899 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
13901 #undef TARGET_OPTION_OVERRIDE
13902 #define TARGET_OPTION_OVERRIDE aarch64_override_options
13904 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
13905 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
13906 aarch64_override_options_after_change
13908 #undef TARGET_OPTION_SAVE
13909 #define TARGET_OPTION_SAVE aarch64_option_save
13911 #undef TARGET_OPTION_RESTORE
13912 #define TARGET_OPTION_RESTORE aarch64_option_restore
13914 #undef TARGET_OPTION_PRINT
13915 #define TARGET_OPTION_PRINT aarch64_option_print
13917 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
13918 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
13920 #undef TARGET_SET_CURRENT_FUNCTION
13921 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
13923 #undef TARGET_PASS_BY_REFERENCE
13924 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
13926 #undef TARGET_PREFERRED_RELOAD_CLASS
13927 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
13929 #undef TARGET_SCHED_REASSOCIATION_WIDTH
13930 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
13932 #undef TARGET_PROMOTED_TYPE
13933 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
13935 #undef TARGET_SECONDARY_RELOAD
13936 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
13938 #undef TARGET_SHIFT_TRUNCATION_MASK
13939 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
13941 #undef TARGET_SETUP_INCOMING_VARARGS
13942 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
13944 #undef TARGET_STRUCT_VALUE_RTX
13945 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
13947 #undef TARGET_REGISTER_MOVE_COST
13948 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
13950 #undef TARGET_RETURN_IN_MEMORY
13951 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
13953 #undef TARGET_RETURN_IN_MSB
13954 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
13956 #undef TARGET_RTX_COSTS
13957 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
13959 #undef TARGET_SCHED_ISSUE_RATE
13960 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
13962 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
13963 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
13964 aarch64_sched_first_cycle_multipass_dfa_lookahead
13966 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
13967 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
13968 aarch64_first_cycle_multipass_dfa_lookahead_guard
13970 #undef TARGET_TRAMPOLINE_INIT
13971 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
13973 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
13974 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
13976 #undef TARGET_VECTOR_MODE_SUPPORTED_P
13977 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
13979 #undef TARGET_ARRAY_MODE_SUPPORTED_P
13980 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
13982 #undef TARGET_VECTORIZE_ADD_STMT_COST
13983 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
13985 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
13986 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
13987 aarch64_builtin_vectorization_cost
13989 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
13990 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
13992 #undef TARGET_VECTORIZE_BUILTINS
13993 #define TARGET_VECTORIZE_BUILTINS
13995 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
13996 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
13997 aarch64_builtin_vectorized_function
13999 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14000 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14001 aarch64_autovectorize_vector_sizes
14003 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14004 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14005 aarch64_atomic_assign_expand_fenv
14007 /* Section anchor support. */
14009 #undef TARGET_MIN_ANCHOR_OFFSET
14010 #define TARGET_MIN_ANCHOR_OFFSET -256
14012 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14013 byte offset; we can do much more for larger data types, but have no way
14014 to determine the size of the access. We assume accesses are aligned. */
14015 #undef TARGET_MAX_ANCHOR_OFFSET
14016 #define TARGET_MAX_ANCHOR_OFFSET 4095
14018 #undef TARGET_VECTOR_ALIGNMENT
14019 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14021 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14022 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14023 aarch64_simd_vector_alignment_reachable
14025 /* vec_perm support. */
14027 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14028 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14029 aarch64_vectorize_vec_perm_const_ok
14031 #undef TARGET_INIT_LIBFUNCS
14032 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14034 #undef TARGET_FIXED_CONDITION_CODE_REGS
14035 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14037 #undef TARGET_FLAGS_REGNUM
14038 #define TARGET_FLAGS_REGNUM CC_REGNUM
14040 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
14041 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
14043 #undef TARGET_ASAN_SHADOW_OFFSET
14044 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
14046 #undef TARGET_LEGITIMIZE_ADDRESS
14047 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
14049 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
14050 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
14051 aarch64_use_by_pieces_infrastructure_p
14053 #undef TARGET_CAN_USE_DOLOOP_P
14054 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
14056 #undef TARGET_SCHED_MACRO_FUSION_P
14057 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
14059 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
14060 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
14062 #undef TARGET_SCHED_FUSION_PRIORITY
14063 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
14065 #undef TARGET_UNSPEC_MAY_TRAP_P
14066 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
14068 #undef TARGET_USE_PSEUDO_PIC_REG
14069 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
14071 #undef TARGET_PRINT_OPERAND
14072 #define TARGET_PRINT_OPERAND aarch64_print_operand
14074 #undef TARGET_PRINT_OPERAND_ADDRESS
14075 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
14077 #undef TARGET_OPTAB_SUPPORTED_P
14078 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
14080 #undef TARGET_OMIT_STRUCT_RETURN_REG
14081 #define TARGET_OMIT_STRUCT_RETURN_REG true
14083 struct gcc_target targetm = TARGET_INITIALIZER;
14085 #include "gt-aarch64.h"