gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2016 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "backend.h"
  25 #include "target.h"
  26 #include "rtl.h"
  27 #include "tree.h"
  28 #include "gimple.h"
  29 #include "cfghooks.h"
  30 #include "cfgloop.h"
  31 #include "df.h"
  32 #include "tm_p.h"
  33 #include "stringpool.h"
  34 #include "optabs.h"
  35 #include "regs.h"
  36 #include "emit-rtl.h"
  37 #include "recog.h"
  38 #include "diagnostic.h"
  39 #include "insn-attr.h"
  40 #include "alias.h"
  41 #include "fold-const.h"
  42 #include "stor-layout.h"
  43 #include "calls.h"
  44 #include "varasm.h"
  45 #include "output.h"
  46 #include "flags.h"
  47 #include "explow.h"
  48 #include "expr.h"
  49 #include "reload.h"
  50 #include "langhooks.h"
  51 #include "opts.h"
  52 #include "params.h"
  53 #include "gimplify.h"
  54 #include "dwarf2.h"
  55 #include "gimple-iterator.h"
  56 #include "tree-vectorizer.h"
  57 #include "aarch64-cost-tables.h"
  58 #include "dumpfile.h"
  59 #include "builtins.h"
  60 #include "rtl-iter.h"
  61 #include "tm-constrs.h"
  62 #include "sched-int.h"
  63 #include "cortex-a57-fma-steering.h"
  64 #include "target-globals.h"
  65 #include "common/common-target.h"
  66
  67 /* This file should be included last.  */
  68 #include "target-def.h"
  69
  70 /* Defined for convenience.  */
  71 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  72
  73 /* Classifies an address.
  74
  75    ADDRESS_REG_IMM
  76        A simple base register plus immediate offset.
  77
  78    ADDRESS_REG_WB
  79        A base register indexed by immediate offset with writeback.
  80
  81    ADDRESS_REG_REG
  82        A base register indexed by (optionally scaled) register.
  83
  84    ADDRESS_REG_UXTW
  85        A base register indexed by (optionally scaled) zero-extended register.
  86
  87    ADDRESS_REG_SXTW
  88        A base register indexed by (optionally scaled) sign-extended register.
  89
  90    ADDRESS_LO_SUM
  91        A LO_SUM rtx with a base register and "LO12" symbol relocation.
  92
  93    ADDRESS_SYMBOLIC:
  94        A constant symbolic address, in pc-relative literal pool.  */
  95
  96 enum aarch64_address_type {
  97   ADDRESS_REG_IMM,
  98   ADDRESS_REG_WB,
  99   ADDRESS_REG_REG,
 100   ADDRESS_REG_UXTW,
 101   ADDRESS_REG_SXTW,
 102   ADDRESS_LO_SUM,
 103   ADDRESS_SYMBOLIC
 104 };
 105
 106 struct aarch64_address_info {
 107   enum aarch64_address_type type;
 108   rtx base;
 109   rtx offset;
 110   int shift;
 111   enum aarch64_symbol_type symbol_type;
 112 };
 113
 114 struct simd_immediate_info
 115 {
 116   rtx value;
 117   int shift;
 118   int element_width;
 119   bool mvn;
 120   bool msl;
 121 };
 122
 123 /* The current code model.  */
 124 enum aarch64_code_model aarch64_cmodel;
 125
 126 #ifdef HAVE_AS_TLS
 127 #undef TARGET_HAVE_TLS
 128 #define TARGET_HAVE_TLS 1
 129 #endif
 130
 131 static bool aarch64_composite_type_p (const_tree, machine_mode);
 132 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 133                                                      const_tree,
 134                                                      machine_mode *, int *,
 135                                                      bool *);
 136 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 137 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 138 static void aarch64_override_options_after_change (void);
 139 static bool aarch64_vector_mode_supported_p (machine_mode);
 140 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 141                                                  const unsigned char *sel);
 142 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 143
 144 /* Major revision number of the ARM Architecture implemented by the target.  */
 145 unsigned aarch64_architecture_version;
 146
 147 /* The processor for which instructions should be scheduled.  */
 148 enum aarch64_processor aarch64_tune = cortexa53;
 149
 150 /* Mask to specify which instruction scheduling options should be used.  */
 151 unsigned long aarch64_tune_flags = 0;
 152
 153 /* Global flag for PC relative loads.  */
 154 bool aarch64_nopcrelative_literal_loads;
 155
 156 /* Support for command line parsing of boolean flags in the tuning
 157    structures.  */
 158 struct aarch64_flag_desc
 159 {
 160   const char* name;
 161   unsigned int flag;
 162 };
 163
 164 #define AARCH64_FUSION_PAIR(name, internal_name) \
 165   { name, AARCH64_FUSE_##internal_name },
 166 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 167 {
 168   { "none", AARCH64_FUSE_NOTHING },
 169 #include "aarch64-fusion-pairs.def"
 170   { "all", AARCH64_FUSE_ALL },
 171   { NULL, AARCH64_FUSE_NOTHING }
 172 };
 173 #undef AARCH64_FUION_PAIR
 174
 175 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 176   { name, AARCH64_EXTRA_TUNE_##internal_name },
 177 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 178 {
 179   { "none", AARCH64_EXTRA_TUNE_NONE },
 180 #include "aarch64-tuning-flags.def"
 181   { "all", AARCH64_EXTRA_TUNE_ALL },
 182   { NULL, AARCH64_EXTRA_TUNE_NONE }
 183 };
 184 #undef AARCH64_EXTRA_TUNING_OPTION
 185
 186 /* Tuning parameters.  */
 187
 188 static const struct cpu_addrcost_table generic_addrcost_table =
 189 {
 190     {
 191       0, /* hi  */
 192       0, /* si  */
 193       0, /* di  */
 194       0, /* ti  */
 195     },
 196   0, /* pre_modify  */
 197   0, /* post_modify  */
 198   0, /* register_offset  */
 199   0, /* register_sextend  */
 200   0, /* register_zextend  */
 201   0 /* imm_offset  */
 202 };
 203
 204 static const struct cpu_addrcost_table cortexa57_addrcost_table =
 205 {
 206     {
 207       1, /* hi  */
 208       0, /* si  */
 209       0, /* di  */
 210       1, /* ti  */
 211     },
 212   0, /* pre_modify  */
 213   0, /* post_modify  */
 214   0, /* register_offset  */
 215   0, /* register_sextend  */
 216   0, /* register_zextend  */
 217   0, /* imm_offset  */
 218 };
 219
 220 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 221 {
 222     {
 223       0, /* hi  */
 224       0, /* si  */
 225       0, /* di  */
 226       2, /* ti  */
 227     },
 228   0, /* pre_modify  */
 229   0, /* post_modify  */
 230   1, /* register_offset  */
 231   1, /* register_sextend  */
 232   2, /* register_zextend  */
 233   0, /* imm_offset  */
 234 };
 235
 236 static const struct cpu_addrcost_table xgene1_addrcost_table =
 237 {
 238     {
 239       1, /* hi  */
 240       0, /* si  */
 241       0, /* di  */
 242       1, /* ti  */
 243     },
 244   1, /* pre_modify  */
 245   0, /* post_modify  */
 246   0, /* register_offset  */
 247   1, /* register_sextend  */
 248   1, /* register_zextend  */
 249   0, /* imm_offset  */
 250 };
 251
 252 static const struct cpu_regmove_cost generic_regmove_cost =
 253 {
 254   1, /* GP2GP  */
 255   /* Avoid the use of slow int<->fp moves for spilling by setting
 256      their cost higher than memmov_cost.  */
 257   5, /* GP2FP  */
 258   5, /* FP2GP  */
 259   2 /* FP2FP  */
 260 };
 261
 262 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 263 {
 264   1, /* GP2GP  */
 265   /* Avoid the use of slow int<->fp moves for spilling by setting
 266      their cost higher than memmov_cost.  */
 267   5, /* GP2FP  */
 268   5, /* FP2GP  */
 269   2 /* FP2FP  */
 270 };
 271
 272 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 273 {
 274   1, /* GP2GP  */
 275   /* Avoid the use of slow int<->fp moves for spilling by setting
 276      their cost higher than memmov_cost.  */
 277   5, /* GP2FP  */
 278   5, /* FP2GP  */
 279   2 /* FP2FP  */
 280 };
 281
 282 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 283 {
 284   1, /* GP2GP  */
 285   /* Avoid the use of slow int<->fp moves for spilling by setting
 286      their cost higher than memmov_cost (actual, 4 and 9).  */
 287   9, /* GP2FP  */
 288   9, /* FP2GP  */
 289   1 /* FP2FP  */
 290 };
 291
 292 static const struct cpu_regmove_cost thunderx_regmove_cost =
 293 {
 294   2, /* GP2GP  */
 295   2, /* GP2FP  */
 296   6, /* FP2GP  */
 297   4 /* FP2FP  */
 298 };
 299
 300 static const struct cpu_regmove_cost xgene1_regmove_cost =
 301 {
 302   1, /* GP2GP  */
 303   /* Avoid the use of slow int<->fp moves for spilling by setting
 304      their cost higher than memmov_cost.  */
 305   8, /* GP2FP  */
 306   8, /* FP2GP  */
 307   2 /* FP2FP  */
 308 };
 309
 310 /* Generic costs for vector insn classes.  */
 311 static const struct cpu_vector_cost generic_vector_cost =
 312 {
 313   1, /* scalar_stmt_cost  */
 314   1, /* scalar_load_cost  */
 315   1, /* scalar_store_cost  */
 316   1, /* vec_stmt_cost  */
 317   1, /* vec_to_scalar_cost  */
 318   1, /* scalar_to_vec_cost  */
 319   1, /* vec_align_load_cost  */
 320   1, /* vec_unalign_load_cost  */
 321   1, /* vec_unalign_store_cost  */
 322   1, /* vec_store_cost  */
 323   3, /* cond_taken_branch_cost  */
 324   1 /* cond_not_taken_branch_cost  */
 325 };
 326
 327 /* Generic costs for vector insn classes.  */
 328 static const struct cpu_vector_cost cortexa57_vector_cost =
 329 {
 330   1, /* scalar_stmt_cost  */
 331   4, /* scalar_load_cost  */
 332   1, /* scalar_store_cost  */
 333   3, /* vec_stmt_cost  */
 334   8, /* vec_to_scalar_cost  */
 335   8, /* scalar_to_vec_cost  */
 336   5, /* vec_align_load_cost  */
 337   5, /* vec_unalign_load_cost  */
 338   1, /* vec_unalign_store_cost  */
 339   1, /* vec_store_cost  */
 340   1, /* cond_taken_branch_cost  */
 341   1 /* cond_not_taken_branch_cost  */
 342 };
 343
 344 static const struct cpu_vector_cost exynosm1_vector_cost =
 345 {
 346   1, /* scalar_stmt_cost  */
 347   5, /* scalar_load_cost  */
 348   1, /* scalar_store_cost  */
 349   3, /* vec_stmt_cost  */
 350   3, /* vec_to_scalar_cost  */
 351   3, /* scalar_to_vec_cost  */
 352   5, /* vec_align_load_cost  */
 353   5, /* vec_unalign_load_cost  */
 354   1, /* vec_unalign_store_cost  */
 355   1, /* vec_store_cost  */
 356   1, /* cond_taken_branch_cost  */
 357   1 /* cond_not_taken_branch_cost  */
 358 };
 359
 360 /* Generic costs for vector insn classes.  */
 361 static const struct cpu_vector_cost xgene1_vector_cost =
 362 {
 363   1, /* scalar_stmt_cost  */
 364   5, /* scalar_load_cost  */
 365   1, /* scalar_store_cost  */
 366   2, /* vec_stmt_cost  */
 367   4, /* vec_to_scalar_cost  */
 368   4, /* scalar_to_vec_cost  */
 369   10, /* vec_align_load_cost  */
 370   10, /* vec_unalign_load_cost  */
 371   2, /* vec_unalign_store_cost  */
 372   2, /* vec_store_cost  */
 373   2, /* cond_taken_branch_cost  */
 374   1 /* cond_not_taken_branch_cost  */
 375 };
 376
 377 /* Generic costs for branch instructions.  */
 378 static const struct cpu_branch_cost generic_branch_cost =
 379 {
 380   2,  /* Predictable.  */
 381   2   /* Unpredictable.  */
 382 };
 383
 384 /* Branch costs for Cortex-A57.  */
 385 static const struct cpu_branch_cost cortexa57_branch_cost =
 386 {
 387   1,  /* Predictable.  */
 388   3   /* Unpredictable.  */
 389 };
 390
 391 static const struct tune_params generic_tunings =
 392 {
 393   &cortexa57_extra_costs,
 394   &generic_addrcost_table,
 395   &generic_regmove_cost,
 396   &generic_vector_cost,
 397   &generic_branch_cost,
 398   4, /* memmov_cost  */
 399   2, /* issue_rate  */
 400   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 401   8,    /* function_align.  */
 402   8,    /* jump_align.  */
 403   4,    /* loop_align.  */
 404   2,    /* int_reassoc_width.  */
 405   4,    /* fp_reassoc_width.  */
 406   1,    /* vec_reassoc_width.  */
 407   2,    /* min_div_recip_mul_sf.  */
 408   2,    /* min_div_recip_mul_df.  */
 409   0,    /* max_case_values.  */
 410   0,    /* cache_line_size.  */
 411   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 412   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 413 };
 414
 415 static const struct tune_params cortexa35_tunings =
 416 {
 417   &cortexa53_extra_costs,
 418   &generic_addrcost_table,
 419   &cortexa53_regmove_cost,
 420   &generic_vector_cost,
 421   &generic_branch_cost,
 422   4, /* memmov_cost  */
 423   1, /* issue_rate  */
 424   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 425    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 426   8,    /* function_align.  */
 427   8,    /* jump_align.  */
 428   4,    /* loop_align.  */
 429   2,    /* int_reassoc_width.  */
 430   4,    /* fp_reassoc_width.  */
 431   1,    /* vec_reassoc_width.  */
 432   2,    /* min_div_recip_mul_sf.  */
 433   2,    /* min_div_recip_mul_df.  */
 434   0,    /* max_case_values.  */
 435   0,    /* cache_line_size.  */
 436   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 437   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 438 };
 439
 440 static const struct tune_params cortexa53_tunings =
 441 {
 442   &cortexa53_extra_costs,
 443   &generic_addrcost_table,
 444   &cortexa53_regmove_cost,
 445   &generic_vector_cost,
 446   &generic_branch_cost,
 447   4, /* memmov_cost  */
 448   2, /* issue_rate  */
 449   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 450    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 451   8,    /* function_align.  */
 452   8,    /* jump_align.  */
 453   4,    /* loop_align.  */
 454   2,    /* int_reassoc_width.  */
 455   4,    /* fp_reassoc_width.  */
 456   1,    /* vec_reassoc_width.  */
 457   2,    /* min_div_recip_mul_sf.  */
 458   2,    /* min_div_recip_mul_df.  */
 459   0,    /* max_case_values.  */
 460   0,    /* cache_line_size.  */
 461   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 462   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 463 };
 464
 465 static const struct tune_params cortexa57_tunings =
 466 {
 467   &cortexa57_extra_costs,
 468   &cortexa57_addrcost_table,
 469   &cortexa57_regmove_cost,
 470   &cortexa57_vector_cost,
 471   &cortexa57_branch_cost,
 472   4, /* memmov_cost  */
 473   3, /* issue_rate  */
 474   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 475    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 476   16,   /* function_align.  */
 477   8,    /* jump_align.  */
 478   4,    /* loop_align.  */
 479   2,    /* int_reassoc_width.  */
 480   4,    /* fp_reassoc_width.  */
 481   1,    /* vec_reassoc_width.  */
 482   2,    /* min_div_recip_mul_sf.  */
 483   2,    /* min_div_recip_mul_df.  */
 484   0,    /* max_case_values.  */
 485   0,    /* cache_line_size.  */
 486   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 487   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
 488    | AARCH64_EXTRA_TUNE_RECIP_SQRT)     /* tune_flags.  */
 489 };
 490
 491 static const struct tune_params cortexa72_tunings =
 492 {
 493   &cortexa57_extra_costs,
 494   &cortexa57_addrcost_table,
 495   &cortexa57_regmove_cost,
 496   &cortexa57_vector_cost,
 497   &generic_branch_cost,
 498   4, /* memmov_cost  */
 499   3, /* issue_rate  */
 500   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 501    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 502   16,   /* function_align.  */
 503   8,    /* jump_align.  */
 504   4,    /* loop_align.  */
 505   2,    /* int_reassoc_width.  */
 506   4,    /* fp_reassoc_width.  */
 507   1,    /* vec_reassoc_width.  */
 508   2,    /* min_div_recip_mul_sf.  */
 509   2,    /* min_div_recip_mul_df.  */
 510   0,    /* max_case_values.  */
 511   0,    /* cache_line_size.  */
 512   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 513   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 514 };
 515
 516 static const struct tune_params exynosm1_tunings =
 517 {
 518   &exynosm1_extra_costs,
 519   &exynosm1_addrcost_table,
 520   &exynosm1_regmove_cost,
 521   &exynosm1_vector_cost,
 522   &generic_branch_cost,
 523   4,    /* memmov_cost  */
 524   3,    /* issue_rate  */
 525   (AARCH64_FUSE_NOTHING), /* fusible_ops  */
 526   4,    /* function_align.  */
 527   4,    /* jump_align.  */
 528   4,    /* loop_align.  */
 529   2,    /* int_reassoc_width.  */
 530   4,    /* fp_reassoc_width.  */
 531   1,    /* vec_reassoc_width.  */
 532   2,    /* min_div_recip_mul_sf.  */
 533   2,    /* min_div_recip_mul_df.  */
 534   48,   /* max_case_values.  */
 535   64,   /* cache_line_size.  */
 536   tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
 537   (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
 538 };
 539
 540 static const struct tune_params thunderx_tunings =
 541 {
 542   &thunderx_extra_costs,
 543   &generic_addrcost_table,
 544   &thunderx_regmove_cost,
 545   &generic_vector_cost,
 546   &generic_branch_cost,
 547   6, /* memmov_cost  */
 548   2, /* issue_rate  */
 549   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 550   8,    /* function_align.  */
 551   8,    /* jump_align.  */
 552   8,    /* loop_align.  */
 553   2,    /* int_reassoc_width.  */
 554   4,    /* fp_reassoc_width.  */
 555   1,    /* vec_reassoc_width.  */
 556   2,    /* min_div_recip_mul_sf.  */
 557   2,    /* min_div_recip_mul_df.  */
 558   0,    /* max_case_values.  */
 559   0,    /* cache_line_size.  */
 560   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 561   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 562 };
 563
 564 static const struct tune_params xgene1_tunings =
 565 {
 566   &xgene1_extra_costs,
 567   &xgene1_addrcost_table,
 568   &xgene1_regmove_cost,
 569   &xgene1_vector_cost,
 570   &generic_branch_cost,
 571   6, /* memmov_cost  */
 572   4, /* issue_rate  */
 573   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 574   16,   /* function_align.  */
 575   8,    /* jump_align.  */
 576   16,   /* loop_align.  */
 577   2,    /* int_reassoc_width.  */
 578   4,    /* fp_reassoc_width.  */
 579   1,    /* vec_reassoc_width.  */
 580   2,    /* min_div_recip_mul_sf.  */
 581   2,    /* min_div_recip_mul_df.  */
 582   0,    /* max_case_values.  */
 583   0,    /* cache_line_size.  */
 584   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 585   (AARCH64_EXTRA_TUNE_RECIP_SQRT)       /* tune_flags.  */
 586 };
 587
 588 /* Support for fine-grained override of the tuning structures.  */
 589 struct aarch64_tuning_override_function
 590 {
 591   const char* name;
 592   void (*parse_override)(const char*, struct tune_params*);
 593 };
 594
 595 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
 596 static void aarch64_parse_tune_string (const char*, struct tune_params*);
 597
 598 static const struct aarch64_tuning_override_function
 599 aarch64_tuning_override_functions[] =
 600 {
 601   { "fuse", aarch64_parse_fuse_string },
 602   { "tune", aarch64_parse_tune_string },
 603   { NULL, NULL }
 604 };
 605
 606 /* A processor implementing AArch64.  */
 607 struct processor
 608 {
 609   const char *const name;
 610   enum aarch64_processor ident;
 611   enum aarch64_processor sched_core;
 612   enum aarch64_arch arch;
 613   unsigned architecture_version;
 614   const unsigned long flags;
 615   const struct tune_params *const tune;
 616 };
 617
 618 /* Architectures implementing AArch64.  */
 619 static const struct processor all_architectures[] =
 620 {
 621 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
 622   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
 623 #include "aarch64-arches.def"
 624 #undef AARCH64_ARCH
 625   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 626 };
 627
 628 /* Processor cores implementing AArch64.  */
 629 static const struct processor all_cores[] =
 630 {
 631 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
 632   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
 633   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
 634   FLAGS, &COSTS##_tunings},
 635 #include "aarch64-cores.def"
 636 #undef AARCH64_CORE
 637   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
 638     AARCH64_FL_FOR_ARCH8, &generic_tunings},
 639   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 640 };
 641
 642
 643 /* Target specification.  These are populated by the -march, -mtune, -mcpu
 644    handling code or by target attributes.  */
 645 static const struct processor *selected_arch;
 646 static const struct processor *selected_cpu;
 647 static const struct processor *selected_tune;
 648
 649 /* The current tuning set.  */
 650 struct tune_params aarch64_tune_params = generic_tunings;
 651
 652 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 653
 654 /* An ISA extension in the co-processor and main instruction set space.  */
 655 struct aarch64_option_extension
 656 {
 657   const char *const name;
 658   const unsigned long flags_on;
 659   const unsigned long flags_off;
 660 };
 661
 662 /* ISA extensions in AArch64.  */
 663 static const struct aarch64_option_extension all_extensions[] =
 664 {
 665 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
 666   {NAME, FLAGS_ON, FLAGS_OFF},
 667 #include "aarch64-option-extensions.def"
 668 #undef AARCH64_OPT_EXTENSION
 669   {NULL, 0, 0}
 670 };
 671
 672 typedef enum aarch64_cond_code
 673 {
 674   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
 675   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
 676   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
 677 }
 678 aarch64_cc;
 679
 680 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
 681
 682 /* The condition codes of the processor, and the inverse function.  */
 683 static const char * const aarch64_condition_codes[] =
 684 {
 685   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
 686   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
 687 };
 688
 689 /* Generate code to enable conditional branches in functions over 1 MiB.  */
 690 const char *
 691 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
 692                         const char * branch_format)
 693 {
 694     rtx_code_label * tmp_label = gen_label_rtx ();
 695     char label_buf[256];
 696     char buffer[128];
 697     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
 698                                  CODE_LABEL_NUMBER (tmp_label));
 699     const char *label_ptr = targetm.strip_name_encoding (label_buf);
 700     rtx dest_label = operands[pos_label];
 701     operands[pos_label] = tmp_label;
 702
 703     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
 704     output_asm_insn (buffer, operands);
 705
 706     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
 707     operands[pos_label] = dest_label;
 708     output_asm_insn (buffer, operands);
 709     return "";
 710 }
 711
 712 void
 713 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
 714 {
 715   const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
 716   if (TARGET_GENERAL_REGS_ONLY)
 717     error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
 718   else
 719     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
 720 }
 721
 722 static unsigned int
 723 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
 724 {
 725   if (GET_MODE_UNIT_SIZE (mode) == 4)
 726     return aarch64_tune_params.min_div_recip_mul_sf;
 727   return aarch64_tune_params.min_div_recip_mul_df;
 728 }
 729
 730 static int
 731 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
 732                              enum machine_mode mode)
 733 {
 734   if (VECTOR_MODE_P (mode))
 735     return aarch64_tune_params.vec_reassoc_width;
 736   if (INTEGRAL_MODE_P (mode))
 737     return aarch64_tune_params.int_reassoc_width;
 738   if (FLOAT_MODE_P (mode))
 739     return aarch64_tune_params.fp_reassoc_width;
 740   return 1;
 741 }
 742
 743 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
 744 unsigned
 745 aarch64_dbx_register_number (unsigned regno)
 746 {
 747    if (GP_REGNUM_P (regno))
 748      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
 749    else if (regno == SP_REGNUM)
 750      return AARCH64_DWARF_SP;
 751    else if (FP_REGNUM_P (regno))
 752      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
 753
 754    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
 755       equivalent DWARF register.  */
 756    return DWARF_FRAME_REGISTERS;
 757 }
 758
 759 /* Return TRUE if MODE is any of the large INT modes.  */
 760 static bool
 761 aarch64_vect_struct_mode_p (machine_mode mode)
 762 {
 763   return mode == OImode || mode == CImode || mode == XImode;
 764 }
 765
 766 /* Return TRUE if MODE is any of the vector modes.  */
 767 static bool
 768 aarch64_vector_mode_p (machine_mode mode)
 769 {
 770   return aarch64_vector_mode_supported_p (mode)
 771          || aarch64_vect_struct_mode_p (mode);
 772 }
 773
 774 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
 775 static bool
 776 aarch64_array_mode_supported_p (machine_mode mode,
 777                                 unsigned HOST_WIDE_INT nelems)
 778 {
 779   if (TARGET_SIMD
 780       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
 781           || AARCH64_VALID_SIMD_DREG_MODE (mode))
 782       && (nelems >= 2 && nelems <= 4))
 783     return true;
 784
 785   return false;
 786 }
 787
 788 /* Implement HARD_REGNO_NREGS.  */
 789
 790 int
 791 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
 792 {
 793   switch (aarch64_regno_regclass (regno))
 794     {
 795     case FP_REGS:
 796     case FP_LO_REGS:
 797       return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
 798     default:
 799       return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
 800     }
 801   gcc_unreachable ();
 802 }
 803
 804 /* Implement HARD_REGNO_MODE_OK.  */
 805
 806 int
 807 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
 808 {
 809   if (GET_MODE_CLASS (mode) == MODE_CC)
 810     return regno == CC_REGNUM;
 811
 812   if (regno == SP_REGNUM)
 813     /* The purpose of comparing with ptr_mode is to support the
 814        global register variable associated with the stack pointer
 815        register via the syntax of asm ("wsp") in ILP32.  */
 816     return mode == Pmode || mode == ptr_mode;
 817
 818   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
 819     return mode == Pmode;
 820
 821   if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
 822     return 1;
 823
 824   if (FP_REGNUM_P (regno))
 825     {
 826       if (aarch64_vect_struct_mode_p (mode))
 827         return
 828           (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
 829       else
 830         return 1;
 831     }
 832
 833   return 0;
 834 }
 835
 836 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
 837 machine_mode
 838 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
 839                                      machine_mode mode)
 840 {
 841   /* Handle modes that fit within single registers.  */
 842   if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
 843     {
 844       if (GET_MODE_SIZE (mode) >= 4)
 845         return mode;
 846       else
 847         return SImode;
 848     }
 849   /* Fall back to generic for multi-reg and very large modes.  */
 850   else
 851     return choose_hard_reg_mode (regno, nregs, false);
 852 }
 853
 854 /* Return true if calls to DECL should be treated as
 855    long-calls (ie called via a register).  */
 856 static bool
 857 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
 858 {
 859   return false;
 860 }
 861
 862 /* Return true if calls to symbol-ref SYM should be treated as
 863    long-calls (ie called via a register).  */
 864 bool
 865 aarch64_is_long_call_p (rtx sym)
 866 {
 867   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
 868 }
 869
 870 /* Return true if calls to symbol-ref SYM should not go through
 871    plt stubs.  */
 872
 873 bool
 874 aarch64_is_noplt_call_p (rtx sym)
 875 {
 876   const_tree decl = SYMBOL_REF_DECL (sym);
 877
 878   if (flag_pic
 879       && decl
 880       && (!flag_plt
 881           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
 882       && !targetm.binds_local_p (decl))
 883     return true;
 884
 885   return false;
 886 }
 887
 888 /* Return true if the offsets to a zero/sign-extract operation
 889    represent an expression that matches an extend operation.  The
 890    operands represent the paramters from
 891
 892    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
 893 bool
 894 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
 895                                 rtx extract_imm)
 896 {
 897   HOST_WIDE_INT mult_val, extract_val;
 898
 899   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
 900     return false;
 901
 902   mult_val = INTVAL (mult_imm);
 903   extract_val = INTVAL (extract_imm);
 904
 905   if (extract_val > 8
 906       && extract_val < GET_MODE_BITSIZE (mode)
 907       && exact_log2 (extract_val & ~7) > 0
 908       && (extract_val & 7) <= 4
 909       && mult_val == (1 << (extract_val & 7)))
 910     return true;
 911
 912   return false;
 913 }
 914
 915 /* Emit an insn that's a simple single-set.  Both the operands must be
 916    known to be valid.  */
 917 inline static rtx
 918 emit_set_insn (rtx x, rtx y)
 919 {
 920   return emit_insn (gen_rtx_SET (x, y));
 921 }
 922
 923 /* X and Y are two things to compare using CODE.  Emit the compare insn and
 924    return the rtx for register 0 in the proper mode.  */
 925 rtx
 926 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
 927 {
 928   machine_mode mode = SELECT_CC_MODE (code, x, y);
 929   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
 930
 931   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
 932   return cc_reg;
 933 }
 934
 935 /* Build the SYMBOL_REF for __tls_get_addr.  */
 936
 937 static GTY(()) rtx tls_get_addr_libfunc;
 938
 939 rtx
 940 aarch64_tls_get_addr (void)
 941 {
 942   if (!tls_get_addr_libfunc)
 943     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
 944   return tls_get_addr_libfunc;
 945 }
 946
 947 /* Return the TLS model to use for ADDR.  */
 948
 949 static enum tls_model
 950 tls_symbolic_operand_type (rtx addr)
 951 {
 952   enum tls_model tls_kind = TLS_MODEL_NONE;
 953   rtx sym, addend;
 954
 955   if (GET_CODE (addr) == CONST)
 956     {
 957       split_const (addr, &sym, &addend);
 958       if (GET_CODE (sym) == SYMBOL_REF)
 959         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
 960     }
 961   else if (GET_CODE (addr) == SYMBOL_REF)
 962     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
 963
 964   return tls_kind;
 965 }
 966
 967 /* We'll allow lo_sum's in addresses in our legitimate addresses
 968    so that combine would take care of combining addresses where
 969    necessary, but for generation purposes, we'll generate the address
 970    as :
 971    RTL                               Absolute
 972    tmp = hi (symbol_ref);            adrp  x1, foo
 973    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
 974                                      nop
 975
 976    PIC                               TLS
 977    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
 978    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
 979                                      bl   __tls_get_addr
 980                                      nop
 981
 982    Load TLS symbol, depending on TLS mechanism and TLS access model.
 983
 984    Global Dynamic - Traditional TLS:
 985    adrp tmp, :tlsgd:imm
 986    add  dest, tmp, #:tlsgd_lo12:imm
 987    bl   __tls_get_addr
 988
 989    Global Dynamic - TLS Descriptors:
 990    adrp dest, :tlsdesc:imm
 991    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
 992    add  dest, dest, #:tlsdesc_lo12:imm
 993    blr  tmp
 994    mrs  tp, tpidr_el0
 995    add  dest, dest, tp
 996
 997    Initial Exec:
 998    mrs  tp, tpidr_el0
 999    adrp tmp, :gottprel:imm
1000    ldr  dest, [tmp, #:gottprel_lo12:imm]
1001    add  dest, dest, tp
1002
1003    Local Exec:
1004    mrs  tp, tpidr_el0
1005    add  t0, tp, #:tprel_hi12:imm, lsl #12
1006    add  t0, t0, #:tprel_lo12_nc:imm
1007 */
1008
1009 static void
1010 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1011                                    enum aarch64_symbol_type type)
1012 {
1013   switch (type)
1014     {
1015     case SYMBOL_SMALL_ABSOLUTE:
1016       {
1017         /* In ILP32, the mode of dest can be either SImode or DImode.  */
1018         rtx tmp_reg = dest;
1019         machine_mode mode = GET_MODE (dest);
1020
1021         gcc_assert (mode == Pmode || mode == ptr_mode);
1022
1023         if (can_create_pseudo_p ())
1024           tmp_reg = gen_reg_rtx (mode);
1025
1026         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1027         emit_insn (gen_add_losym (dest, tmp_reg, imm));
1028         return;
1029       }
1030
1031     case SYMBOL_TINY_ABSOLUTE:
1032       emit_insn (gen_rtx_SET (dest, imm));
1033       return;
1034
1035     case SYMBOL_SMALL_GOT_28K:
1036       {
1037         machine_mode mode = GET_MODE (dest);
1038         rtx gp_rtx = pic_offset_table_rtx;
1039         rtx insn;
1040         rtx mem;
1041
1042         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1043            here before rtl expand.  Tree IVOPT will generate rtl pattern to
1044            decide rtx costs, in which case pic_offset_table_rtx is not
1045            initialized.  For that case no need to generate the first adrp
1046            instruction as the final cost for global variable access is
1047            one instruction.  */
1048         if (gp_rtx != NULL)
1049           {
1050             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1051                using the page base as GOT base, the first page may be wasted,
1052                in the worst scenario, there is only 28K space for GOT).
1053
1054                The generate instruction sequence for accessing global variable
1055                is:
1056
1057                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1058
1059                Only one instruction needed. But we must initialize
1060                pic_offset_table_rtx properly.  We generate initialize insn for
1061                every global access, and allow CSE to remove all redundant.
1062
1063                The final instruction sequences will look like the following
1064                for multiply global variables access.
1065
1066                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1067
1068                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1069                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1070                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1071                  ...  */
1072
1073             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1074             crtl->uses_pic_offset_table = 1;
1075             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1076
1077             if (mode != GET_MODE (gp_rtx))
1078               gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1079           }
1080
1081         if (mode == ptr_mode)
1082           {
1083             if (mode == DImode)
1084               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1085             else
1086               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1087
1088             mem = XVECEXP (SET_SRC (insn), 0, 0);
1089           }
1090         else
1091           {
1092             gcc_assert (mode == Pmode);
1093
1094             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1095             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1096           }
1097
1098         /* The operand is expected to be MEM.  Whenever the related insn
1099            pattern changed, above code which calculate mem should be
1100            updated.  */
1101         gcc_assert (GET_CODE (mem) == MEM);
1102         MEM_READONLY_P (mem) = 1;
1103         MEM_NOTRAP_P (mem) = 1;
1104         emit_insn (insn);
1105         return;
1106       }
1107
1108     case SYMBOL_SMALL_GOT_4G:
1109       {
1110         /* In ILP32, the mode of dest can be either SImode or DImode,
1111            while the got entry is always of SImode size.  The mode of
1112            dest depends on how dest is used: if dest is assigned to a
1113            pointer (e.g. in the memory), it has SImode; it may have
1114            DImode if dest is dereferenced to access the memeory.
1115            This is why we have to handle three different ldr_got_small
1116            patterns here (two patterns for ILP32).  */
1117
1118         rtx insn;
1119         rtx mem;
1120         rtx tmp_reg = dest;
1121         machine_mode mode = GET_MODE (dest);
1122
1123         if (can_create_pseudo_p ())
1124           tmp_reg = gen_reg_rtx (mode);
1125
1126         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1127         if (mode == ptr_mode)
1128           {
1129             if (mode == DImode)
1130               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1131             else
1132               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1133
1134             mem = XVECEXP (SET_SRC (insn), 0, 0);
1135           }
1136         else
1137           {
1138             gcc_assert (mode == Pmode);
1139
1140             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1141             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1142           }
1143
1144         gcc_assert (GET_CODE (mem) == MEM);
1145         MEM_READONLY_P (mem) = 1;
1146         MEM_NOTRAP_P (mem) = 1;
1147         emit_insn (insn);
1148         return;
1149       }
1150
1151     case SYMBOL_SMALL_TLSGD:
1152       {
1153         rtx_insn *insns;
1154         rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1155
1156         start_sequence ();
1157         aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1158         insns = get_insns ();
1159         end_sequence ();
1160
1161         RTL_CONST_CALL_P (insns) = 1;
1162         emit_libcall_block (insns, dest, result, imm);
1163         return;
1164       }
1165
1166     case SYMBOL_SMALL_TLSDESC:
1167       {
1168         machine_mode mode = GET_MODE (dest);
1169         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1170         rtx tp;
1171
1172         gcc_assert (mode == Pmode || mode == ptr_mode);
1173
1174         /* In ILP32, the got entry is always of SImode size.  Unlike
1175            small GOT, the dest is fixed at reg 0.  */
1176         if (TARGET_ILP32)
1177           emit_insn (gen_tlsdesc_small_si (imm));
1178         else
1179           emit_insn (gen_tlsdesc_small_di (imm));
1180         tp = aarch64_load_tp (NULL);
1181
1182         if (mode != Pmode)
1183           tp = gen_lowpart (mode, tp);
1184
1185         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1186         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1187         return;
1188       }
1189
1190     case SYMBOL_SMALL_TLSIE:
1191       {
1192         /* In ILP32, the mode of dest can be either SImode or DImode,
1193            while the got entry is always of SImode size.  The mode of
1194            dest depends on how dest is used: if dest is assigned to a
1195            pointer (e.g. in the memory), it has SImode; it may have
1196            DImode if dest is dereferenced to access the memeory.
1197            This is why we have to handle three different tlsie_small
1198            patterns here (two patterns for ILP32).  */
1199         machine_mode mode = GET_MODE (dest);
1200         rtx tmp_reg = gen_reg_rtx (mode);
1201         rtx tp = aarch64_load_tp (NULL);
1202
1203         if (mode == ptr_mode)
1204           {
1205             if (mode == DImode)
1206               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1207             else
1208               {
1209                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1210                 tp = gen_lowpart (mode, tp);
1211               }
1212           }
1213         else
1214           {
1215             gcc_assert (mode == Pmode);
1216             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1217           }
1218
1219         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1220         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1221         return;
1222       }
1223
1224     case SYMBOL_TLSLE12:
1225     case SYMBOL_TLSLE24:
1226     case SYMBOL_TLSLE32:
1227     case SYMBOL_TLSLE48:
1228       {
1229         machine_mode mode = GET_MODE (dest);
1230         rtx tp = aarch64_load_tp (NULL);
1231
1232         if (mode != Pmode)
1233           tp = gen_lowpart (mode, tp);
1234
1235         switch (type)
1236           {
1237           case SYMBOL_TLSLE12:
1238             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1239                         (dest, tp, imm));
1240             break;
1241           case SYMBOL_TLSLE24:
1242             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1243                         (dest, tp, imm));
1244           break;
1245           case SYMBOL_TLSLE32:
1246             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1247                         (dest, imm));
1248             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1249                         (dest, dest, tp));
1250           break;
1251           case SYMBOL_TLSLE48:
1252             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1253                         (dest, imm));
1254             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1255                         (dest, dest, tp));
1256             break;
1257           default:
1258             gcc_unreachable ();
1259           }
1260
1261         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1262         return;
1263       }
1264
1265     case SYMBOL_TINY_GOT:
1266       emit_insn (gen_ldr_got_tiny (dest, imm));
1267       return;
1268
1269     case SYMBOL_TINY_TLSIE:
1270       {
1271         machine_mode mode = GET_MODE (dest);
1272         rtx tp = aarch64_load_tp (NULL);
1273
1274         if (mode == ptr_mode)
1275           {
1276             if (mode == DImode)
1277               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1278             else
1279               {
1280                 tp = gen_lowpart (mode, tp);
1281                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1282               }
1283           }
1284         else
1285           {
1286             gcc_assert (mode == Pmode);
1287             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1288           }
1289
1290         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1291         return;
1292       }
1293
1294     default:
1295       gcc_unreachable ();
1296     }
1297 }
1298
1299 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1300    handle all moves if !can_create_pseudo_p ().  The distinction is
1301    important because, unlike emit_move_insn, the move expanders know
1302    how to force Pmode objects into the constant pool even when the
1303    constant pool address is not itself legitimate.  */
1304 static rtx
1305 aarch64_emit_move (rtx dest, rtx src)
1306 {
1307   return (can_create_pseudo_p ()
1308           ? emit_move_insn (dest, src)
1309           : emit_move_insn_1 (dest, src));
1310 }
1311
1312 /* Split a 128-bit move operation into two 64-bit move operations,
1313    taking care to handle partial overlap of register to register
1314    copies.  Special cases are needed when moving between GP regs and
1315    FP regs.  SRC can be a register, constant or memory; DST a register
1316    or memory.  If either operand is memory it must not have any side
1317    effects.  */
1318 void
1319 aarch64_split_128bit_move (rtx dst, rtx src)
1320 {
1321   rtx dst_lo, dst_hi;
1322   rtx src_lo, src_hi;
1323
1324   machine_mode mode = GET_MODE (dst);
1325
1326   gcc_assert (mode == TImode || mode == TFmode);
1327   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1328   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1329
1330   if (REG_P (dst) && REG_P (src))
1331     {
1332       int src_regno = REGNO (src);
1333       int dst_regno = REGNO (dst);
1334
1335       /* Handle FP <-> GP regs.  */
1336       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1337         {
1338           src_lo = gen_lowpart (word_mode, src);
1339           src_hi = gen_highpart (word_mode, src);
1340
1341           if (mode == TImode)
1342             {
1343               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1344               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1345             }
1346           else
1347             {
1348               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1349               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1350             }
1351           return;
1352         }
1353       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1354         {
1355           dst_lo = gen_lowpart (word_mode, dst);
1356           dst_hi = gen_highpart (word_mode, dst);
1357
1358           if (mode == TImode)
1359             {
1360               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1361               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1362             }
1363           else
1364             {
1365               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1366               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1367             }
1368           return;
1369         }
1370     }
1371
1372   dst_lo = gen_lowpart (word_mode, dst);
1373   dst_hi = gen_highpart (word_mode, dst);
1374   src_lo = gen_lowpart (word_mode, src);
1375   src_hi = gen_highpart_mode (word_mode, mode, src);
1376
1377   /* At most one pairing may overlap.  */
1378   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1379     {
1380       aarch64_emit_move (dst_hi, src_hi);
1381       aarch64_emit_move (dst_lo, src_lo);
1382     }
1383   else
1384     {
1385       aarch64_emit_move (dst_lo, src_lo);
1386       aarch64_emit_move (dst_hi, src_hi);
1387     }
1388 }
1389
1390 bool
1391 aarch64_split_128bit_move_p (rtx dst, rtx src)
1392 {
1393   return (! REG_P (src)
1394           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1395 }
1396
1397 /* Split a complex SIMD combine.  */
1398
1399 void
1400 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1401 {
1402   machine_mode src_mode = GET_MODE (src1);
1403   machine_mode dst_mode = GET_MODE (dst);
1404
1405   gcc_assert (VECTOR_MODE_P (dst_mode));
1406
1407   if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1408     {
1409       rtx (*gen) (rtx, rtx, rtx);
1410
1411       switch (src_mode)
1412         {
1413         case V8QImode:
1414           gen = gen_aarch64_simd_combinev8qi;
1415           break;
1416         case V4HImode:
1417           gen = gen_aarch64_simd_combinev4hi;
1418           break;
1419         case V2SImode:
1420           gen = gen_aarch64_simd_combinev2si;
1421           break;
1422         case V4HFmode:
1423           gen = gen_aarch64_simd_combinev4hf;
1424           break;
1425         case V2SFmode:
1426           gen = gen_aarch64_simd_combinev2sf;
1427           break;
1428         case DImode:
1429           gen = gen_aarch64_simd_combinedi;
1430           break;
1431         case DFmode:
1432           gen = gen_aarch64_simd_combinedf;
1433           break;
1434         default:
1435           gcc_unreachable ();
1436         }
1437
1438       emit_insn (gen (dst, src1, src2));
1439       return;
1440     }
1441 }
1442
1443 /* Split a complex SIMD move.  */
1444
1445 void
1446 aarch64_split_simd_move (rtx dst, rtx src)
1447 {
1448   machine_mode src_mode = GET_MODE (src);
1449   machine_mode dst_mode = GET_MODE (dst);
1450
1451   gcc_assert (VECTOR_MODE_P (dst_mode));
1452
1453   if (REG_P (dst) && REG_P (src))
1454     {
1455       rtx (*gen) (rtx, rtx);
1456
1457       gcc_assert (VECTOR_MODE_P (src_mode));
1458
1459       switch (src_mode)
1460         {
1461         case V16QImode:
1462           gen = gen_aarch64_split_simd_movv16qi;
1463           break;
1464         case V8HImode:
1465           gen = gen_aarch64_split_simd_movv8hi;
1466           break;
1467         case V4SImode:
1468           gen = gen_aarch64_split_simd_movv4si;
1469           break;
1470         case V2DImode:
1471           gen = gen_aarch64_split_simd_movv2di;
1472           break;
1473         case V8HFmode:
1474           gen = gen_aarch64_split_simd_movv8hf;
1475           break;
1476         case V4SFmode:
1477           gen = gen_aarch64_split_simd_movv4sf;
1478           break;
1479         case V2DFmode:
1480           gen = gen_aarch64_split_simd_movv2df;
1481           break;
1482         default:
1483           gcc_unreachable ();
1484         }
1485
1486       emit_insn (gen (dst, src));
1487       return;
1488     }
1489 }
1490
1491 static rtx
1492 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1493 {
1494   if (can_create_pseudo_p ())
1495     return force_reg (mode, value);
1496   else
1497     {
1498       x = aarch64_emit_move (x, value);
1499       return x;
1500     }
1501 }
1502
1503
1504 static rtx
1505 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1506 {
1507   if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1508     {
1509       rtx high;
1510       /* Load the full offset into a register.  This
1511          might be improvable in the future.  */
1512       high = GEN_INT (offset);
1513       offset = 0;
1514       high = aarch64_force_temporary (mode, temp, high);
1515       reg = aarch64_force_temporary (mode, temp,
1516                                      gen_rtx_PLUS (mode, high, reg));
1517     }
1518   return plus_constant (mode, reg, offset);
1519 }
1520
1521 static int
1522 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1523                                 machine_mode mode)
1524 {
1525   int i;
1526   unsigned HOST_WIDE_INT val, val2, mask;
1527   int one_match, zero_match;
1528   int num_insns;
1529
1530   val = INTVAL (imm);
1531
1532   if (aarch64_move_imm (val, mode))
1533     {
1534       if (generate)
1535         emit_insn (gen_rtx_SET (dest, imm));
1536       return 1;
1537     }
1538
1539   if ((val >> 32) == 0 || mode == SImode)
1540     {
1541       if (generate)
1542         {
1543           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1544           if (mode == SImode)
1545             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1546                                        GEN_INT ((val >> 16) & 0xffff)));
1547           else
1548             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1549                                        GEN_INT ((val >> 16) & 0xffff)));
1550         }
1551       return 2;
1552     }
1553
1554   /* Remaining cases are all for DImode.  */
1555
1556   mask = 0xffff;
1557   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1558     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1559   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1560     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1561
1562   if (zero_match != 2 && one_match != 2)
1563     {
1564       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1565          For a 64-bit bitmask try whether changing 16 bits to all ones or
1566          zeroes creates a valid bitmask.  To check any repeated bitmask,
1567          try using 16 bits from the other 32-bit half of val.  */
1568
1569       for (i = 0; i < 64; i += 16, mask <<= 16)
1570         {
1571           val2 = val & ~mask;
1572           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1573             break;
1574           val2 = val | mask;
1575           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1576             break;
1577           val2 = val2 & ~mask;
1578           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1579           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1580             break;
1581         }
1582       if (i != 64)
1583         {
1584           if (generate)
1585             {
1586               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1587               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1588                                          GEN_INT ((val >> i) & 0xffff)));
1589             }
1590         }
1591     }
1592
1593   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1594      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
1595      otherwise skip zero bits.  */
1596
1597   num_insns = 1;
1598   mask = 0xffff;
1599   val2 = one_match > zero_match ? ~val : val;
1600   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1601
1602   if (generate)
1603     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1604                                            ? (val | ~(mask << i))
1605                                            : (val & (mask << i)))));
1606   for (i += 16; i < 64; i += 16)
1607     {
1608       if ((val2 & (mask << i)) == 0)
1609         continue;
1610       if (generate)
1611         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1612                                    GEN_INT ((val >> i) & 0xffff)));
1613       num_insns ++;
1614     }
1615
1616   return num_insns;
1617 }
1618
1619
1620 void
1621 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1622 {
1623   machine_mode mode = GET_MODE (dest);
1624
1625   gcc_assert (mode == SImode || mode == DImode);
1626
1627   /* Check on what type of symbol it is.  */
1628   if (GET_CODE (imm) == SYMBOL_REF
1629       || GET_CODE (imm) == LABEL_REF
1630       || GET_CODE (imm) == CONST)
1631     {
1632       rtx mem, base, offset;
1633       enum aarch64_symbol_type sty;
1634
1635       /* If we have (const (plus symbol offset)), separate out the offset
1636          before we start classifying the symbol.  */
1637       split_const (imm, &base, &offset);
1638
1639       sty = aarch64_classify_symbol (base, offset);
1640       switch (sty)
1641         {
1642         case SYMBOL_FORCE_TO_MEM:
1643           if (offset != const0_rtx
1644               && targetm.cannot_force_const_mem (mode, imm))
1645             {
1646               gcc_assert (can_create_pseudo_p ());
1647               base = aarch64_force_temporary (mode, dest, base);
1648               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1649               aarch64_emit_move (dest, base);
1650               return;
1651             }
1652
1653           mem = force_const_mem (ptr_mode, imm);
1654           gcc_assert (mem);
1655
1656           /* If we aren't generating PC relative literals, then
1657              we need to expand the literal pool access carefully.
1658              This is something that needs to be done in a number
1659              of places, so could well live as a separate function.  */
1660           if (aarch64_nopcrelative_literal_loads)
1661             {
1662               gcc_assert (can_create_pseudo_p ());
1663               base = gen_reg_rtx (ptr_mode);
1664               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1665               mem = gen_rtx_MEM (ptr_mode, base);
1666             }
1667
1668           if (mode != ptr_mode)
1669             mem = gen_rtx_ZERO_EXTEND (mode, mem);
1670
1671           emit_insn (gen_rtx_SET (dest, mem));
1672
1673           return;
1674
1675         case SYMBOL_SMALL_TLSGD:
1676         case SYMBOL_SMALL_TLSDESC:
1677         case SYMBOL_SMALL_TLSIE:
1678         case SYMBOL_SMALL_GOT_28K:
1679         case SYMBOL_SMALL_GOT_4G:
1680         case SYMBOL_TINY_GOT:
1681         case SYMBOL_TINY_TLSIE:
1682           if (offset != const0_rtx)
1683             {
1684               gcc_assert(can_create_pseudo_p ());
1685               base = aarch64_force_temporary (mode, dest, base);
1686               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1687               aarch64_emit_move (dest, base);
1688               return;
1689             }
1690           /* FALLTHRU */
1691
1692         case SYMBOL_SMALL_ABSOLUTE:
1693         case SYMBOL_TINY_ABSOLUTE:
1694         case SYMBOL_TLSLE12:
1695         case SYMBOL_TLSLE24:
1696         case SYMBOL_TLSLE32:
1697         case SYMBOL_TLSLE48:
1698           aarch64_load_symref_appropriately (dest, imm, sty);
1699           return;
1700
1701         default:
1702           gcc_unreachable ();
1703         }
1704     }
1705
1706   if (!CONST_INT_P (imm))
1707     {
1708       if (GET_CODE (imm) == HIGH)
1709         emit_insn (gen_rtx_SET (dest, imm));
1710       else
1711         {
1712           rtx mem = force_const_mem (mode, imm);
1713           gcc_assert (mem);
1714           emit_insn (gen_rtx_SET (dest, mem));
1715         }
1716
1717       return;
1718     }
1719
1720   aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1721 }
1722
1723 static bool
1724 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1725                                  tree exp ATTRIBUTE_UNUSED)
1726 {
1727   /* Currently, always true.  */
1728   return true;
1729 }
1730
1731 /* Implement TARGET_PASS_BY_REFERENCE.  */
1732
1733 static bool
1734 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1735                            machine_mode mode,
1736                            const_tree type,
1737                            bool named ATTRIBUTE_UNUSED)
1738 {
1739   HOST_WIDE_INT size;
1740   machine_mode dummymode;
1741   int nregs;
1742
1743   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
1744   size = (mode == BLKmode && type)
1745     ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1746
1747   /* Aggregates are passed by reference based on their size.  */
1748   if (type && AGGREGATE_TYPE_P (type))
1749     {
1750       size = int_size_in_bytes (type);
1751     }
1752
1753   /* Variable sized arguments are always returned by reference.  */
1754   if (size < 0)
1755     return true;
1756
1757   /* Can this be a candidate to be passed in fp/simd register(s)?  */
1758   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1759                                                &dummymode, &nregs,
1760                                                NULL))
1761     return false;
1762
1763   /* Arguments which are variable sized or larger than 2 registers are
1764      passed by reference unless they are a homogenous floating point
1765      aggregate.  */
1766   return size > 2 * UNITS_PER_WORD;
1767 }
1768
1769 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
1770 static bool
1771 aarch64_return_in_msb (const_tree valtype)
1772 {
1773   machine_mode dummy_mode;
1774   int dummy_int;
1775
1776   /* Never happens in little-endian mode.  */
1777   if (!BYTES_BIG_ENDIAN)
1778     return false;
1779
1780   /* Only composite types smaller than or equal to 16 bytes can
1781      be potentially returned in registers.  */
1782   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1783       || int_size_in_bytes (valtype) <= 0
1784       || int_size_in_bytes (valtype) > 16)
1785     return false;
1786
1787   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1788      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1789      is always passed/returned in the least significant bits of fp/simd
1790      register(s).  */
1791   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1792                                                &dummy_mode, &dummy_int, NULL))
1793     return false;
1794
1795   return true;
1796 }
1797
1798 /* Implement TARGET_FUNCTION_VALUE.
1799    Define how to find the value returned by a function.  */
1800
1801 static rtx
1802 aarch64_function_value (const_tree type, const_tree func,
1803                         bool outgoing ATTRIBUTE_UNUSED)
1804 {
1805   machine_mode mode;
1806   int unsignedp;
1807   int count;
1808   machine_mode ag_mode;
1809
1810   mode = TYPE_MODE (type);
1811   if (INTEGRAL_TYPE_P (type))
1812     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1813
1814   if (aarch64_return_in_msb (type))
1815     {
1816       HOST_WIDE_INT size = int_size_in_bytes (type);
1817
1818       if (size % UNITS_PER_WORD != 0)
1819         {
1820           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1821           mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1822         }
1823     }
1824
1825   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1826                                                &ag_mode, &count, NULL))
1827     {
1828       if (!aarch64_composite_type_p (type, mode))
1829         {
1830           gcc_assert (count == 1 && mode == ag_mode);
1831           return gen_rtx_REG (mode, V0_REGNUM);
1832         }
1833       else
1834         {
1835           int i;
1836           rtx par;
1837
1838           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1839           for (i = 0; i < count; i++)
1840             {
1841               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1842               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1843                                        GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1844               XVECEXP (par, 0, i) = tmp;
1845             }
1846           return par;
1847         }
1848     }
1849   else
1850     return gen_rtx_REG (mode, R0_REGNUM);
1851 }
1852
1853 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1854    Return true if REGNO is the number of a hard register in which the values
1855    of called function may come back.  */
1856
1857 static bool
1858 aarch64_function_value_regno_p (const unsigned int regno)
1859 {
1860   /* Maximum of 16 bytes can be returned in the general registers.  Examples
1861      of 16-byte return values are: 128-bit integers and 16-byte small
1862      structures (excluding homogeneous floating-point aggregates).  */
1863   if (regno == R0_REGNUM || regno == R1_REGNUM)
1864     return true;
1865
1866   /* Up to four fp/simd registers can return a function value, e.g. a
1867      homogeneous floating-point aggregate having four members.  */
1868   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1869     return TARGET_FLOAT;
1870
1871   return false;
1872 }
1873
1874 /* Implement TARGET_RETURN_IN_MEMORY.
1875
1876    If the type T of the result of a function is such that
1877      void func (T arg)
1878    would require that arg be passed as a value in a register (or set of
1879    registers) according to the parameter passing rules, then the result
1880    is returned in the same registers as would be used for such an
1881    argument.  */
1882
1883 static bool
1884 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1885 {
1886   HOST_WIDE_INT size;
1887   machine_mode ag_mode;
1888   int count;
1889
1890   if (!AGGREGATE_TYPE_P (type)
1891       && TREE_CODE (type) != COMPLEX_TYPE
1892       && TREE_CODE (type) != VECTOR_TYPE)
1893     /* Simple scalar types always returned in registers.  */
1894     return false;
1895
1896   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1897                                                type,
1898                                                &ag_mode,
1899                                                &count,
1900                                                NULL))
1901     return false;
1902
1903   /* Types larger than 2 registers returned in memory.  */
1904   size = int_size_in_bytes (type);
1905   return (size < 0 || size > 2 * UNITS_PER_WORD);
1906 }
1907
1908 static bool
1909 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1910                                const_tree type, int *nregs)
1911 {
1912   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1913   return aarch64_vfp_is_call_or_return_candidate (mode,
1914                                                   type,
1915                                                   &pcum->aapcs_vfp_rmode,
1916                                                   nregs,
1917                                                   NULL);
1918 }
1919
1920 /* Given MODE and TYPE of a function argument, return the alignment in
1921    bits.  The idea is to suppress any stronger alignment requested by
1922    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1923    This is a helper function for local use only.  */
1924
1925 static unsigned int
1926 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1927 {
1928   unsigned int alignment;
1929
1930   if (type)
1931     {
1932       if (!integer_zerop (TYPE_SIZE (type)))
1933         {
1934           if (TYPE_MODE (type) == mode)
1935             alignment = TYPE_ALIGN (type);
1936           else
1937             alignment = GET_MODE_ALIGNMENT (mode);
1938         }
1939       else
1940         alignment = 0;
1941     }
1942   else
1943     alignment = GET_MODE_ALIGNMENT (mode);
1944
1945   return alignment;
1946 }
1947
1948 /* Layout a function argument according to the AAPCS64 rules.  The rule
1949    numbers refer to the rule numbers in the AAPCS64.  */
1950
1951 static void
1952 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1953                     const_tree type,
1954                     bool named ATTRIBUTE_UNUSED)
1955 {
1956   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1957   int ncrn, nvrn, nregs;
1958   bool allocate_ncrn, allocate_nvrn;
1959   HOST_WIDE_INT size;
1960
1961   /* We need to do this once per argument.  */
1962   if (pcum->aapcs_arg_processed)
1963     return;
1964
1965   pcum->aapcs_arg_processed = true;
1966
1967   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
1968   size
1969     = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1970                 UNITS_PER_WORD);
1971
1972   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1973   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1974                                                  mode,
1975                                                  type,
1976                                                  &nregs);
1977
1978   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1979      The following code thus handles passing by SIMD/FP registers first.  */
1980
1981   nvrn = pcum->aapcs_nvrn;
1982
1983   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
1984      and homogenous short-vector aggregates (HVA).  */
1985   if (allocate_nvrn)
1986     {
1987       if (!TARGET_FLOAT)
1988         aarch64_err_no_fpadvsimd (mode, "argument");
1989
1990       if (nvrn + nregs <= NUM_FP_ARG_REGS)
1991         {
1992           pcum->aapcs_nextnvrn = nvrn + nregs;
1993           if (!aarch64_composite_type_p (type, mode))
1994             {
1995               gcc_assert (nregs == 1);
1996               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1997             }
1998           else
1999             {
2000               rtx par;
2001               int i;
2002               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2003               for (i = 0; i < nregs; i++)
2004                 {
2005                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2006                                          V0_REGNUM + nvrn + i);
2007                   tmp = gen_rtx_EXPR_LIST
2008                     (VOIDmode, tmp,
2009                      GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2010                   XVECEXP (par, 0, i) = tmp;
2011                 }
2012               pcum->aapcs_reg = par;
2013             }
2014           return;
2015         }
2016       else
2017         {
2018           /* C.3 NSRN is set to 8.  */
2019           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2020           goto on_stack;
2021         }
2022     }
2023
2024   ncrn = pcum->aapcs_ncrn;
2025   nregs = size / UNITS_PER_WORD;
2026
2027   /* C6 - C9.  though the sign and zero extension semantics are
2028      handled elsewhere.  This is the case where the argument fits
2029      entirely general registers.  */
2030   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2031     {
2032       unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2033
2034       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2035
2036       /* C.8 if the argument has an alignment of 16 then the NGRN is
2037          rounded up to the next even number.  */
2038       if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2039         {
2040           ++ncrn;
2041           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2042         }
2043       /* NREGS can be 0 when e.g. an empty structure is to be passed.
2044          A reg is still generated for it, but the caller should be smart
2045          enough not to use it.  */
2046       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2047         {
2048           pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2049         }
2050       else
2051         {
2052           rtx par;
2053           int i;
2054
2055           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2056           for (i = 0; i < nregs; i++)
2057             {
2058               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2059               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2060                                        GEN_INT (i * UNITS_PER_WORD));
2061               XVECEXP (par, 0, i) = tmp;
2062             }
2063           pcum->aapcs_reg = par;
2064         }
2065
2066       pcum->aapcs_nextncrn = ncrn + nregs;
2067       return;
2068     }
2069
2070   /* C.11  */
2071   pcum->aapcs_nextncrn = NUM_ARG_REGS;
2072
2073   /* The argument is passed on stack; record the needed number of words for
2074      this argument and align the total size if necessary.  */
2075 on_stack:
2076   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2077   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2078     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2079                                        16 / UNITS_PER_WORD);
2080   return;
2081 }
2082
2083 /* Implement TARGET_FUNCTION_ARG.  */
2084
2085 static rtx
2086 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2087                       const_tree type, bool named)
2088 {
2089   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2090   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2091
2092   if (mode == VOIDmode)
2093     return NULL_RTX;
2094
2095   aarch64_layout_arg (pcum_v, mode, type, named);
2096   return pcum->aapcs_reg;
2097 }
2098
2099 void
2100 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2101                            const_tree fntype ATTRIBUTE_UNUSED,
2102                            rtx libname ATTRIBUTE_UNUSED,
2103                            const_tree fndecl ATTRIBUTE_UNUSED,
2104                            unsigned n_named ATTRIBUTE_UNUSED)
2105 {
2106   pcum->aapcs_ncrn = 0;
2107   pcum->aapcs_nvrn = 0;
2108   pcum->aapcs_nextncrn = 0;
2109   pcum->aapcs_nextnvrn = 0;
2110   pcum->pcs_variant = ARM_PCS_AAPCS64;
2111   pcum->aapcs_reg = NULL_RTX;
2112   pcum->aapcs_arg_processed = false;
2113   pcum->aapcs_stack_words = 0;
2114   pcum->aapcs_stack_size = 0;
2115
2116   if (!TARGET_FLOAT
2117       && fndecl && TREE_PUBLIC (fndecl)
2118       && fntype && fntype != error_mark_node)
2119     {
2120       const_tree type = TREE_TYPE (fntype);
2121       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
2122       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
2123       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2124                                                    &mode, &nregs, NULL))
2125         aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2126     }
2127   return;
2128 }
2129
2130 static void
2131 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2132                               machine_mode mode,
2133                               const_tree type,
2134                               bool named)
2135 {
2136   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2137   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2138     {
2139       aarch64_layout_arg (pcum_v, mode, type, named);
2140       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2141                   != (pcum->aapcs_stack_words != 0));
2142       pcum->aapcs_arg_processed = false;
2143       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2144       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2145       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2146       pcum->aapcs_stack_words = 0;
2147       pcum->aapcs_reg = NULL_RTX;
2148     }
2149 }
2150
2151 bool
2152 aarch64_function_arg_regno_p (unsigned regno)
2153 {
2154   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2155           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2156 }
2157
2158 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
2159    PARM_BOUNDARY bits of alignment, but will be given anything up
2160    to STACK_BOUNDARY bits if the type requires it.  This makes sure
2161    that both before and after the layout of each argument, the Next
2162    Stacked Argument Address (NSAA) will have a minimum alignment of
2163    8 bytes.  */
2164
2165 static unsigned int
2166 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2167 {
2168   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2169
2170   if (alignment < PARM_BOUNDARY)
2171     alignment = PARM_BOUNDARY;
2172   if (alignment > STACK_BOUNDARY)
2173     alignment = STACK_BOUNDARY;
2174   return alignment;
2175 }
2176
2177 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2178
2179    Return true if an argument passed on the stack should be padded upwards,
2180    i.e. if the least-significant byte of the stack slot has useful data.
2181
2182    Small aggregate types are placed in the lowest memory address.
2183
2184    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
2185
2186 bool
2187 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2188 {
2189   /* On little-endian targets, the least significant byte of every stack
2190      argument is passed at the lowest byte address of the stack slot.  */
2191   if (!BYTES_BIG_ENDIAN)
2192     return true;
2193
2194   /* Otherwise, integral, floating-point and pointer types are padded downward:
2195      the least significant byte of a stack argument is passed at the highest
2196      byte address of the stack slot.  */
2197   if (type
2198       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2199          || POINTER_TYPE_P (type))
2200       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2201     return false;
2202
2203   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
2204   return true;
2205 }
2206
2207 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2208
2209    It specifies padding for the last (may also be the only)
2210    element of a block move between registers and memory.  If
2211    assuming the block is in the memory, padding upward means that
2212    the last element is padded after its highest significant byte,
2213    while in downward padding, the last element is padded at the
2214    its least significant byte side.
2215
2216    Small aggregates and small complex types are always padded
2217    upwards.
2218
2219    We don't need to worry about homogeneous floating-point or
2220    short-vector aggregates; their move is not affected by the
2221    padding direction determined here.  Regardless of endianness,
2222    each element of such an aggregate is put in the least
2223    significant bits of a fp/simd register.
2224
2225    Return !BYTES_BIG_ENDIAN if the least significant byte of the
2226    register has useful data, and return the opposite if the most
2227    significant byte does.  */
2228
2229 bool
2230 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2231                      bool first ATTRIBUTE_UNUSED)
2232 {
2233
2234   /* Small composite types are always padded upward.  */
2235   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2236     {
2237       HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2238                             : GET_MODE_SIZE (mode));
2239       if (size < 2 * UNITS_PER_WORD)
2240         return true;
2241     }
2242
2243   /* Otherwise, use the default padding.  */
2244   return !BYTES_BIG_ENDIAN;
2245 }
2246
2247 static machine_mode
2248 aarch64_libgcc_cmp_return_mode (void)
2249 {
2250   return SImode;
2251 }
2252
2253 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2254
2255 /* We use the 12-bit shifted immediate arithmetic instructions so values
2256    must be multiple of (1 << 12), i.e. 4096.  */
2257 #define ARITH_FACTOR 4096
2258
2259 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2260 #error Cannot use simple address calculation for stack probing
2261 #endif
2262
2263 /* The pair of scratch registers used for stack probing.  */
2264 #define PROBE_STACK_FIRST_REG  9
2265 #define PROBE_STACK_SECOND_REG 10
2266
2267 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2268    inclusive.  These are offsets from the current stack pointer.  */
2269
2270 static void
2271 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2272 {
2273   rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2274
2275   /* See the same assertion on PROBE_INTERVAL above.  */
2276   gcc_assert ((first % ARITH_FACTOR) == 0);
2277
2278   /* See if we have a constant small number of probes to generate.  If so,
2279      that's the easy case.  */
2280   if (size <= PROBE_INTERVAL)
2281     {
2282       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2283
2284       emit_set_insn (reg1,
2285                      plus_constant (ptr_mode,
2286                                     stack_pointer_rtx, -(first + base)));
2287       emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2288     }
2289
2290   /* The run-time loop is made up of 8 insns in the generic case while the
2291      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
2292   else if (size <= 4 * PROBE_INTERVAL)
2293     {
2294       HOST_WIDE_INT i, rem;
2295
2296       emit_set_insn (reg1,
2297                      plus_constant (ptr_mode,
2298                                     stack_pointer_rtx,
2299                                     -(first + PROBE_INTERVAL)));
2300       emit_stack_probe (reg1);
2301
2302       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2303          it exceeds SIZE.  If only two probes are needed, this will not
2304          generate any code.  Then probe at FIRST + SIZE.  */
2305       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2306         {
2307           emit_set_insn (reg1,
2308                          plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2309           emit_stack_probe (reg1);
2310         }
2311
2312       rem = size - (i - PROBE_INTERVAL);
2313       if (rem > 256)
2314         {
2315           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2316
2317           emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2318           emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2319         }
2320       else
2321         emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2322     }
2323
2324   /* Otherwise, do the same as above, but in a loop.  Note that we must be
2325      extra careful with variables wrapping around because we might be at
2326      the very top (or the very bottom) of the address space and we have
2327      to be able to handle this case properly; in particular, we use an
2328      equality test for the loop condition.  */
2329   else
2330     {
2331       rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2332
2333       /* Step 1: round SIZE to the previous multiple of the interval.  */
2334
2335       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2336
2337
2338       /* Step 2: compute initial and final value of the loop counter.  */
2339
2340       /* TEST_ADDR = SP + FIRST.  */
2341       emit_set_insn (reg1,
2342                      plus_constant (ptr_mode, stack_pointer_rtx, -first));
2343
2344       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
2345       emit_set_insn (reg2,
2346                      plus_constant (ptr_mode, stack_pointer_rtx,
2347                                     -(first + rounded_size)));
2348
2349
2350       /* Step 3: the loop
2351
2352          do
2353            {
2354              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2355              probe at TEST_ADDR
2356            }
2357          while (TEST_ADDR != LAST_ADDR)
2358
2359          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2360          until it is equal to ROUNDED_SIZE.  */
2361
2362       if (ptr_mode == DImode)
2363         emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2364       else
2365         emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2366
2367
2368       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2369          that SIZE is equal to ROUNDED_SIZE.  */
2370
2371       if (size != rounded_size)
2372         {
2373           HOST_WIDE_INT rem = size - rounded_size;
2374
2375           if (rem > 256)
2376             {
2377               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2378
2379               emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2380               emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2381             }
2382           else
2383             emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2384         }
2385     }
2386
2387   /* Make sure nothing is scheduled before we are done.  */
2388   emit_insn (gen_blockage ());
2389 }
2390
2391 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
2392    absolute addresses.  */
2393
2394 const char *
2395 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2396 {
2397   static int labelno = 0;
2398   char loop_lab[32];
2399   rtx xops[2];
2400
2401   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2402
2403   /* Loop.  */
2404   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2405
2406   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
2407   xops[0] = reg1;
2408   xops[1] = GEN_INT (PROBE_INTERVAL);
2409   output_asm_insn ("sub\t%0, %0, %1", xops);
2410
2411   /* Probe at TEST_ADDR.  */
2412   output_asm_insn ("str\txzr, [%0]", xops);
2413
2414   /* Test if TEST_ADDR == LAST_ADDR.  */
2415   xops[1] = reg2;
2416   output_asm_insn ("cmp\t%0, %1", xops);
2417
2418   /* Branch.  */
2419   fputs ("\tb.ne\t", asm_out_file);
2420   assemble_name_raw (asm_out_file, loop_lab);
2421   fputc ('\n', asm_out_file);
2422
2423   return "";
2424 }
2425
2426 static bool
2427 aarch64_frame_pointer_required (void)
2428 {
2429   /* In aarch64_override_options_after_change
2430      flag_omit_leaf_frame_pointer turns off the frame pointer by
2431      default.  Turn it back on now if we've not got a leaf
2432      function.  */
2433   if (flag_omit_leaf_frame_pointer
2434       && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2435     return true;
2436
2437   return false;
2438 }
2439
2440 /* Mark the registers that need to be saved by the callee and calculate
2441    the size of the callee-saved registers area and frame record (both FP
2442    and LR may be omitted).  */
2443 static void
2444 aarch64_layout_frame (void)
2445 {
2446   HOST_WIDE_INT offset = 0;
2447   int regno;
2448
2449   if (reload_completed && cfun->machine->frame.laid_out)
2450     return;
2451
2452 #define SLOT_NOT_REQUIRED (-2)
2453 #define SLOT_REQUIRED     (-1)
2454
2455   cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2456   cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2457
2458   /* First mark all the registers that really need to be saved...  */
2459   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2460     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2461
2462   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2463     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2464
2465   /* ... that includes the eh data registers (if needed)...  */
2466   if (crtl->calls_eh_return)
2467     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2468       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2469         = SLOT_REQUIRED;
2470
2471   /* ... and any callee saved register that dataflow says is live.  */
2472   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2473     if (df_regs_ever_live_p (regno)
2474         && (regno == R30_REGNUM
2475             || !call_used_regs[regno]))
2476       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2477
2478   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2479     if (df_regs_ever_live_p (regno)
2480         && !call_used_regs[regno])
2481       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2482
2483   if (frame_pointer_needed)
2484     {
2485       /* FP and LR are placed in the linkage record.  */
2486       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2487       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2488       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2489       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2490       cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2491       offset += 2 * UNITS_PER_WORD;
2492     }
2493
2494   /* Now assign stack slots for them.  */
2495   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2496     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2497       {
2498         cfun->machine->frame.reg_offset[regno] = offset;
2499         if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2500           cfun->machine->frame.wb_candidate1 = regno;
2501         else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2502           cfun->machine->frame.wb_candidate2 = regno;
2503         offset += UNITS_PER_WORD;
2504       }
2505
2506   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2507     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2508       {
2509         cfun->machine->frame.reg_offset[regno] = offset;
2510         if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2511           cfun->machine->frame.wb_candidate1 = regno;
2512         else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2513                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2514           cfun->machine->frame.wb_candidate2 = regno;
2515         offset += UNITS_PER_WORD;
2516       }
2517
2518   cfun->machine->frame.padding0 =
2519     (ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2520   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2521
2522   cfun->machine->frame.saved_regs_size = offset;
2523
2524   cfun->machine->frame.hard_fp_offset
2525     = ROUND_UP (cfun->machine->frame.saved_varargs_size
2526                 + get_frame_size ()
2527                 + cfun->machine->frame.saved_regs_size,
2528                 STACK_BOUNDARY / BITS_PER_UNIT);
2529
2530   cfun->machine->frame.frame_size
2531     = ROUND_UP (cfun->machine->frame.hard_fp_offset
2532                 + crtl->outgoing_args_size,
2533                 STACK_BOUNDARY / BITS_PER_UNIT);
2534
2535   cfun->machine->frame.laid_out = true;
2536 }
2537
2538 static bool
2539 aarch64_register_saved_on_entry (int regno)
2540 {
2541   return cfun->machine->frame.reg_offset[regno] >= 0;
2542 }
2543
2544 static unsigned
2545 aarch64_next_callee_save (unsigned regno, unsigned limit)
2546 {
2547   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2548     regno ++;
2549   return regno;
2550 }
2551
2552 static void
2553 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2554                            HOST_WIDE_INT adjustment)
2555  {
2556   rtx base_rtx = stack_pointer_rtx;
2557   rtx insn, reg, mem;
2558
2559   reg = gen_rtx_REG (mode, regno);
2560   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2561                             plus_constant (Pmode, base_rtx, -adjustment));
2562   mem = gen_rtx_MEM (mode, mem);
2563
2564   insn = emit_move_insn (mem, reg);
2565   RTX_FRAME_RELATED_P (insn) = 1;
2566 }
2567
2568 static rtx
2569 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2570                           HOST_WIDE_INT adjustment)
2571 {
2572   switch (mode)
2573     {
2574     case DImode:
2575       return gen_storewb_pairdi_di (base, base, reg, reg2,
2576                                     GEN_INT (-adjustment),
2577                                     GEN_INT (UNITS_PER_WORD - adjustment));
2578     case DFmode:
2579       return gen_storewb_pairdf_di (base, base, reg, reg2,
2580                                     GEN_INT (-adjustment),
2581                                     GEN_INT (UNITS_PER_WORD - adjustment));
2582     default:
2583       gcc_unreachable ();
2584     }
2585 }
2586
2587 static void
2588 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2589                          unsigned regno2, HOST_WIDE_INT adjustment)
2590 {
2591   rtx_insn *insn;
2592   rtx reg1 = gen_rtx_REG (mode, regno1);
2593   rtx reg2 = gen_rtx_REG (mode, regno2);
2594
2595   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2596                                               reg2, adjustment));
2597   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2598   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2599   RTX_FRAME_RELATED_P (insn) = 1;
2600 }
2601
2602 static rtx
2603 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2604                          HOST_WIDE_INT adjustment)
2605 {
2606   switch (mode)
2607     {
2608     case DImode:
2609       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2610                                    GEN_INT (UNITS_PER_WORD));
2611     case DFmode:
2612       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2613                                    GEN_INT (UNITS_PER_WORD));
2614     default:
2615       gcc_unreachable ();
2616     }
2617 }
2618
2619 static rtx
2620 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2621                         rtx reg2)
2622 {
2623   switch (mode)
2624     {
2625     case DImode:
2626       return gen_store_pairdi (mem1, reg1, mem2, reg2);
2627
2628     case DFmode:
2629       return gen_store_pairdf (mem1, reg1, mem2, reg2);
2630
2631     default:
2632       gcc_unreachable ();
2633     }
2634 }
2635
2636 static rtx
2637 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2638                        rtx mem2)
2639 {
2640   switch (mode)
2641     {
2642     case DImode:
2643       return gen_load_pairdi (reg1, mem1, reg2, mem2);
2644
2645     case DFmode:
2646       return gen_load_pairdf (reg1, mem1, reg2, mem2);
2647
2648     default:
2649       gcc_unreachable ();
2650     }
2651 }
2652
2653
2654 static void
2655 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2656                            unsigned start, unsigned limit, bool skip_wb)
2657 {
2658   rtx_insn *insn;
2659   rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2660                                                  ? gen_frame_mem : gen_rtx_MEM);
2661   unsigned regno;
2662   unsigned regno2;
2663
2664   for (regno = aarch64_next_callee_save (start, limit);
2665        regno <= limit;
2666        regno = aarch64_next_callee_save (regno + 1, limit))
2667     {
2668       rtx reg, mem;
2669       HOST_WIDE_INT offset;
2670
2671       if (skip_wb
2672           && (regno == cfun->machine->frame.wb_candidate1
2673               || regno == cfun->machine->frame.wb_candidate2))
2674         continue;
2675
2676       reg = gen_rtx_REG (mode, regno);
2677       offset = start_offset + cfun->machine->frame.reg_offset[regno];
2678       mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2679                                               offset));
2680
2681       regno2 = aarch64_next_callee_save (regno + 1, limit);
2682
2683       if (regno2 <= limit
2684           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2685               == cfun->machine->frame.reg_offset[regno2]))
2686
2687         {
2688           rtx reg2 = gen_rtx_REG (mode, regno2);
2689           rtx mem2;
2690
2691           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2692           mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2693                                                    offset));
2694           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2695                                                     reg2));
2696
2697           /* The first part of a frame-related parallel insn is
2698              always assumed to be relevant to the frame
2699              calculations; subsequent parts, are only
2700              frame-related if explicitly marked.  */
2701           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2702           regno = regno2;
2703         }
2704       else
2705         insn = emit_move_insn (mem, reg);
2706
2707       RTX_FRAME_RELATED_P (insn) = 1;
2708     }
2709 }
2710
2711 static void
2712 aarch64_restore_callee_saves (machine_mode mode,
2713                               HOST_WIDE_INT start_offset, unsigned start,
2714                               unsigned limit, bool skip_wb, rtx *cfi_ops)
2715 {
2716   rtx base_rtx = stack_pointer_rtx;
2717   rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2718                                                  ? gen_frame_mem : gen_rtx_MEM);
2719   unsigned regno;
2720   unsigned regno2;
2721   HOST_WIDE_INT offset;
2722
2723   for (regno = aarch64_next_callee_save (start, limit);
2724        regno <= limit;
2725        regno = aarch64_next_callee_save (regno + 1, limit))
2726     {
2727       rtx reg, mem;
2728
2729       if (skip_wb
2730           && (regno == cfun->machine->frame.wb_candidate1
2731               || regno == cfun->machine->frame.wb_candidate2))
2732         continue;
2733
2734       reg = gen_rtx_REG (mode, regno);
2735       offset = start_offset + cfun->machine->frame.reg_offset[regno];
2736       mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2737
2738       regno2 = aarch64_next_callee_save (regno + 1, limit);
2739
2740       if (regno2 <= limit
2741           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2742               == cfun->machine->frame.reg_offset[regno2]))
2743         {
2744           rtx reg2 = gen_rtx_REG (mode, regno2);
2745           rtx mem2;
2746
2747           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2748           mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2749           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2750
2751           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2752           regno = regno2;
2753         }
2754       else
2755         emit_move_insn (reg, mem);
2756       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2757     }
2758 }
2759
2760 /* AArch64 stack frames generated by this compiler look like:
2761
2762         +-------------------------------+
2763         |                               |
2764         |  incoming stack arguments     |
2765         |                               |
2766         +-------------------------------+
2767         |                               | <-- incoming stack pointer (aligned)
2768         |  callee-allocated save area   |
2769         |  for register varargs         |
2770         |                               |
2771         +-------------------------------+
2772         |  local variables              | <-- frame_pointer_rtx
2773         |                               |
2774         +-------------------------------+
2775         |  padding0                     | \
2776         +-------------------------------+  |
2777         |  callee-saved registers       |  | frame.saved_regs_size
2778         +-------------------------------+  |
2779         |  LR'                          |  |
2780         +-------------------------------+  |
2781         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
2782         +-------------------------------+
2783         |  dynamic allocation           |
2784         +-------------------------------+
2785         |  padding                      |
2786         +-------------------------------+
2787         |  outgoing stack arguments     | <-- arg_pointer
2788         |                               |
2789         +-------------------------------+
2790         |                               | <-- stack_pointer_rtx (aligned)
2791
2792    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2793    but leave frame_pointer_rtx and hard_frame_pointer_rtx
2794    unchanged.  */
2795
2796 /* Generate the prologue instructions for entry into a function.
2797    Establish the stack frame by decreasing the stack pointer with a
2798    properly calculated size and, if necessary, create a frame record
2799    filled with the values of LR and previous frame pointer.  The
2800    current FP is also set up if it is in use.  */
2801
2802 void
2803 aarch64_expand_prologue (void)
2804 {
2805   /* sub sp, sp, #<frame_size>
2806      stp {fp, lr}, [sp, #<frame_size> - 16]
2807      add fp, sp, #<frame_size> - hardfp_offset
2808      stp {cs_reg}, [fp, #-16] etc.
2809
2810      sub sp, sp, <final_adjustment_if_any>
2811   */
2812   HOST_WIDE_INT frame_size, offset;
2813   HOST_WIDE_INT fp_offset;              /* Offset from hard FP to SP.  */
2814   HOST_WIDE_INT hard_fp_offset;
2815   rtx_insn *insn;
2816
2817   aarch64_layout_frame ();
2818
2819   offset = frame_size = cfun->machine->frame.frame_size;
2820   hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2821   fp_offset = frame_size - hard_fp_offset;
2822
2823   if (flag_stack_usage_info)
2824     current_function_static_stack_size = frame_size;
2825
2826   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
2827     {
2828       if (crtl->is_leaf && !cfun->calls_alloca)
2829         {
2830           if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
2831             aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
2832                                             frame_size - STACK_CHECK_PROTECT);
2833         }
2834       else if (frame_size > 0)
2835         aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
2836     }
2837
2838   /* Store pairs and load pairs have a range only -512 to 504.  */
2839   if (offset >= 512)
2840     {
2841       /* When the frame has a large size, an initial decrease is done on
2842          the stack pointer to jump over the callee-allocated save area for
2843          register varargs, the local variable area and/or the callee-saved
2844          register area.  This will allow the pre-index write-back
2845          store pair instructions to be used for setting up the stack frame
2846          efficiently.  */
2847       offset = hard_fp_offset;
2848       if (offset >= 512)
2849         offset = cfun->machine->frame.saved_regs_size;
2850
2851       frame_size -= (offset + crtl->outgoing_args_size);
2852       fp_offset = 0;
2853
2854       if (frame_size >= 0x1000000)
2855         {
2856           rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2857           emit_move_insn (op0, GEN_INT (-frame_size));
2858           insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2859
2860           add_reg_note (insn, REG_CFA_ADJUST_CFA,
2861                         gen_rtx_SET (stack_pointer_rtx,
2862                                      plus_constant (Pmode, stack_pointer_rtx,
2863                                                     -frame_size)));
2864           RTX_FRAME_RELATED_P (insn) = 1;
2865         }
2866       else if (frame_size > 0)
2867         {
2868           int hi_ofs = frame_size & 0xfff000;
2869           int lo_ofs = frame_size & 0x000fff;
2870
2871           if (hi_ofs)
2872             {
2873               insn = emit_insn (gen_add2_insn
2874                                 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2875               RTX_FRAME_RELATED_P (insn) = 1;
2876             }
2877           if (lo_ofs)
2878             {
2879               insn = emit_insn (gen_add2_insn
2880                                 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2881               RTX_FRAME_RELATED_P (insn) = 1;
2882             }
2883         }
2884     }
2885   else
2886     frame_size = -1;
2887
2888   if (offset > 0)
2889     {
2890       bool skip_wb = false;
2891
2892       if (frame_pointer_needed)
2893         {
2894           skip_wb = true;
2895
2896           if (fp_offset)
2897             {
2898               insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2899                                                GEN_INT (-offset)));
2900               RTX_FRAME_RELATED_P (insn) = 1;
2901
2902               aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2903                                          R30_REGNUM, false);
2904             }
2905           else
2906             aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2907
2908           /* Set up frame pointer to point to the location of the
2909              previous frame pointer on the stack.  */
2910           insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2911                                            stack_pointer_rtx,
2912                                            GEN_INT (fp_offset)));
2913           RTX_FRAME_RELATED_P (insn) = 1;
2914           emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2915         }
2916       else
2917         {
2918           unsigned reg1 = cfun->machine->frame.wb_candidate1;
2919           unsigned reg2 = cfun->machine->frame.wb_candidate2;
2920
2921           if (fp_offset
2922               || reg1 == FIRST_PSEUDO_REGISTER
2923               || (reg2 == FIRST_PSEUDO_REGISTER
2924                   && offset >= 256))
2925             {
2926               insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2927                                                GEN_INT (-offset)));
2928               RTX_FRAME_RELATED_P (insn) = 1;
2929             }
2930           else
2931             {
2932               machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2933
2934               skip_wb = true;
2935
2936               if (reg2 == FIRST_PSEUDO_REGISTER)
2937                 aarch64_pushwb_single_reg (mode1, reg1, offset);
2938               else
2939                 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2940             }
2941         }
2942
2943       aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2944                                  skip_wb);
2945       aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2946                                  skip_wb);
2947     }
2948
2949   /* when offset >= 512,
2950      sub sp, sp, #<outgoing_args_size> */
2951   if (frame_size > -1)
2952     {
2953       if (crtl->outgoing_args_size > 0)
2954         {
2955           insn = emit_insn (gen_add2_insn
2956                             (stack_pointer_rtx,
2957                              GEN_INT (- crtl->outgoing_args_size)));
2958           RTX_FRAME_RELATED_P (insn) = 1;
2959         }
2960     }
2961 }
2962
2963 /* Return TRUE if we can use a simple_return insn.
2964
2965    This function checks whether the callee saved stack is empty, which
2966    means no restore actions are need. The pro_and_epilogue will use
2967    this to check whether shrink-wrapping opt is feasible.  */
2968
2969 bool
2970 aarch64_use_return_insn_p (void)
2971 {
2972   if (!reload_completed)
2973     return false;
2974
2975   if (crtl->profile)
2976     return false;
2977
2978   aarch64_layout_frame ();
2979
2980   return cfun->machine->frame.frame_size == 0;
2981 }
2982
2983 /* Generate the epilogue instructions for returning from a function.  */
2984 void
2985 aarch64_expand_epilogue (bool for_sibcall)
2986 {
2987   HOST_WIDE_INT frame_size, offset;
2988   HOST_WIDE_INT fp_offset;
2989   HOST_WIDE_INT hard_fp_offset;
2990   rtx_insn *insn;
2991   /* We need to add memory barrier to prevent read from deallocated stack.  */
2992   bool need_barrier_p = (get_frame_size () != 0
2993                          || cfun->machine->frame.saved_varargs_size);
2994
2995   aarch64_layout_frame ();
2996
2997   offset = frame_size = cfun->machine->frame.frame_size;
2998   hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2999   fp_offset = frame_size - hard_fp_offset;
3000
3001   /* Store pairs and load pairs have a range only -512 to 504.  */
3002   if (offset >= 512)
3003     {
3004       offset = hard_fp_offset;
3005       if (offset >= 512)
3006         offset = cfun->machine->frame.saved_regs_size;
3007
3008       frame_size -= (offset + crtl->outgoing_args_size);
3009       fp_offset = 0;
3010       if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
3011         {
3012           insn = emit_insn (gen_add2_insn
3013                             (stack_pointer_rtx,
3014                              GEN_INT (crtl->outgoing_args_size)));
3015           RTX_FRAME_RELATED_P (insn) = 1;
3016         }
3017     }
3018   else
3019     frame_size = -1;
3020
3021   /* If there were outgoing arguments or we've done dynamic stack
3022      allocation, then restore the stack pointer from the frame
3023      pointer.  This is at most one insn and more efficient than using
3024      GCC's internal mechanism.  */
3025   if (frame_pointer_needed
3026       && (crtl->outgoing_args_size || cfun->calls_alloca))
3027     {
3028       if (cfun->calls_alloca)
3029         emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3030
3031       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3032                                        hard_frame_pointer_rtx,
3033                                        GEN_INT (0)));
3034       offset = offset - fp_offset;
3035     }
3036
3037   if (offset > 0)
3038     {
3039       unsigned reg1 = cfun->machine->frame.wb_candidate1;
3040       unsigned reg2 = cfun->machine->frame.wb_candidate2;
3041       bool skip_wb = true;
3042       rtx cfi_ops = NULL;
3043
3044       if (frame_pointer_needed)
3045         fp_offset = 0;
3046       else if (fp_offset
3047                || reg1 == FIRST_PSEUDO_REGISTER
3048                || (reg2 == FIRST_PSEUDO_REGISTER
3049                    && offset >= 256))
3050         skip_wb = false;
3051
3052       aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
3053                                     skip_wb, &cfi_ops);
3054       aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
3055                                     skip_wb, &cfi_ops);
3056
3057       if (need_barrier_p)
3058         emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3059
3060       if (skip_wb)
3061         {
3062           machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
3063           rtx rreg1 = gen_rtx_REG (mode1, reg1);
3064
3065           cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
3066           if (reg2 == FIRST_PSEUDO_REGISTER)
3067             {
3068               rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
3069               mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3070               mem = gen_rtx_MEM (mode1, mem);
3071               insn = emit_move_insn (rreg1, mem);
3072             }
3073           else
3074             {
3075               rtx rreg2 = gen_rtx_REG (mode1, reg2);
3076
3077               cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
3078               insn = emit_insn (aarch64_gen_loadwb_pair
3079                                 (mode1, stack_pointer_rtx, rreg1,
3080                                  rreg2, offset));
3081             }
3082         }
3083       else
3084         {
3085           insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
3086                                            GEN_INT (offset)));
3087         }
3088
3089       /* Reset the CFA to be SP + FRAME_SIZE.  */
3090       rtx new_cfa = stack_pointer_rtx;
3091       if (frame_size > 0)
3092         new_cfa = plus_constant (Pmode, new_cfa, frame_size);
3093       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3094       REG_NOTES (insn) = cfi_ops;
3095       RTX_FRAME_RELATED_P (insn) = 1;
3096     }
3097
3098   if (frame_size > 0)
3099     {
3100       if (need_barrier_p)
3101         emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3102
3103       if (frame_size >= 0x1000000)
3104         {
3105           rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3106           emit_move_insn (op0, GEN_INT (frame_size));
3107           insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
3108         }
3109       else
3110         {
3111           int hi_ofs = frame_size & 0xfff000;
3112           int lo_ofs = frame_size & 0x000fff;
3113
3114           if (hi_ofs && lo_ofs)
3115             {
3116               insn = emit_insn (gen_add2_insn
3117                                 (stack_pointer_rtx, GEN_INT (hi_ofs)));
3118               RTX_FRAME_RELATED_P (insn) = 1;
3119               frame_size = lo_ofs;
3120             }
3121           insn = emit_insn (gen_add2_insn
3122                             (stack_pointer_rtx, GEN_INT (frame_size)));
3123         }
3124
3125       /* Reset the CFA to be SP + 0.  */
3126       add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
3127       RTX_FRAME_RELATED_P (insn) = 1;
3128     }
3129
3130   /* Stack adjustment for exception handler.  */
3131   if (crtl->calls_eh_return)
3132     {
3133       /* We need to unwind the stack by the offset computed by
3134          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
3135          to be SP; letting the CFA move during this adjustment
3136          is just as correct as retaining the CFA from the body
3137          of the function.  Therefore, do nothing special.  */
3138       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3139     }
3140
3141   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3142   if (!for_sibcall)
3143     emit_jump_insn (ret_rtx);
3144 }
3145
3146 /* Return the place to copy the exception unwinding return address to.
3147    This will probably be a stack slot, but could (in theory be the
3148    return register).  */
3149 rtx
3150 aarch64_final_eh_return_addr (void)
3151 {
3152   HOST_WIDE_INT fp_offset;
3153
3154   aarch64_layout_frame ();
3155
3156   fp_offset = cfun->machine->frame.frame_size
3157               - cfun->machine->frame.hard_fp_offset;
3158
3159   if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
3160     return gen_rtx_REG (DImode, LR_REGNUM);
3161
3162   /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2.  This can
3163      result in a store to save LR introduced by builtin_eh_return () being
3164      incorrectly deleted because the alias is not detected.
3165      So in the calculation of the address to copy the exception unwinding
3166      return address to, we note 2 cases.
3167      If FP is needed and the fp_offset is 0, it means that SP = FP and hence
3168      we return a SP-relative location since all the addresses are SP-relative
3169      in this case.  This prevents the store from being optimized away.
3170      If the fp_offset is not 0, then the addresses will be FP-relative and
3171      therefore we return a FP-relative location.  */
3172
3173   if (frame_pointer_needed)
3174     {
3175       if (fp_offset)
3176         return gen_frame_mem (DImode,
3177                               plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3178       else
3179         return gen_frame_mem (DImode,
3180                               plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
3181     }
3182
3183   /* If FP is not needed, we calculate the location of LR, which would be
3184      at the top of the saved registers block.  */
3185
3186   return gen_frame_mem (DImode,
3187                         plus_constant (Pmode,
3188                                        stack_pointer_rtx,
3189                                        fp_offset
3190                                        + cfun->machine->frame.saved_regs_size
3191                                        - 2 * UNITS_PER_WORD));
3192 }
3193
3194 /* Possibly output code to build up a constant in a register.  For
3195    the benefit of the costs infrastructure, returns the number of
3196    instructions which would be emitted.  GENERATE inhibits or
3197    enables code generation.  */
3198
3199 static int
3200 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
3201 {
3202   int insns = 0;
3203
3204   if (aarch64_bitmask_imm (val, DImode))
3205     {
3206       if (generate)
3207         emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
3208       insns = 1;
3209     }
3210   else
3211     {
3212       int i;
3213       int ncount = 0;
3214       int zcount = 0;
3215       HOST_WIDE_INT valp = val >> 16;
3216       HOST_WIDE_INT valm;
3217       HOST_WIDE_INT tval;
3218
3219       for (i = 16; i < 64; i += 16)
3220         {
3221           valm = (valp & 0xffff);
3222
3223           if (valm != 0)
3224             ++ zcount;
3225
3226           if (valm != 0xffff)
3227             ++ ncount;
3228
3229           valp >>= 16;
3230         }
3231
3232       /* zcount contains the number of additional MOVK instructions
3233          required if the constant is built up with an initial MOVZ instruction,
3234          while ncount is the number of MOVK instructions required if starting
3235          with a MOVN instruction.  Choose the sequence that yields the fewest
3236          number of instructions, preferring MOVZ instructions when they are both
3237          the same.  */
3238       if (ncount < zcount)
3239         {
3240           if (generate)
3241             emit_move_insn (gen_rtx_REG (Pmode, regnum),
3242                             GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
3243           tval = 0xffff;
3244           insns++;
3245         }
3246       else
3247         {
3248           if (generate)
3249             emit_move_insn (gen_rtx_REG (Pmode, regnum),
3250                             GEN_INT (val & 0xffff));
3251           tval = 0;
3252           insns++;
3253         }
3254
3255       val >>= 16;
3256
3257       for (i = 16; i < 64; i += 16)
3258         {
3259           if ((val & 0xffff) != tval)
3260             {
3261               if (generate)
3262                 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
3263                                            GEN_INT (i),
3264                                            GEN_INT (val & 0xffff)));
3265               insns++;
3266             }
3267           val >>= 16;
3268         }
3269     }
3270   return insns;
3271 }
3272
3273 static void
3274 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
3275 {
3276   HOST_WIDE_INT mdelta = delta;
3277   rtx this_rtx = gen_rtx_REG (Pmode, regnum);
3278   rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
3279
3280   if (mdelta < 0)
3281     mdelta = -mdelta;
3282
3283   if (mdelta >= 4096 * 4096)
3284     {
3285       (void) aarch64_build_constant (scratchreg, delta, true);
3286       emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
3287     }
3288   else if (mdelta > 0)
3289     {
3290       if (mdelta >= 4096)
3291         {
3292           emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
3293           rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
3294           if (delta < 0)
3295             emit_insn (gen_rtx_SET (this_rtx,
3296                                     gen_rtx_MINUS (Pmode, this_rtx, shift)));
3297           else
3298             emit_insn (gen_rtx_SET (this_rtx,
3299                                     gen_rtx_PLUS (Pmode, this_rtx, shift)));
3300         }
3301       if (mdelta % 4096 != 0)
3302         {
3303           scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
3304           emit_insn (gen_rtx_SET (this_rtx,
3305                                   gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
3306         }
3307     }
3308 }
3309
3310 /* Output code to add DELTA to the first argument, and then jump
3311    to FUNCTION.  Used for C++ multiple inheritance.  */
3312 static void
3313 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3314                          HOST_WIDE_INT delta,
3315                          HOST_WIDE_INT vcall_offset,
3316                          tree function)
3317 {
3318   /* The this pointer is always in x0.  Note that this differs from
3319      Arm where the this pointer maybe bumped to r1 if r0 is required
3320      to return a pointer to an aggregate.  On AArch64 a result value
3321      pointer will be in x8.  */
3322   int this_regno = R0_REGNUM;
3323   rtx this_rtx, temp0, temp1, addr, funexp;
3324   rtx_insn *insn;
3325
3326   reload_completed = 1;
3327   emit_note (NOTE_INSN_PROLOGUE_END);
3328
3329   if (vcall_offset == 0)
3330     aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3331   else
3332     {
3333       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3334
3335       this_rtx = gen_rtx_REG (Pmode, this_regno);
3336       temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3337       temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3338
3339       addr = this_rtx;
3340       if (delta != 0)
3341         {
3342           if (delta >= -256 && delta < 256)
3343             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3344                                        plus_constant (Pmode, this_rtx, delta));
3345           else
3346             aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3347         }
3348
3349       if (Pmode == ptr_mode)
3350         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3351       else
3352         aarch64_emit_move (temp0,
3353                            gen_rtx_ZERO_EXTEND (Pmode,
3354                                                 gen_rtx_MEM (ptr_mode, addr)));
3355
3356       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3357           addr = plus_constant (Pmode, temp0, vcall_offset);
3358       else
3359         {
3360           (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
3361           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3362         }
3363
3364       if (Pmode == ptr_mode)
3365         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3366       else
3367         aarch64_emit_move (temp1,
3368                            gen_rtx_SIGN_EXTEND (Pmode,
3369                                                 gen_rtx_MEM (ptr_mode, addr)));
3370
3371       emit_insn (gen_add2_insn (this_rtx, temp1));
3372     }
3373
3374   /* Generate a tail call to the target function.  */
3375   if (!TREE_USED (function))
3376     {
3377       assemble_external (function);
3378       TREE_USED (function) = 1;
3379     }
3380   funexp = XEXP (DECL_RTL (function), 0);
3381   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3382   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3383   SIBLING_CALL_P (insn) = 1;
3384
3385   insn = get_insns ();
3386   shorten_branches (insn);
3387   final_start_function (insn, file, 1);
3388   final (insn, file, 1);
3389   final_end_function ();
3390
3391   /* Stop pretending to be a post-reload pass.  */
3392   reload_completed = 0;
3393 }
3394
3395 static bool
3396 aarch64_tls_referenced_p (rtx x)
3397 {
3398   if (!TARGET_HAVE_TLS)
3399     return false;
3400   subrtx_iterator::array_type array;
3401   FOR_EACH_SUBRTX (iter, array, x, ALL)
3402     {
3403       const_rtx x = *iter;
3404       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3405         return true;
3406       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3407          TLS offsets, not real symbol references.  */
3408       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3409         iter.skip_subrtxes ();
3410     }
3411   return false;
3412 }
3413
3414
3415 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3416    a left shift of 0 or 12 bits.  */
3417 bool
3418 aarch64_uimm12_shift (HOST_WIDE_INT val)
3419 {
3420   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3421           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3422           );
3423 }
3424
3425
3426 /* Return true if val is an immediate that can be loaded into a
3427    register by a MOVZ instruction.  */
3428 static bool
3429 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3430 {
3431   if (GET_MODE_SIZE (mode) > 4)
3432     {
3433       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3434           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3435         return 1;
3436     }
3437   else
3438     {
3439       /* Ignore sign extension.  */
3440       val &= (HOST_WIDE_INT) 0xffffffff;
3441     }
3442   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3443           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3444 }
3445
3446 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
3447
3448 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3449   {
3450     0x0000000100000001ull,
3451     0x0001000100010001ull,
3452     0x0101010101010101ull,
3453     0x1111111111111111ull,
3454     0x5555555555555555ull,
3455   };
3456
3457
3458 /* Return true if val is a valid bitmask immediate.  */
3459
3460 bool
3461 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3462 {
3463   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3464   int bits;
3465
3466   /* Check for a single sequence of one bits and return quickly if so.
3467      The special cases of all ones and all zeroes returns false.  */
3468   val = (unsigned HOST_WIDE_INT) val_in;
3469   tmp = val + (val & -val);
3470
3471   if (tmp == (tmp & -tmp))
3472     return (val + 1) > 1;
3473
3474   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
3475   if (mode == SImode)
3476     val = (val << 32) | (val & 0xffffffff);
3477
3478   /* Invert if the immediate doesn't start with a zero bit - this means we
3479      only need to search for sequences of one bits.  */
3480   if (val & 1)
3481     val = ~val;
3482
3483   /* Find the first set bit and set tmp to val with the first sequence of one
3484      bits removed.  Return success if there is a single sequence of ones.  */
3485   first_one = val & -val;
3486   tmp = val & (val + first_one);
3487
3488   if (tmp == 0)
3489     return true;
3490
3491   /* Find the next set bit and compute the difference in bit position.  */
3492   next_one = tmp & -tmp;
3493   bits = clz_hwi (first_one) - clz_hwi (next_one);
3494   mask = val ^ tmp;
3495
3496   /* Check the bit position difference is a power of 2, and that the first
3497      sequence of one bits fits within 'bits' bits.  */
3498   if ((mask >> bits) != 0 || bits != (bits & -bits))
3499     return false;
3500
3501   /* Check the sequence of one bits is repeated 64/bits times.  */
3502   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
3503 }
3504
3505
3506 /* Return true if val is an immediate that can be loaded into a
3507    register in a single instruction.  */
3508 bool
3509 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3510 {
3511   if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3512     return 1;
3513   return aarch64_bitmask_imm (val, mode);
3514 }
3515
3516 static bool
3517 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3518 {
3519   rtx base, offset;
3520
3521   if (GET_CODE (x) == HIGH)
3522     return true;
3523
3524   split_const (x, &base, &offset);
3525   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3526     {
3527       if (aarch64_classify_symbol (base, offset)
3528           != SYMBOL_FORCE_TO_MEM)
3529         return true;
3530       else
3531         /* Avoid generating a 64-bit relocation in ILP32; leave
3532            to aarch64_expand_mov_immediate to handle it properly.  */
3533         return mode != ptr_mode;
3534     }
3535
3536   return aarch64_tls_referenced_p (x);
3537 }
3538
3539 /* Implement TARGET_CASE_VALUES_THRESHOLD.  */
3540
3541 static unsigned int
3542 aarch64_case_values_threshold (void)
3543 {
3544   /* Use the specified limit for the number of cases before using jump
3545      tables at higher optimization levels.  */
3546   if (optimize > 2
3547       && selected_cpu->tune->max_case_values != 0)
3548     return selected_cpu->tune->max_case_values;
3549   else
3550     return default_case_values_threshold ();
3551 }
3552
3553 /* Return true if register REGNO is a valid index register.
3554    STRICT_P is true if REG_OK_STRICT is in effect.  */
3555
3556 bool
3557 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3558 {
3559   if (!HARD_REGISTER_NUM_P (regno))
3560     {
3561       if (!strict_p)
3562         return true;
3563
3564       if (!reg_renumber)
3565         return false;
3566
3567       regno = reg_renumber[regno];
3568     }
3569   return GP_REGNUM_P (regno);
3570 }
3571
3572 /* Return true if register REGNO is a valid base register for mode MODE.
3573    STRICT_P is true if REG_OK_STRICT is in effect.  */
3574
3575 bool
3576 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3577 {
3578   if (!HARD_REGISTER_NUM_P (regno))
3579     {
3580       if (!strict_p)
3581         return true;
3582
3583       if (!reg_renumber)
3584         return false;
3585
3586       regno = reg_renumber[regno];
3587     }
3588
3589   /* The fake registers will be eliminated to either the stack or
3590      hard frame pointer, both of which are usually valid base registers.
3591      Reload deals with the cases where the eliminated form isn't valid.  */
3592   return (GP_REGNUM_P (regno)
3593           || regno == SP_REGNUM
3594           || regno == FRAME_POINTER_REGNUM
3595           || regno == ARG_POINTER_REGNUM);
3596 }
3597
3598 /* Return true if X is a valid base register for mode MODE.
3599    STRICT_P is true if REG_OK_STRICT is in effect.  */
3600
3601 static bool
3602 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3603 {
3604   if (!strict_p && GET_CODE (x) == SUBREG)
3605     x = SUBREG_REG (x);
3606
3607   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3608 }
3609
3610 /* Return true if address offset is a valid index.  If it is, fill in INFO
3611    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
3612
3613 static bool
3614 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3615                         machine_mode mode, bool strict_p)
3616 {
3617   enum aarch64_address_type type;
3618   rtx index;
3619   int shift;
3620
3621   /* (reg:P) */
3622   if ((REG_P (x) || GET_CODE (x) == SUBREG)
3623       && GET_MODE (x) == Pmode)
3624     {
3625       type = ADDRESS_REG_REG;
3626       index = x;
3627       shift = 0;
3628     }
3629   /* (sign_extend:DI (reg:SI)) */
3630   else if ((GET_CODE (x) == SIGN_EXTEND
3631             || GET_CODE (x) == ZERO_EXTEND)
3632            && GET_MODE (x) == DImode
3633            && GET_MODE (XEXP (x, 0)) == SImode)
3634     {
3635       type = (GET_CODE (x) == SIGN_EXTEND)
3636         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3637       index = XEXP (x, 0);
3638       shift = 0;
3639     }
3640   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3641   else if (GET_CODE (x) == MULT
3642            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3643                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3644            && GET_MODE (XEXP (x, 0)) == DImode
3645            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3646            && CONST_INT_P (XEXP (x, 1)))
3647     {
3648       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3649         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3650       index = XEXP (XEXP (x, 0), 0);
3651       shift = exact_log2 (INTVAL (XEXP (x, 1)));
3652     }
3653   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3654   else if (GET_CODE (x) == ASHIFT
3655            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3656                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3657            && GET_MODE (XEXP (x, 0)) == DImode
3658            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3659            && CONST_INT_P (XEXP (x, 1)))
3660     {
3661       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3662         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3663       index = XEXP (XEXP (x, 0), 0);
3664       shift = INTVAL (XEXP (x, 1));
3665     }
3666   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3667   else if ((GET_CODE (x) == SIGN_EXTRACT
3668             || GET_CODE (x) == ZERO_EXTRACT)
3669            && GET_MODE (x) == DImode
3670            && GET_CODE (XEXP (x, 0)) == MULT
3671            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3672            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3673     {
3674       type = (GET_CODE (x) == SIGN_EXTRACT)
3675         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3676       index = XEXP (XEXP (x, 0), 0);
3677       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3678       if (INTVAL (XEXP (x, 1)) != 32 + shift
3679           || INTVAL (XEXP (x, 2)) != 0)
3680         shift = -1;
3681     }
3682   /* (and:DI (mult:DI (reg:DI) (const_int scale))
3683      (const_int 0xffffffff<<shift)) */
3684   else if (GET_CODE (x) == AND
3685            && GET_MODE (x) == DImode
3686            && GET_CODE (XEXP (x, 0)) == MULT
3687            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3688            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3689            && CONST_INT_P (XEXP (x, 1)))
3690     {
3691       type = ADDRESS_REG_UXTW;
3692       index = XEXP (XEXP (x, 0), 0);
3693       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3694       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3695         shift = -1;
3696     }
3697   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3698   else if ((GET_CODE (x) == SIGN_EXTRACT
3699             || GET_CODE (x) == ZERO_EXTRACT)
3700            && GET_MODE (x) == DImode
3701            && GET_CODE (XEXP (x, 0)) == ASHIFT
3702            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3703            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3704     {
3705       type = (GET_CODE (x) == SIGN_EXTRACT)
3706         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3707       index = XEXP (XEXP (x, 0), 0);
3708       shift = INTVAL (XEXP (XEXP (x, 0), 1));
3709       if (INTVAL (XEXP (x, 1)) != 32 + shift
3710           || INTVAL (XEXP (x, 2)) != 0)
3711         shift = -1;
3712     }
3713   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3714      (const_int 0xffffffff<<shift)) */
3715   else if (GET_CODE (x) == AND
3716            && GET_MODE (x) == DImode
3717            && GET_CODE (XEXP (x, 0)) == ASHIFT
3718            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3719            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3720            && CONST_INT_P (XEXP (x, 1)))
3721     {
3722       type = ADDRESS_REG_UXTW;
3723       index = XEXP (XEXP (x, 0), 0);
3724       shift = INTVAL (XEXP (XEXP (x, 0), 1));
3725       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3726         shift = -1;
3727     }
3728   /* (mult:P (reg:P) (const_int scale)) */
3729   else if (GET_CODE (x) == MULT
3730            && GET_MODE (x) == Pmode
3731            && GET_MODE (XEXP (x, 0)) == Pmode
3732            && CONST_INT_P (XEXP (x, 1)))
3733     {
3734       type = ADDRESS_REG_REG;
3735       index = XEXP (x, 0);
3736       shift = exact_log2 (INTVAL (XEXP (x, 1)));
3737     }
3738   /* (ashift:P (reg:P) (const_int shift)) */
3739   else if (GET_CODE (x) == ASHIFT
3740            && GET_MODE (x) == Pmode
3741            && GET_MODE (XEXP (x, 0)) == Pmode
3742            && CONST_INT_P (XEXP (x, 1)))
3743     {
3744       type = ADDRESS_REG_REG;
3745       index = XEXP (x, 0);
3746       shift = INTVAL (XEXP (x, 1));
3747     }
3748   else
3749     return false;
3750
3751   if (GET_CODE (index) == SUBREG)
3752     index = SUBREG_REG (index);
3753
3754   if ((shift == 0 ||
3755        (shift > 0 && shift <= 3
3756         && (1 << shift) == GET_MODE_SIZE (mode)))
3757       && REG_P (index)
3758       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3759     {
3760       info->type = type;
3761       info->offset = index;
3762       info->shift = shift;
3763       return true;
3764     }
3765
3766   return false;
3767 }
3768
3769 bool
3770 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3771 {
3772   return (offset >= -64 * GET_MODE_SIZE (mode)
3773           && offset < 64 * GET_MODE_SIZE (mode)
3774           && offset % GET_MODE_SIZE (mode) == 0);
3775 }
3776
3777 static inline bool
3778 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3779                                HOST_WIDE_INT offset)
3780 {
3781   return offset >= -256 && offset < 256;
3782 }
3783
3784 static inline bool
3785 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3786 {
3787   return (offset >= 0
3788           && offset < 4096 * GET_MODE_SIZE (mode)
3789           && offset % GET_MODE_SIZE (mode) == 0);
3790 }
3791
3792 /* Return true if MODE is one of the modes for which we
3793    support LDP/STP operations.  */
3794
3795 static bool
3796 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3797 {
3798   return mode == SImode || mode == DImode
3799          || mode == SFmode || mode == DFmode
3800          || (aarch64_vector_mode_supported_p (mode)
3801              && GET_MODE_SIZE (mode) == 8);
3802 }
3803
3804 /* Return true if X is a valid address for machine mode MODE.  If it is,
3805    fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
3806    effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
3807
3808 static bool
3809 aarch64_classify_address (struct aarch64_address_info *info,
3810                           rtx x, machine_mode mode,
3811                           RTX_CODE outer_code, bool strict_p)
3812 {
3813   enum rtx_code code = GET_CODE (x);
3814   rtx op0, op1;
3815
3816   /* On BE, we use load/store pair for all large int mode load/stores.  */
3817   bool load_store_pair_p = (outer_code == PARALLEL
3818                             || (BYTES_BIG_ENDIAN
3819                                 && aarch64_vect_struct_mode_p (mode)));
3820
3821   bool allow_reg_index_p =
3822     !load_store_pair_p
3823     && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3824     && !aarch64_vect_struct_mode_p (mode);
3825
3826   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3827      REG addressing.  */
3828   if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3829       && (code != POST_INC && code != REG))
3830     return false;
3831
3832   switch (code)
3833     {
3834     case REG:
3835     case SUBREG:
3836       info->type = ADDRESS_REG_IMM;
3837       info->base = x;
3838       info->offset = const0_rtx;
3839       return aarch64_base_register_rtx_p (x, strict_p);
3840
3841     case PLUS:
3842       op0 = XEXP (x, 0);
3843       op1 = XEXP (x, 1);
3844
3845       if (! strict_p
3846           && REG_P (op0)
3847           && (op0 == virtual_stack_vars_rtx
3848               || op0 == frame_pointer_rtx
3849               || op0 == arg_pointer_rtx)
3850           && CONST_INT_P (op1))
3851         {
3852           info->type = ADDRESS_REG_IMM;
3853           info->base = op0;
3854           info->offset = op1;
3855
3856           return true;
3857         }
3858
3859       if (GET_MODE_SIZE (mode) != 0
3860           && CONST_INT_P (op1)
3861           && aarch64_base_register_rtx_p (op0, strict_p))
3862         {
3863           HOST_WIDE_INT offset = INTVAL (op1);
3864
3865           info->type = ADDRESS_REG_IMM;
3866           info->base = op0;
3867           info->offset = op1;
3868
3869           /* TImode and TFmode values are allowed in both pairs of X
3870              registers and individual Q registers.  The available
3871              address modes are:
3872              X,X: 7-bit signed scaled offset
3873              Q:   9-bit signed offset
3874              We conservatively require an offset representable in either mode.
3875            */
3876           if (mode == TImode || mode == TFmode)
3877             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3878                     && offset_9bit_signed_unscaled_p (mode, offset));
3879
3880           /* A 7bit offset check because OImode will emit a ldp/stp
3881              instruction (only big endian will get here).
3882              For ldp/stp instructions, the offset is scaled for the size of a
3883              single element of the pair.  */
3884           if (mode == OImode)
3885             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3886
3887           /* Three 9/12 bit offsets checks because CImode will emit three
3888              ldr/str instructions (only big endian will get here).  */
3889           if (mode == CImode)
3890             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3891                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3892                         || offset_12bit_unsigned_scaled_p (V16QImode,
3893                                                            offset + 32)));
3894
3895           /* Two 7bit offsets checks because XImode will emit two ldp/stp
3896              instructions (only big endian will get here).  */
3897           if (mode == XImode)
3898             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3899                     && aarch64_offset_7bit_signed_scaled_p (TImode,
3900                                                             offset + 32));
3901
3902           if (load_store_pair_p)
3903             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3904                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3905           else
3906             return (offset_9bit_signed_unscaled_p (mode, offset)
3907                     || offset_12bit_unsigned_scaled_p (mode, offset));
3908         }
3909
3910       if (allow_reg_index_p)
3911         {
3912           /* Look for base + (scaled/extended) index register.  */
3913           if (aarch64_base_register_rtx_p (op0, strict_p)
3914               && aarch64_classify_index (info, op1, mode, strict_p))
3915             {
3916               info->base = op0;
3917               return true;
3918             }
3919           if (aarch64_base_register_rtx_p (op1, strict_p)
3920               && aarch64_classify_index (info, op0, mode, strict_p))
3921             {
3922               info->base = op1;
3923               return true;
3924             }
3925         }
3926
3927       return false;
3928
3929     case POST_INC:
3930     case POST_DEC:
3931     case PRE_INC:
3932     case PRE_DEC:
3933       info->type = ADDRESS_REG_WB;
3934       info->base = XEXP (x, 0);
3935       info->offset = NULL_RTX;
3936       return aarch64_base_register_rtx_p (info->base, strict_p);
3937
3938     case POST_MODIFY:
3939     case PRE_MODIFY:
3940       info->type = ADDRESS_REG_WB;
3941       info->base = XEXP (x, 0);
3942       if (GET_CODE (XEXP (x, 1)) == PLUS
3943           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3944           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3945           && aarch64_base_register_rtx_p (info->base, strict_p))
3946         {
3947           HOST_WIDE_INT offset;
3948           info->offset = XEXP (XEXP (x, 1), 1);
3949           offset = INTVAL (info->offset);
3950
3951           /* TImode and TFmode values are allowed in both pairs of X
3952              registers and individual Q registers.  The available
3953              address modes are:
3954              X,X: 7-bit signed scaled offset
3955              Q:   9-bit signed offset
3956              We conservatively require an offset representable in either mode.
3957            */
3958           if (mode == TImode || mode == TFmode)
3959             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3960                     && offset_9bit_signed_unscaled_p (mode, offset));
3961
3962           if (load_store_pair_p)
3963             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3964                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3965           else
3966             return offset_9bit_signed_unscaled_p (mode, offset);
3967         }
3968       return false;
3969
3970     case CONST:
3971     case SYMBOL_REF:
3972     case LABEL_REF:
3973       /* load literal: pc-relative constant pool entry.  Only supported
3974          for SI mode or larger.  */
3975       info->type = ADDRESS_SYMBOLIC;
3976
3977       if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3978         {
3979           rtx sym, addend;
3980
3981           split_const (x, &sym, &addend);
3982           return ((GET_CODE (sym) == LABEL_REF
3983                    || (GET_CODE (sym) == SYMBOL_REF
3984                        && CONSTANT_POOL_ADDRESS_P (sym)
3985                        && !aarch64_nopcrelative_literal_loads)));
3986         }
3987       return false;
3988
3989     case LO_SUM:
3990       info->type = ADDRESS_LO_SUM;
3991       info->base = XEXP (x, 0);
3992       info->offset = XEXP (x, 1);
3993       if (allow_reg_index_p
3994           && aarch64_base_register_rtx_p (info->base, strict_p))
3995         {
3996           rtx sym, offs;
3997           split_const (info->offset, &sym, &offs);
3998           if (GET_CODE (sym) == SYMBOL_REF
3999               && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4000             {
4001               /* The symbol and offset must be aligned to the access size.  */
4002               unsigned int align;
4003               unsigned int ref_size;
4004
4005               if (CONSTANT_POOL_ADDRESS_P (sym))
4006                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4007               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4008                 {
4009                   tree exp = SYMBOL_REF_DECL (sym);
4010                   align = TYPE_ALIGN (TREE_TYPE (exp));
4011                   align = CONSTANT_ALIGNMENT (exp, align);
4012                 }
4013               else if (SYMBOL_REF_DECL (sym))
4014                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4015               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4016                        && SYMBOL_REF_BLOCK (sym) != NULL)
4017                 align = SYMBOL_REF_BLOCK (sym)->alignment;
4018               else
4019                 align = BITS_PER_UNIT;
4020
4021               ref_size = GET_MODE_SIZE (mode);
4022               if (ref_size == 0)
4023                 ref_size = GET_MODE_SIZE (DImode);
4024
4025               return ((INTVAL (offs) & (ref_size - 1)) == 0
4026                       && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4027             }
4028         }
4029       return false;
4030
4031     default:
4032       return false;
4033     }
4034 }
4035
4036 bool
4037 aarch64_symbolic_address_p (rtx x)
4038 {
4039   rtx offset;
4040
4041   split_const (x, &x, &offset);
4042   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4043 }
4044
4045 /* Classify the base of symbolic expression X.  */
4046
4047 enum aarch64_symbol_type
4048 aarch64_classify_symbolic_expression (rtx x)
4049 {
4050   rtx offset;
4051
4052   split_const (x, &x, &offset);
4053   return aarch64_classify_symbol (x, offset);
4054 }
4055
4056
4057 /* Return TRUE if X is a legitimate address for accessing memory in
4058    mode MODE.  */
4059 static bool
4060 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4061 {
4062   struct aarch64_address_info addr;
4063
4064   return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4065 }
4066
4067 /* Return TRUE if X is a legitimate address for accessing memory in
4068    mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
4069    pair operation.  */
4070 bool
4071 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4072                               RTX_CODE outer_code, bool strict_p)
4073 {
4074   struct aarch64_address_info addr;
4075
4076   return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4077 }
4078
4079 /* Return TRUE if rtx X is immediate constant 0.0 */
4080 bool
4081 aarch64_float_const_zero_rtx_p (rtx x)
4082 {
4083   if (GET_MODE (x) == VOIDmode)
4084     return false;
4085
4086   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4087     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4088   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4089 }
4090
4091 /* Return the fixed registers used for condition codes.  */
4092
4093 static bool
4094 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4095 {
4096   *p1 = CC_REGNUM;
4097   *p2 = INVALID_REGNUM;
4098   return true;
4099 }
4100
4101 /* Emit call insn with PAT and do aarch64-specific handling.  */
4102
4103 void
4104 aarch64_emit_call_insn (rtx pat)
4105 {
4106   rtx insn = emit_call_insn (pat);
4107
4108   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4109   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4110   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4111 }
4112
4113 machine_mode
4114 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4115 {
4116   /* All floating point compares return CCFP if it is an equality
4117      comparison, and CCFPE otherwise.  */
4118   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4119     {
4120       switch (code)
4121         {
4122         case EQ:
4123         case NE:
4124         case UNORDERED:
4125         case ORDERED:
4126         case UNLT:
4127         case UNLE:
4128         case UNGT:
4129         case UNGE:
4130         case UNEQ:
4131         case LTGT:
4132           return CCFPmode;
4133
4134         case LT:
4135         case LE:
4136         case GT:
4137         case GE:
4138           return CCFPEmode;
4139
4140         default:
4141           gcc_unreachable ();
4142         }
4143     }
4144
4145   /* Equality comparisons of short modes against zero can be performed
4146      using the TST instruction with the appropriate bitmask.  */
4147   if (y == const0_rtx && REG_P (x)
4148       && (code == EQ || code == NE)
4149       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4150     return CC_NZmode;
4151
4152   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4153       && y == const0_rtx
4154       && (code == EQ || code == NE || code == LT || code == GE)
4155       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4156           || GET_CODE (x) == NEG
4157           || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4158               && CONST_INT_P (XEXP (x, 2)))))
4159     return CC_NZmode;
4160
4161   /* A compare with a shifted operand.  Because of canonicalization,
4162      the comparison will have to be swapped when we emit the assembly
4163      code.  */
4164   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4165       && (REG_P (y) || GET_CODE (y) == SUBREG)
4166       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4167           || GET_CODE (x) == LSHIFTRT
4168           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4169     return CC_SWPmode;
4170
4171   /* Similarly for a negated operand, but we can only do this for
4172      equalities.  */
4173   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4174       && (REG_P (y) || GET_CODE (y) == SUBREG)
4175       && (code == EQ || code == NE)
4176       && GET_CODE (x) == NEG)
4177     return CC_Zmode;
4178
4179   /* A compare of a mode narrower than SI mode against zero can be done
4180      by extending the value in the comparison.  */
4181   if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
4182       && y == const0_rtx)
4183     /* Only use sign-extension if we really need it.  */
4184     return ((code == GT || code == GE || code == LE || code == LT)
4185             ? CC_SESWPmode : CC_ZESWPmode);
4186
4187   /* For everything else, return CCmode.  */
4188   return CCmode;
4189 }
4190
4191 static int
4192 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4193
4194 int
4195 aarch64_get_condition_code (rtx x)
4196 {
4197   machine_mode mode = GET_MODE (XEXP (x, 0));
4198   enum rtx_code comp_code = GET_CODE (x);
4199
4200   if (GET_MODE_CLASS (mode) != MODE_CC)
4201     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4202   return aarch64_get_condition_code_1 (mode, comp_code);
4203 }
4204
4205 static int
4206 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4207 {
4208   int ne = -1, eq = -1;
4209   switch (mode)
4210     {
4211     case CCFPmode:
4212     case CCFPEmode:
4213       switch (comp_code)
4214         {
4215         case GE: return AARCH64_GE;
4216         case GT: return AARCH64_GT;
4217         case LE: return AARCH64_LS;
4218         case LT: return AARCH64_MI;
4219         case NE: return AARCH64_NE;
4220         case EQ: return AARCH64_EQ;
4221         case ORDERED: return AARCH64_VC;
4222         case UNORDERED: return AARCH64_VS;
4223         case UNLT: return AARCH64_LT;
4224         case UNLE: return AARCH64_LE;
4225         case UNGT: return AARCH64_HI;
4226         case UNGE: return AARCH64_PL;
4227         default: return -1;
4228         }
4229       break;
4230
4231     case CC_DNEmode:
4232       ne = AARCH64_NE;
4233       eq = AARCH64_EQ;
4234       break;
4235
4236     case CC_DEQmode:
4237       ne = AARCH64_EQ;
4238       eq = AARCH64_NE;
4239       break;
4240
4241     case CC_DGEmode:
4242       ne = AARCH64_GE;
4243       eq = AARCH64_LT;
4244       break;
4245
4246     case CC_DLTmode:
4247       ne = AARCH64_LT;
4248       eq = AARCH64_GE;
4249       break;
4250
4251     case CC_DGTmode:
4252       ne = AARCH64_GT;
4253       eq = AARCH64_LE;
4254       break;
4255
4256     case CC_DLEmode:
4257       ne = AARCH64_LE;
4258       eq = AARCH64_GT;
4259       break;
4260
4261     case CC_DGEUmode:
4262       ne = AARCH64_CS;
4263       eq = AARCH64_CC;
4264       break;
4265
4266     case CC_DLTUmode:
4267       ne = AARCH64_CC;
4268       eq = AARCH64_CS;
4269       break;
4270
4271     case CC_DGTUmode:
4272       ne = AARCH64_HI;
4273       eq = AARCH64_LS;
4274       break;
4275
4276     case CC_DLEUmode:
4277       ne = AARCH64_LS;
4278       eq = AARCH64_HI;
4279       break;
4280
4281     case CCmode:
4282       switch (comp_code)
4283         {
4284         case NE: return AARCH64_NE;
4285         case EQ: return AARCH64_EQ;
4286         case GE: return AARCH64_GE;
4287         case GT: return AARCH64_GT;
4288         case LE: return AARCH64_LE;
4289         case LT: return AARCH64_LT;
4290         case GEU: return AARCH64_CS;
4291         case GTU: return AARCH64_HI;
4292         case LEU: return AARCH64_LS;
4293         case LTU: return AARCH64_CC;
4294         default: return -1;
4295         }
4296       break;
4297
4298     case CC_SWPmode:
4299     case CC_ZESWPmode:
4300     case CC_SESWPmode:
4301       switch (comp_code)
4302         {
4303         case NE: return AARCH64_NE;
4304         case EQ: return AARCH64_EQ;
4305         case GE: return AARCH64_LE;
4306         case GT: return AARCH64_LT;
4307         case LE: return AARCH64_GE;
4308         case LT: return AARCH64_GT;
4309         case GEU: return AARCH64_LS;
4310         case GTU: return AARCH64_CC;
4311         case LEU: return AARCH64_CS;
4312         case LTU: return AARCH64_HI;
4313         default: return -1;
4314         }
4315       break;
4316
4317     case CC_NZmode:
4318       switch (comp_code)
4319         {
4320         case NE: return AARCH64_NE;
4321         case EQ: return AARCH64_EQ;
4322         case GE: return AARCH64_PL;
4323         case LT: return AARCH64_MI;
4324         default: return -1;
4325         }
4326       break;
4327
4328     case CC_Zmode:
4329       switch (comp_code)
4330         {
4331         case NE: return AARCH64_NE;
4332         case EQ: return AARCH64_EQ;
4333         default: return -1;
4334         }
4335       break;
4336
4337     default:
4338       return -1;
4339       break;
4340     }
4341
4342   if (comp_code == NE)
4343     return ne;
4344
4345   if (comp_code == EQ)
4346     return eq;
4347
4348   return -1;
4349 }
4350
4351 bool
4352 aarch64_const_vec_all_same_in_range_p (rtx x,
4353                                   HOST_WIDE_INT minval,
4354                                   HOST_WIDE_INT maxval)
4355 {
4356   HOST_WIDE_INT firstval;
4357   int count, i;
4358
4359   if (GET_CODE (x) != CONST_VECTOR
4360       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4361     return false;
4362
4363   firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4364   if (firstval < minval || firstval > maxval)
4365     return false;
4366
4367   count = CONST_VECTOR_NUNITS (x);
4368   for (i = 1; i < count; i++)
4369     if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4370       return false;
4371
4372   return true;
4373 }
4374
4375 bool
4376 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4377 {
4378   return aarch64_const_vec_all_same_in_range_p (x, val, val);
4379 }
4380
4381
4382 /* N Z C V.  */
4383 #define AARCH64_CC_V 1
4384 #define AARCH64_CC_C (1 << 1)
4385 #define AARCH64_CC_Z (1 << 2)
4386 #define AARCH64_CC_N (1 << 3)
4387
4388 /* N Z C V flags for ccmp.  The first code is for AND op and the other
4389    is for IOR op.  Indexed by AARCH64_COND_CODE.  */
4390 static const int aarch64_nzcv_codes[][2] =
4391 {
4392   {AARCH64_CC_Z, 0}, /* EQ, Z == 1.  */
4393   {0, AARCH64_CC_Z}, /* NE, Z == 0.  */
4394   {AARCH64_CC_C, 0}, /* CS, C == 1.  */
4395   {0, AARCH64_CC_C}, /* CC, C == 0.  */
4396   {AARCH64_CC_N, 0}, /* MI, N == 1.  */
4397   {0, AARCH64_CC_N}, /* PL, N == 0.  */
4398   {AARCH64_CC_V, 0}, /* VS, V == 1.  */
4399   {0, AARCH64_CC_V}, /* VC, V == 0.  */
4400   {AARCH64_CC_C, 0}, /* HI, C ==1 && Z == 0.  */
4401   {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0).  */
4402   {0, AARCH64_CC_V}, /* GE, N == V.  */
4403   {AARCH64_CC_V, 0}, /* LT, N != V.  */
4404   {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V.  */
4405   {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V).  */
4406   {0, 0}, /* AL, Any.  */
4407   {0, 0}, /* NV, Any.  */
4408 };
4409
4410 int
4411 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4412 {
4413   switch (mode)
4414     {
4415     case CC_DNEmode:
4416       return NE;
4417
4418     case CC_DEQmode:
4419       return EQ;
4420
4421     case CC_DLEmode:
4422       return LE;
4423
4424     case CC_DGTmode:
4425       return GT;
4426
4427     case CC_DLTmode:
4428       return LT;
4429
4430     case CC_DGEmode:
4431       return GE;
4432
4433     case CC_DLEUmode:
4434       return LEU;
4435
4436     case CC_DGTUmode:
4437       return GTU;
4438
4439     case CC_DLTUmode:
4440       return LTU;
4441
4442     case CC_DGEUmode:
4443       return GEU;
4444
4445     default:
4446       gcc_unreachable ();
4447     }
4448 }
4449
4450
4451 static void
4452 aarch64_print_operand (FILE *f, rtx x, int code)
4453 {
4454   switch (code)
4455     {
4456     /* An integer or symbol address without a preceding # sign.  */
4457     case 'c':
4458       switch (GET_CODE (x))
4459         {
4460         case CONST_INT:
4461           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4462           break;
4463
4464         case SYMBOL_REF:
4465           output_addr_const (f, x);
4466           break;
4467
4468         case CONST:
4469           if (GET_CODE (XEXP (x, 0)) == PLUS
4470               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4471             {
4472               output_addr_const (f, x);
4473               break;
4474             }
4475           /* Fall through.  */
4476
4477         default:
4478           output_operand_lossage ("Unsupported operand for code '%c'", code);
4479         }
4480       break;
4481
4482     case 'e':
4483       /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w.  */
4484       {
4485         int n;
4486
4487         if (!CONST_INT_P (x)
4488             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4489           {
4490             output_operand_lossage ("invalid operand for '%%%c'", code);
4491             return;
4492           }
4493
4494         switch (n)
4495           {
4496           case 3:
4497             fputc ('b', f);
4498             break;
4499           case 4:
4500             fputc ('h', f);
4501             break;
4502           case 5:
4503             fputc ('w', f);
4504             break;
4505           default:
4506             output_operand_lossage ("invalid operand for '%%%c'", code);
4507             return;
4508           }
4509       }
4510       break;
4511
4512     case 'p':
4513       {
4514         int n;
4515
4516         /* Print N such that 2^N == X.  */
4517         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4518           {
4519             output_operand_lossage ("invalid operand for '%%%c'", code);
4520             return;
4521           }
4522
4523         asm_fprintf (f, "%d", n);
4524       }
4525       break;
4526
4527     case 'P':
4528       /* Print the number of non-zero bits in X (a const_int).  */
4529       if (!CONST_INT_P (x))
4530         {
4531           output_operand_lossage ("invalid operand for '%%%c'", code);
4532           return;
4533         }
4534
4535       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4536       break;
4537
4538     case 'H':
4539       /* Print the higher numbered register of a pair (TImode) of regs.  */
4540       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4541         {
4542           output_operand_lossage ("invalid operand for '%%%c'", code);
4543           return;
4544         }
4545
4546       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4547       break;
4548
4549     case 'm':
4550       {
4551         int cond_code;
4552         /* Print a condition (eq, ne, etc).  */
4553
4554         /* CONST_TRUE_RTX means always -- that's the default.  */
4555         if (x == const_true_rtx)
4556           return;
4557
4558         if (!COMPARISON_P (x))
4559           {
4560             output_operand_lossage ("invalid operand for '%%%c'", code);
4561             return;
4562           }
4563
4564         cond_code = aarch64_get_condition_code (x);
4565         gcc_assert (cond_code >= 0);
4566         fputs (aarch64_condition_codes[cond_code], f);
4567       }
4568       break;
4569
4570     case 'M':
4571       {
4572         int cond_code;
4573         /* Print the inverse of a condition (eq <-> ne, etc).  */
4574
4575         /* CONST_TRUE_RTX means never -- that's the default.  */
4576         if (x == const_true_rtx)
4577           {
4578             fputs ("nv", f);
4579             return;
4580           }
4581
4582         if (!COMPARISON_P (x))
4583           {
4584             output_operand_lossage ("invalid operand for '%%%c'", code);
4585             return;
4586           }
4587         cond_code = aarch64_get_condition_code (x);
4588         gcc_assert (cond_code >= 0);
4589         fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4590                                        (cond_code)], f);
4591       }
4592       break;
4593
4594     case 'b':
4595     case 'h':
4596     case 's':
4597     case 'd':
4598     case 'q':
4599       /* Print a scalar FP/SIMD register name.  */
4600       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4601         {
4602           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4603           return;
4604         }
4605       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4606       break;
4607
4608     case 'S':
4609     case 'T':
4610     case 'U':
4611     case 'V':
4612       /* Print the first FP/SIMD register name in a list.  */
4613       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4614         {
4615           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4616           return;
4617         }
4618       asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4619       break;
4620
4621     case 'R':
4622       /* Print a scalar FP/SIMD register name + 1.  */
4623       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4624         {
4625           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4626           return;
4627         }
4628       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4629       break;
4630
4631     case 'X':
4632       /* Print bottom 16 bits of integer constant in hex.  */
4633       if (!CONST_INT_P (x))
4634         {
4635           output_operand_lossage ("invalid operand for '%%%c'", code);
4636           return;
4637         }
4638       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4639       break;
4640
4641     case 'w':
4642     case 'x':
4643       /* Print a general register name or the zero register (32-bit or
4644          64-bit).  */
4645       if (x == const0_rtx
4646           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4647         {
4648           asm_fprintf (f, "%czr", code);
4649           break;
4650         }
4651
4652       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4653         {
4654           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4655           break;
4656         }
4657
4658       if (REG_P (x) && REGNO (x) == SP_REGNUM)
4659         {
4660           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4661           break;
4662         }
4663
4664       /* Fall through */
4665
4666     case 0:
4667       /* Print a normal operand, if it's a general register, then we
4668          assume DImode.  */
4669       if (x == NULL)
4670         {
4671           output_operand_lossage ("missing operand");
4672           return;
4673         }
4674
4675       switch (GET_CODE (x))
4676         {
4677         case REG:
4678           asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4679           break;
4680
4681         case MEM:
4682           output_address (GET_MODE (x), XEXP (x, 0));
4683           break;
4684
4685         case CONST:
4686         case LABEL_REF:
4687         case SYMBOL_REF:
4688           output_addr_const (asm_out_file, x);
4689           break;
4690
4691         case CONST_INT:
4692           asm_fprintf (f, "%wd", INTVAL (x));
4693           break;
4694
4695         case CONST_VECTOR:
4696           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4697             {
4698               gcc_assert (
4699                   aarch64_const_vec_all_same_in_range_p (x,
4700                                                          HOST_WIDE_INT_MIN,
4701                                                          HOST_WIDE_INT_MAX));
4702               asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4703             }
4704           else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4705             {
4706               fputc ('0', f);
4707             }
4708           else
4709             gcc_unreachable ();
4710           break;
4711
4712         case CONST_DOUBLE:
4713           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4714              be getting CONST_DOUBLEs holding integers.  */
4715           gcc_assert (GET_MODE (x) != VOIDmode);
4716           if (aarch64_float_const_zero_rtx_p (x))
4717             {
4718               fputc ('0', f);
4719               break;
4720             }
4721           else if (aarch64_float_const_representable_p (x))
4722             {
4723 #define buf_size 20
4724               char float_buf[buf_size] = {'\0'};
4725               real_to_decimal_for_mode (float_buf,
4726                                         CONST_DOUBLE_REAL_VALUE (x),
4727                                         buf_size, buf_size,
4728                                         1, GET_MODE (x));
4729               asm_fprintf (asm_out_file, "%s", float_buf);
4730               break;
4731 #undef buf_size
4732             }
4733           output_operand_lossage ("invalid constant");
4734           return;
4735         default:
4736           output_operand_lossage ("invalid operand");
4737           return;
4738         }
4739       break;
4740
4741     case 'A':
4742       if (GET_CODE (x) == HIGH)
4743         x = XEXP (x, 0);
4744
4745       switch (aarch64_classify_symbolic_expression (x))
4746         {
4747         case SYMBOL_SMALL_GOT_4G:
4748           asm_fprintf (asm_out_file, ":got:");
4749           break;
4750
4751         case SYMBOL_SMALL_TLSGD:
4752           asm_fprintf (asm_out_file, ":tlsgd:");
4753           break;
4754
4755         case SYMBOL_SMALL_TLSDESC:
4756           asm_fprintf (asm_out_file, ":tlsdesc:");
4757           break;
4758
4759         case SYMBOL_SMALL_TLSIE:
4760           asm_fprintf (asm_out_file, ":gottprel:");
4761           break;
4762
4763         case SYMBOL_TLSLE24:
4764           asm_fprintf (asm_out_file, ":tprel:");
4765           break;
4766
4767         case SYMBOL_TINY_GOT:
4768           gcc_unreachable ();
4769           break;
4770
4771         default:
4772           break;
4773         }
4774       output_addr_const (asm_out_file, x);
4775       break;
4776
4777     case 'L':
4778       switch (aarch64_classify_symbolic_expression (x))
4779         {
4780         case SYMBOL_SMALL_GOT_4G:
4781           asm_fprintf (asm_out_file, ":lo12:");
4782           break;
4783
4784         case SYMBOL_SMALL_TLSGD:
4785           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4786           break;
4787
4788         case SYMBOL_SMALL_TLSDESC:
4789           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4790           break;
4791
4792         case SYMBOL_SMALL_TLSIE:
4793           asm_fprintf (asm_out_file, ":gottprel_lo12:");
4794           break;
4795
4796         case SYMBOL_TLSLE12:
4797           asm_fprintf (asm_out_file, ":tprel_lo12:");
4798           break;
4799
4800         case SYMBOL_TLSLE24:
4801           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4802           break;
4803
4804         case SYMBOL_TINY_GOT:
4805           asm_fprintf (asm_out_file, ":got:");
4806           break;
4807
4808         case SYMBOL_TINY_TLSIE:
4809           asm_fprintf (asm_out_file, ":gottprel:");
4810           break;
4811
4812         default:
4813           break;
4814         }
4815       output_addr_const (asm_out_file, x);
4816       break;
4817
4818     case 'G':
4819
4820       switch (aarch64_classify_symbolic_expression (x))
4821         {
4822         case SYMBOL_TLSLE24:
4823           asm_fprintf (asm_out_file, ":tprel_hi12:");
4824           break;
4825         default:
4826           break;
4827         }
4828       output_addr_const (asm_out_file, x);
4829       break;
4830
4831     case 'K':
4832       {
4833         int cond_code;
4834         /* Print nzcv.  */
4835
4836         if (!COMPARISON_P (x))
4837           {
4838             output_operand_lossage ("invalid operand for '%%%c'", code);
4839             return;
4840           }
4841
4842         cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4843         gcc_assert (cond_code >= 0);
4844         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4845       }
4846       break;
4847
4848     case 'k':
4849       {
4850         int cond_code;
4851         /* Print nzcv.  */
4852
4853         if (!COMPARISON_P (x))
4854           {
4855             output_operand_lossage ("invalid operand for '%%%c'", code);
4856             return;
4857           }
4858
4859         cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4860         gcc_assert (cond_code >= 0);
4861         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4862       }
4863       break;
4864
4865     default:
4866       output_operand_lossage ("invalid operand prefix '%%%c'", code);
4867       return;
4868     }
4869 }
4870
4871 static void
4872 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4873 {
4874   struct aarch64_address_info addr;
4875
4876   if (aarch64_classify_address (&addr, x, mode, MEM, true))
4877     switch (addr.type)
4878       {
4879       case ADDRESS_REG_IMM:
4880         if (addr.offset == const0_rtx)
4881           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4882         else
4883           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4884                        INTVAL (addr.offset));
4885         return;
4886
4887       case ADDRESS_REG_REG:
4888         if (addr.shift == 0)
4889           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4890                        reg_names [REGNO (addr.offset)]);
4891         else
4892           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4893                        reg_names [REGNO (addr.offset)], addr.shift);
4894         return;
4895
4896       case ADDRESS_REG_UXTW:
4897         if (addr.shift == 0)
4898           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4899                        REGNO (addr.offset) - R0_REGNUM);
4900         else
4901           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4902                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
4903         return;
4904
4905       case ADDRESS_REG_SXTW:
4906         if (addr.shift == 0)
4907           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4908                        REGNO (addr.offset) - R0_REGNUM);
4909         else
4910           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4911                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
4912         return;
4913
4914       case ADDRESS_REG_WB:
4915         switch (GET_CODE (x))
4916           {
4917           case PRE_INC:
4918             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4919                          GET_MODE_SIZE (mode));
4920             return;
4921           case POST_INC:
4922             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4923                          GET_MODE_SIZE (mode));
4924             return;
4925           case PRE_DEC:
4926             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4927                          GET_MODE_SIZE (mode));
4928             return;
4929           case POST_DEC:
4930             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4931                          GET_MODE_SIZE (mode));
4932             return;
4933           case PRE_MODIFY:
4934             asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4935                          INTVAL (addr.offset));
4936             return;
4937           case POST_MODIFY:
4938             asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4939                          INTVAL (addr.offset));
4940             return;
4941           default:
4942             break;
4943           }
4944         break;
4945
4946       case ADDRESS_LO_SUM:
4947         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4948         output_addr_const (f, addr.offset);
4949         asm_fprintf (f, "]");
4950         return;
4951
4952       case ADDRESS_SYMBOLIC:
4953         break;
4954       }
4955
4956   output_addr_const (f, x);
4957 }
4958
4959 bool
4960 aarch64_label_mentioned_p (rtx x)
4961 {
4962   const char *fmt;
4963   int i;
4964
4965   if (GET_CODE (x) == LABEL_REF)
4966     return true;
4967
4968   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4969      referencing instruction, but they are constant offsets, not
4970      symbols.  */
4971   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4972     return false;
4973
4974   fmt = GET_RTX_FORMAT (GET_CODE (x));
4975   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4976     {
4977       if (fmt[i] == 'E')
4978         {
4979           int j;
4980
4981           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4982             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4983               return 1;
4984         }
4985       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4986         return 1;
4987     }
4988
4989   return 0;
4990 }
4991
4992 /* Implement REGNO_REG_CLASS.  */
4993
4994 enum reg_class
4995 aarch64_regno_regclass (unsigned regno)
4996 {
4997   if (GP_REGNUM_P (regno))
4998     return GENERAL_REGS;
4999
5000   if (regno == SP_REGNUM)
5001     return STACK_REG;
5002
5003   if (regno == FRAME_POINTER_REGNUM
5004       || regno == ARG_POINTER_REGNUM)
5005     return POINTER_REGS;
5006
5007   if (FP_REGNUM_P (regno))
5008     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
5009
5010   return NO_REGS;
5011 }
5012
5013 static rtx
5014 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
5015 {
5016   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5017      where mask is selected by alignment and size of the offset.
5018      We try to pick as large a range for the offset as possible to
5019      maximize the chance of a CSE.  However, for aligned addresses
5020      we limit the range to 4k so that structures with different sized
5021      elements are likely to use the same base.  We need to be careful
5022      not to split a CONST for some forms of address expression, otherwise
5023      it will generate sub-optimal code.  */
5024
5025   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5026     {
5027       HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
5028       HOST_WIDE_INT base_offset;
5029
5030       if (GET_CODE (XEXP (x, 0)) == PLUS)
5031         {
5032           rtx op0 = XEXP (XEXP (x, 0), 0);
5033           rtx op1 = XEXP (XEXP (x, 0), 1);
5034
5035           /* Address expressions of the form Ra + Rb + CONST.
5036
5037              If CONST is within the range supported by the addressing
5038              mode "reg+offset", do not split CONST and use the
5039              sequence
5040                Rt = Ra + Rb;
5041                addr = Rt + CONST.  */
5042           if (REG_P (op0) && REG_P (op1))
5043             {
5044               machine_mode addr_mode = GET_MODE (x);
5045               rtx base = gen_reg_rtx (addr_mode);
5046               rtx addr = plus_constant (addr_mode, base, offset);
5047
5048               if (aarch64_legitimate_address_hook_p (mode, addr, false))
5049                 {
5050                   emit_insn (gen_adddi3 (base, op0, op1));
5051                   return addr;
5052                 }
5053             }
5054           /* Address expressions of the form Ra + Rb<<SCALE + CONST.
5055
5056              If Reg + Rb<<SCALE is a valid address expression, do not
5057              split CONST and use the sequence
5058                Rc = CONST;
5059                Rt = Ra + Rc;
5060                addr = Rt + Rb<<SCALE.
5061
5062              Here we split CONST out of memory referece because:
5063                a) We depend on GIMPLE optimizers to pick up common sub
5064                   expression involving the scaling operation.
5065                b) The index Rb is likely a loop iv, it's better to split
5066                   the CONST so that computation of new base Rt is a loop
5067                   invariant and can be moved out of loop.  This is more
5068                   important when the original base Ra is sfp related.  */
5069           else if (REG_P (op0) || REG_P (op1))
5070             {
5071               machine_mode addr_mode = GET_MODE (x);
5072               rtx base = gen_reg_rtx (addr_mode);
5073
5074               /* Switch to make sure that register is in op0.  */
5075               if (REG_P (op1))
5076                 std::swap (op0, op1);
5077
5078               rtx addr = gen_rtx_PLUS (addr_mode, op1, base);
5079
5080               if (aarch64_legitimate_address_hook_p (mode, addr, false))
5081                 {
5082                   base = force_operand (plus_constant (addr_mode,
5083                                                        op0, offset),
5084                                         NULL_RTX);
5085                   return gen_rtx_PLUS (addr_mode, op1, base);
5086                 }
5087             }
5088         }
5089
5090       /* Does it look like we'll need a load/store-pair operation?  */
5091       if (GET_MODE_SIZE (mode) > 16
5092           || mode == TImode)
5093         base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
5094                        & ~((128 * GET_MODE_SIZE (mode)) - 1));
5095       /* For offsets aren't a multiple of the access size, the limit is
5096          -256...255.  */
5097       else if (offset & (GET_MODE_SIZE (mode) - 1))
5098         base_offset = (offset + 0x100) & ~0x1ff;
5099       else
5100         base_offset = offset & ~0xfff;
5101
5102       if (base_offset == 0)
5103         return x;
5104
5105       offset -= base_offset;
5106       rtx base_reg = gen_reg_rtx (Pmode);
5107       rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
5108                            NULL_RTX);
5109       emit_move_insn (base_reg, val);
5110       x = plus_constant (Pmode, base_reg, offset);
5111     }
5112
5113   return x;
5114 }
5115
5116 /* Try a machine-dependent way of reloading an illegitimate address
5117    operand.  If we find one, push the reload and return the new rtx.  */
5118
5119 rtx
5120 aarch64_legitimize_reload_address (rtx *x_p,
5121                                    machine_mode mode,
5122                                    int opnum, int type,
5123                                    int ind_levels ATTRIBUTE_UNUSED)
5124 {
5125   rtx x = *x_p;
5126
5127   /* Do not allow mem (plus (reg, const)) if vector struct mode.  */
5128   if (aarch64_vect_struct_mode_p (mode)
5129       && GET_CODE (x) == PLUS
5130       && REG_P (XEXP (x, 0))
5131       && CONST_INT_P (XEXP (x, 1)))
5132     {
5133       rtx orig_rtx = x;
5134       x = copy_rtx (x);
5135       push_reload (orig_rtx, NULL_RTX, x_p, NULL,
5136                    BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
5137                    opnum, (enum reload_type) type);
5138       return x;
5139     }
5140
5141   /* We must recognize output that we have already generated ourselves.  */
5142   if (GET_CODE (x) == PLUS
5143       && GET_CODE (XEXP (x, 0)) == PLUS
5144       && REG_P (XEXP (XEXP (x, 0), 0))
5145       && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5146       && CONST_INT_P (XEXP (x, 1)))
5147     {
5148       push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
5149                    BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
5150                    opnum, (enum reload_type) type);
5151       return x;
5152     }
5153
5154   /* We wish to handle large displacements off a base register by splitting
5155      the addend across an add and the mem insn.  This can cut the number of
5156      extra insns needed from 3 to 1.  It is only useful for load/store of a
5157      single register with 12 bit offset field.  */
5158   if (GET_CODE (x) == PLUS
5159       && REG_P (XEXP (x, 0))
5160       && CONST_INT_P (XEXP (x, 1))
5161       && HARD_REGISTER_P (XEXP (x, 0))
5162       && mode != TImode
5163       && mode != TFmode
5164       && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
5165     {
5166       HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
5167       HOST_WIDE_INT low = val & 0xfff;
5168       HOST_WIDE_INT high = val - low;
5169       HOST_WIDE_INT offs;
5170       rtx cst;
5171       machine_mode xmode = GET_MODE (x);
5172
5173       /* In ILP32, xmode can be either DImode or SImode.  */
5174       gcc_assert (xmode == DImode || xmode == SImode);
5175
5176       /* Reload non-zero BLKmode offsets.  This is because we cannot ascertain
5177          BLKmode alignment.  */
5178       if (GET_MODE_SIZE (mode) == 0)
5179         return NULL_RTX;
5180
5181       offs = low % GET_MODE_SIZE (mode);
5182
5183       /* Align misaligned offset by adjusting high part to compensate.  */
5184       if (offs != 0)
5185         {
5186           if (aarch64_uimm12_shift (high + offs))
5187             {
5188               /* Align down.  */
5189               low = low - offs;
5190               high = high + offs;
5191             }
5192           else
5193             {
5194               /* Align up.  */
5195               offs = GET_MODE_SIZE (mode) - offs;
5196               low = low + offs;
5197               high = high + (low & 0x1000) - offs;
5198               low &= 0xfff;
5199             }
5200         }
5201
5202       /* Check for overflow.  */
5203       if (high + low != val)
5204         return NULL_RTX;
5205
5206       cst = GEN_INT (high);
5207       if (!aarch64_uimm12_shift (high))
5208         cst = force_const_mem (xmode, cst);
5209
5210       /* Reload high part into base reg, leaving the low part
5211          in the mem instruction.
5212          Note that replacing this gen_rtx_PLUS with plus_constant is
5213          wrong in this case because we rely on the
5214          (plus (plus reg c1) c2) structure being preserved so that
5215          XEXP (*p, 0) in push_reload below uses the correct term.  */
5216       x = gen_rtx_PLUS (xmode,
5217                         gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
5218                         GEN_INT (low));
5219
5220       push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
5221                    BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
5222                    opnum, (enum reload_type) type);
5223       return x;
5224     }
5225
5226   return NULL_RTX;
5227 }
5228
5229
5230 /* Return the reload icode required for a constant pool in mode.  */
5231 static enum insn_code
5232 aarch64_constant_pool_reload_icode (machine_mode mode)
5233 {
5234   switch (mode)
5235     {
5236     case SFmode:
5237       return CODE_FOR_aarch64_reload_movcpsfdi;
5238
5239     case DFmode:
5240       return CODE_FOR_aarch64_reload_movcpdfdi;
5241
5242     case TFmode:
5243       return CODE_FOR_aarch64_reload_movcptfdi;
5244
5245     case V8QImode:
5246       return CODE_FOR_aarch64_reload_movcpv8qidi;
5247
5248     case V16QImode:
5249       return CODE_FOR_aarch64_reload_movcpv16qidi;
5250
5251     case V4HImode:
5252       return CODE_FOR_aarch64_reload_movcpv4hidi;
5253
5254     case V8HImode:
5255       return CODE_FOR_aarch64_reload_movcpv8hidi;
5256
5257     case V2SImode:
5258       return CODE_FOR_aarch64_reload_movcpv2sidi;
5259
5260     case V4SImode:
5261       return CODE_FOR_aarch64_reload_movcpv4sidi;
5262
5263     case V2DImode:
5264       return CODE_FOR_aarch64_reload_movcpv2didi;
5265
5266     case V2DFmode:
5267       return CODE_FOR_aarch64_reload_movcpv2dfdi;
5268
5269     default:
5270       gcc_unreachable ();
5271     }
5272
5273   gcc_unreachable ();
5274 }
5275 static reg_class_t
5276 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5277                           reg_class_t rclass,
5278                           machine_mode mode,
5279                           secondary_reload_info *sri)
5280 {
5281
5282   /* If we have to disable direct literal pool loads and stores because the
5283      function is too big, then we need a scratch register.  */
5284   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5285       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5286           || targetm.vector_mode_supported_p (GET_MODE (x)))
5287       && aarch64_nopcrelative_literal_loads)
5288     {
5289       sri->icode = aarch64_constant_pool_reload_icode (mode);
5290       return NO_REGS;
5291     }
5292
5293   /* Without the TARGET_SIMD instructions we cannot move a Q register
5294      to a Q register directly.  We need a scratch.  */
5295   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5296       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5297       && reg_class_subset_p (rclass, FP_REGS))
5298     {
5299       if (mode == TFmode)
5300         sri->icode = CODE_FOR_aarch64_reload_movtf;
5301       else if (mode == TImode)
5302         sri->icode = CODE_FOR_aarch64_reload_movti;
5303       return NO_REGS;
5304     }
5305
5306   /* A TFmode or TImode memory access should be handled via an FP_REGS
5307      because AArch64 has richer addressing modes for LDR/STR instructions
5308      than LDP/STP instructions.  */
5309   if (TARGET_FLOAT && rclass == GENERAL_REGS
5310       && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5311     return FP_REGS;
5312
5313   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5314       return GENERAL_REGS;
5315
5316   return NO_REGS;
5317 }
5318
5319 static bool
5320 aarch64_can_eliminate (const int from, const int to)
5321 {
5322   /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5323      HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
5324
5325   if (frame_pointer_needed)
5326     {
5327       if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5328         return true;
5329       if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5330         return false;
5331       if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5332           && !cfun->calls_alloca)
5333         return true;
5334       if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5335         return true;
5336
5337       return false;
5338     }
5339   else
5340     {
5341       /* If we decided that we didn't need a leaf frame pointer but then used
5342          LR in the function, then we'll want a frame pointer after all, so
5343          prevent this elimination to ensure a frame pointer is used.  */
5344       if (to == STACK_POINTER_REGNUM
5345           && flag_omit_leaf_frame_pointer
5346           && df_regs_ever_live_p (LR_REGNUM))
5347         return false;
5348     }
5349
5350   return true;
5351 }
5352
5353 HOST_WIDE_INT
5354 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5355 {
5356   aarch64_layout_frame ();
5357
5358   if (to == HARD_FRAME_POINTER_REGNUM)
5359     {
5360       if (from == ARG_POINTER_REGNUM)
5361         return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
5362
5363       if (from == FRAME_POINTER_REGNUM)
5364         return (cfun->machine->frame.hard_fp_offset
5365                 - cfun->machine->frame.saved_varargs_size);
5366     }
5367
5368   if (to == STACK_POINTER_REGNUM)
5369     {
5370       if (from == FRAME_POINTER_REGNUM)
5371           return (cfun->machine->frame.frame_size
5372                   - cfun->machine->frame.saved_varargs_size);
5373     }
5374
5375   return cfun->machine->frame.frame_size;
5376 }
5377
5378 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
5379    previous frame.  */
5380
5381 rtx
5382 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5383 {
5384   if (count != 0)
5385     return const0_rtx;
5386   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5387 }
5388
5389
5390 static void
5391 aarch64_asm_trampoline_template (FILE *f)
5392 {
5393   if (TARGET_ILP32)
5394     {
5395       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5396       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5397     }
5398   else
5399     {
5400       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5401       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5402     }
5403   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5404   assemble_aligned_integer (4, const0_rtx);
5405   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5406   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5407 }
5408
5409 static void
5410 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5411 {
5412   rtx fnaddr, mem, a_tramp;
5413   const int tramp_code_sz = 16;
5414
5415   /* Don't need to copy the trailing D-words, we fill those in below.  */
5416   emit_block_move (m_tramp, assemble_trampoline_template (),
5417                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5418   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5419   fnaddr = XEXP (DECL_RTL (fndecl), 0);
5420   if (GET_MODE (fnaddr) != ptr_mode)
5421     fnaddr = convert_memory_address (ptr_mode, fnaddr);
5422   emit_move_insn (mem, fnaddr);
5423
5424   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5425   emit_move_insn (mem, chain_value);
5426
5427   /* XXX We should really define a "clear_cache" pattern and use
5428      gen_clear_cache().  */
5429   a_tramp = XEXP (m_tramp, 0);
5430   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5431                      LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5432                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5433                      ptr_mode);
5434 }
5435
5436 static unsigned char
5437 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5438 {
5439   switch (regclass)
5440     {
5441     case CALLER_SAVE_REGS:
5442     case POINTER_REGS:
5443     case GENERAL_REGS:
5444     case ALL_REGS:
5445     case FP_REGS:
5446     case FP_LO_REGS:
5447       return
5448         aarch64_vector_mode_p (mode)
5449           ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5450           : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5451     case STACK_REG:
5452       return 1;
5453
5454     case NO_REGS:
5455       return 0;
5456
5457     default:
5458       break;
5459     }
5460   gcc_unreachable ();
5461 }
5462
5463 static reg_class_t
5464 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5465 {
5466   if (regclass == POINTER_REGS)
5467     return GENERAL_REGS;
5468
5469   if (regclass == STACK_REG)
5470     {
5471       if (REG_P(x)
5472           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5473           return regclass;
5474
5475       return NO_REGS;
5476     }
5477
5478   /* If it's an integer immediate that MOVI can't handle, then
5479      FP_REGS is not an option, so we return NO_REGS instead.  */
5480   if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5481       && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5482     return NO_REGS;
5483
5484   /* Register eliminiation can result in a request for
5485      SP+constant->FP_REGS.  We cannot support such operations which
5486      use SP as source and an FP_REG as destination, so reject out
5487      right now.  */
5488   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5489     {
5490       rtx lhs = XEXP (x, 0);
5491
5492       /* Look through a possible SUBREG introduced by ILP32.  */
5493       if (GET_CODE (lhs) == SUBREG)
5494         lhs = SUBREG_REG (lhs);
5495
5496       gcc_assert (REG_P (lhs));
5497       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5498                                       POINTER_REGS));
5499       return NO_REGS;
5500     }
5501
5502   return regclass;
5503 }
5504
5505 void
5506 aarch64_asm_output_labelref (FILE* f, const char *name)
5507 {
5508   asm_fprintf (f, "%U%s", name);
5509 }
5510
5511 static void
5512 aarch64_elf_asm_constructor (rtx symbol, int priority)
5513 {
5514   if (priority == DEFAULT_INIT_PRIORITY)
5515     default_ctor_section_asm_out_constructor (symbol, priority);
5516   else
5517     {
5518       section *s;
5519       char buf[18];
5520       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5521       s = get_section (buf, SECTION_WRITE, NULL);
5522       switch_to_section (s);
5523       assemble_align (POINTER_SIZE);
5524       assemble_aligned_integer (POINTER_BYTES, symbol);
5525     }
5526 }
5527
5528 static void
5529 aarch64_elf_asm_destructor (rtx symbol, int priority)
5530 {
5531   if (priority == DEFAULT_INIT_PRIORITY)
5532     default_dtor_section_asm_out_destructor (symbol, priority);
5533   else
5534     {
5535       section *s;
5536       char buf[18];
5537       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5538       s = get_section (buf, SECTION_WRITE, NULL);
5539       switch_to_section (s);
5540       assemble_align (POINTER_SIZE);
5541       assemble_aligned_integer (POINTER_BYTES, symbol);
5542     }
5543 }
5544
5545 const char*
5546 aarch64_output_casesi (rtx *operands)
5547 {
5548   char buf[100];
5549   char label[100];
5550   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5551   int index;
5552   static const char *const patterns[4][2] =
5553   {
5554     {
5555       "ldrb\t%w3, [%0,%w1,uxtw]",
5556       "add\t%3, %4, %w3, sxtb #2"
5557     },
5558     {
5559       "ldrh\t%w3, [%0,%w1,uxtw #1]",
5560       "add\t%3, %4, %w3, sxth #2"
5561     },
5562     {
5563       "ldr\t%w3, [%0,%w1,uxtw #2]",
5564       "add\t%3, %4, %w3, sxtw #2"
5565     },
5566     /* We assume that DImode is only generated when not optimizing and
5567        that we don't really need 64-bit address offsets.  That would
5568        imply an object file with 8GB of code in a single function!  */
5569     {
5570       "ldr\t%w3, [%0,%w1,uxtw #2]",
5571       "add\t%3, %4, %w3, sxtw #2"
5572     }
5573   };
5574
5575   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5576
5577   index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5578
5579   gcc_assert (index >= 0 && index <= 3);
5580
5581   /* Need to implement table size reduction, by chaning the code below.  */
5582   output_asm_insn (patterns[index][0], operands);
5583   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5584   snprintf (buf, sizeof (buf),
5585             "adr\t%%4, %s", targetm.strip_name_encoding (label));
5586   output_asm_insn (buf, operands);
5587   output_asm_insn (patterns[index][1], operands);
5588   output_asm_insn ("br\t%3", operands);
5589   assemble_label (asm_out_file, label);
5590   return "";
5591 }
5592
5593
5594 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5595    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5596    operator.  */
5597
5598 int
5599 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5600 {
5601   if (shift >= 0 && shift <= 3)
5602     {
5603       int size;
5604       for (size = 8; size <= 32; size *= 2)
5605         {
5606           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5607           if (mask == bits << shift)
5608             return size;
5609         }
5610     }
5611   return 0;
5612 }
5613
5614 /* Constant pools are per function only when PC relative
5615    literal loads are true or we are in the large memory
5616    model.  */
5617
5618 static inline bool
5619 aarch64_can_use_per_function_literal_pools_p (void)
5620 {
5621   return (!aarch64_nopcrelative_literal_loads
5622           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5623 }
5624
5625 static bool
5626 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5627 {
5628   /* Fixme:: In an ideal world this would work similar
5629      to the logic in aarch64_select_rtx_section but this
5630      breaks bootstrap in gcc go.  For now we workaround
5631      this by returning false here.  */
5632   return false;
5633 }
5634
5635 /* Select appropriate section for constants depending
5636    on where we place literal pools.  */
5637
5638 static section *
5639 aarch64_select_rtx_section (machine_mode mode,
5640                             rtx x,
5641                             unsigned HOST_WIDE_INT align)
5642 {
5643   if (aarch64_can_use_per_function_literal_pools_p ())
5644     return function_section (current_function_decl);
5645
5646   return default_elf_select_rtx_section (mode, x, align);
5647 }
5648
5649 /* Costs.  */
5650
5651 /* Helper function for rtx cost calculation.  Strip a shift expression
5652    from X.  Returns the inner operand if successful, or the original
5653    expression on failure.  */
5654 static rtx
5655 aarch64_strip_shift (rtx x)
5656 {
5657   rtx op = x;
5658
5659   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5660      we can convert both to ROR during final output.  */
5661   if ((GET_CODE (op) == ASHIFT
5662        || GET_CODE (op) == ASHIFTRT
5663        || GET_CODE (op) == LSHIFTRT
5664        || GET_CODE (op) == ROTATERT
5665        || GET_CODE (op) == ROTATE)
5666       && CONST_INT_P (XEXP (op, 1)))
5667     return XEXP (op, 0);
5668
5669   if (GET_CODE (op) == MULT
5670       && CONST_INT_P (XEXP (op, 1))
5671       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5672     return XEXP (op, 0);
5673
5674   return x;
5675 }
5676
5677 /* Helper function for rtx cost calculation.  Strip an extend
5678    expression from X.  Returns the inner operand if successful, or the
5679    original expression on failure.  We deal with a number of possible
5680    canonicalization variations here.  */
5681 static rtx
5682 aarch64_strip_extend (rtx x)
5683 {
5684   rtx op = x;
5685
5686   /* Zero and sign extraction of a widened value.  */
5687   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5688       && XEXP (op, 2) == const0_rtx
5689       && GET_CODE (XEXP (op, 0)) == MULT
5690       && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5691                                          XEXP (op, 1)))
5692     return XEXP (XEXP (op, 0), 0);
5693
5694   /* It can also be represented (for zero-extend) as an AND with an
5695      immediate.  */
5696   if (GET_CODE (op) == AND
5697       && GET_CODE (XEXP (op, 0)) == MULT
5698       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5699       && CONST_INT_P (XEXP (op, 1))
5700       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5701                            INTVAL (XEXP (op, 1))) != 0)
5702     return XEXP (XEXP (op, 0), 0);
5703
5704   /* Now handle extended register, as this may also have an optional
5705      left shift by 1..4.  */
5706   if (GET_CODE (op) == ASHIFT
5707       && CONST_INT_P (XEXP (op, 1))
5708       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5709     op = XEXP (op, 0);
5710
5711   if (GET_CODE (op) == ZERO_EXTEND
5712       || GET_CODE (op) == SIGN_EXTEND)
5713     op = XEXP (op, 0);
5714
5715   if (op != x)
5716     return op;
5717
5718   return x;
5719 }
5720
5721 /* Return true iff CODE is a shift supported in combination
5722    with arithmetic instructions.  */
5723
5724 static bool
5725 aarch64_shift_p (enum rtx_code code)
5726 {
5727   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5728 }
5729
5730 /* Helper function for rtx cost calculation.  Calculate the cost of
5731    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5732    Return the calculated cost of the expression, recursing manually in to
5733    operands where needed.  */
5734
5735 static int
5736 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5737 {
5738   rtx op0, op1;
5739   const struct cpu_cost_table *extra_cost
5740     = aarch64_tune_params.insn_extra_cost;
5741   int cost = 0;
5742   bool compound_p = (outer == PLUS || outer == MINUS);
5743   machine_mode mode = GET_MODE (x);
5744
5745   gcc_checking_assert (code == MULT);
5746
5747   op0 = XEXP (x, 0);
5748   op1 = XEXP (x, 1);
5749
5750   if (VECTOR_MODE_P (mode))
5751     mode = GET_MODE_INNER (mode);
5752
5753   /* Integer multiply/fma.  */
5754   if (GET_MODE_CLASS (mode) == MODE_INT)
5755     {
5756       /* The multiply will be canonicalized as a shift, cost it as such.  */
5757       if (aarch64_shift_p (GET_CODE (x))
5758           || (CONST_INT_P (op1)
5759               && exact_log2 (INTVAL (op1)) > 0))
5760         {
5761           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5762                            || GET_CODE (op0) == SIGN_EXTEND;
5763           if (speed)
5764             {
5765               if (compound_p)
5766                 {
5767                   if (REG_P (op1))
5768                     /* ARITH + shift-by-register.  */
5769                     cost += extra_cost->alu.arith_shift_reg;
5770                   else if (is_extend)
5771                     /* ARITH + extended register.  We don't have a cost field
5772                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
5773                     cost += extra_cost->alu.extend_arith;
5774                   else
5775                     /* ARITH + shift-by-immediate.  */
5776                     cost += extra_cost->alu.arith_shift;
5777                 }
5778               else
5779                 /* LSL (immediate).  */
5780                 cost += extra_cost->alu.shift;
5781
5782             }
5783           /* Strip extends as we will have costed them in the case above.  */
5784           if (is_extend)
5785             op0 = aarch64_strip_extend (op0);
5786
5787           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5788
5789           return cost;
5790         }
5791
5792       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
5793          compound and let the below cases handle it.  After all, MNEG is a
5794          special-case alias of MSUB.  */
5795       if (GET_CODE (op0) == NEG)
5796         {
5797           op0 = XEXP (op0, 0);
5798           compound_p = true;
5799         }
5800
5801       /* Integer multiplies or FMAs have zero/sign extending variants.  */
5802       if ((GET_CODE (op0) == ZERO_EXTEND
5803            && GET_CODE (op1) == ZERO_EXTEND)
5804           || (GET_CODE (op0) == SIGN_EXTEND
5805               && GET_CODE (op1) == SIGN_EXTEND))
5806         {
5807           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5808           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5809
5810           if (speed)
5811             {
5812               if (compound_p)
5813                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
5814                 cost += extra_cost->mult[0].extend_add;
5815               else
5816                 /* MUL/SMULL/UMULL.  */
5817                 cost += extra_cost->mult[0].extend;
5818             }
5819
5820           return cost;
5821         }
5822
5823       /* This is either an integer multiply or a MADD.  In both cases
5824          we want to recurse and cost the operands.  */
5825       cost += rtx_cost (op0, mode, MULT, 0, speed);
5826       cost += rtx_cost (op1, mode, MULT, 1, speed);
5827
5828       if (speed)
5829         {
5830           if (compound_p)
5831             /* MADD/MSUB.  */
5832             cost += extra_cost->mult[mode == DImode].add;
5833           else
5834             /* MUL.  */
5835             cost += extra_cost->mult[mode == DImode].simple;
5836         }
5837
5838       return cost;
5839     }
5840   else
5841     {
5842       if (speed)
5843         {
5844           /* Floating-point FMA/FMUL can also support negations of the
5845              operands, unless the rounding mode is upward or downward in
5846              which case FNMUL is different than FMUL with operand negation.  */
5847           bool neg0 = GET_CODE (op0) == NEG;
5848           bool neg1 = GET_CODE (op1) == NEG;
5849           if (compound_p || !flag_rounding_math || (neg0 && neg1))
5850             {
5851               if (neg0)
5852                 op0 = XEXP (op0, 0);
5853               if (neg1)
5854                 op1 = XEXP (op1, 0);
5855             }
5856
5857           if (compound_p)
5858             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
5859             cost += extra_cost->fp[mode == DFmode].fma;
5860           else
5861             /* FMUL/FNMUL.  */
5862             cost += extra_cost->fp[mode == DFmode].mult;
5863         }
5864
5865       cost += rtx_cost (op0, mode, MULT, 0, speed);
5866       cost += rtx_cost (op1, mode, MULT, 1, speed);
5867       return cost;
5868     }
5869 }
5870
5871 static int
5872 aarch64_address_cost (rtx x,
5873                       machine_mode mode,
5874                       addr_space_t as ATTRIBUTE_UNUSED,
5875                       bool speed)
5876 {
5877   enum rtx_code c = GET_CODE (x);
5878   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5879   struct aarch64_address_info info;
5880   int cost = 0;
5881   info.shift = 0;
5882
5883   if (!aarch64_classify_address (&info, x, mode, c, false))
5884     {
5885       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5886         {
5887           /* This is a CONST or SYMBOL ref which will be split
5888              in a different way depending on the code model in use.
5889              Cost it through the generic infrastructure.  */
5890           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5891           /* Divide through by the cost of one instruction to
5892              bring it to the same units as the address costs.  */
5893           cost_symbol_ref /= COSTS_N_INSNS (1);
5894           /* The cost is then the cost of preparing the address,
5895              followed by an immediate (possibly 0) offset.  */
5896           return cost_symbol_ref + addr_cost->imm_offset;
5897         }
5898       else
5899         {
5900           /* This is most likely a jump table from a case
5901              statement.  */
5902           return addr_cost->register_offset;
5903         }
5904     }
5905
5906   switch (info.type)
5907     {
5908       case ADDRESS_LO_SUM:
5909       case ADDRESS_SYMBOLIC:
5910       case ADDRESS_REG_IMM:
5911         cost += addr_cost->imm_offset;
5912         break;
5913
5914       case ADDRESS_REG_WB:
5915         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5916           cost += addr_cost->pre_modify;
5917         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5918           cost += addr_cost->post_modify;
5919         else
5920           gcc_unreachable ();
5921
5922         break;
5923
5924       case ADDRESS_REG_REG:
5925         cost += addr_cost->register_offset;
5926         break;
5927
5928       case ADDRESS_REG_SXTW:
5929         cost += addr_cost->register_sextend;
5930         break;
5931
5932       case ADDRESS_REG_UXTW:
5933         cost += addr_cost->register_zextend;
5934         break;
5935
5936       default:
5937         gcc_unreachable ();
5938     }
5939
5940
5941   if (info.shift > 0)
5942     {
5943       /* For the sake of calculating the cost of the shifted register
5944          component, we can treat same sized modes in the same way.  */
5945       switch (GET_MODE_BITSIZE (mode))
5946         {
5947           case 16:
5948             cost += addr_cost->addr_scale_costs.hi;
5949             break;
5950
5951           case 32:
5952             cost += addr_cost->addr_scale_costs.si;
5953             break;
5954
5955           case 64:
5956             cost += addr_cost->addr_scale_costs.di;
5957             break;
5958
5959           /* We can't tell, or this is a 128-bit vector.  */
5960           default:
5961             cost += addr_cost->addr_scale_costs.ti;
5962             break;
5963         }
5964     }
5965
5966   return cost;
5967 }
5968
5969 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
5970    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
5971    to be taken.  */
5972
5973 int
5974 aarch64_branch_cost (bool speed_p, bool predictable_p)
5975 {
5976   /* When optimizing for speed, use the cost of unpredictable branches.  */
5977   const struct cpu_branch_cost *branch_costs =
5978     aarch64_tune_params.branch_costs;
5979
5980   if (!speed_p || predictable_p)
5981     return branch_costs->predictable;
5982   else
5983     return branch_costs->unpredictable;
5984 }
5985
5986 /* Return true if the RTX X in mode MODE is a zero or sign extract
5987    usable in an ADD or SUB (extended register) instruction.  */
5988 static bool
5989 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5990 {
5991   /* Catch add with a sign extract.
5992      This is add_<optab><mode>_multp2.  */
5993   if (GET_CODE (x) == SIGN_EXTRACT
5994       || GET_CODE (x) == ZERO_EXTRACT)
5995     {
5996       rtx op0 = XEXP (x, 0);
5997       rtx op1 = XEXP (x, 1);
5998       rtx op2 = XEXP (x, 2);
5999
6000       if (GET_CODE (op0) == MULT
6001           && CONST_INT_P (op1)
6002           && op2 == const0_rtx
6003           && CONST_INT_P (XEXP (op0, 1))
6004           && aarch64_is_extend_from_extract (mode,
6005                                              XEXP (op0, 1),
6006                                              op1))
6007         {
6008           return true;
6009         }
6010     }
6011   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6012      No shift.  */
6013   else if (GET_CODE (x) == SIGN_EXTEND
6014            || GET_CODE (x) == ZERO_EXTEND)
6015     return REG_P (XEXP (x, 0));
6016
6017   return false;
6018 }
6019
6020 static bool
6021 aarch64_frint_unspec_p (unsigned int u)
6022 {
6023   switch (u)
6024     {
6025       case UNSPEC_FRINTZ:
6026       case UNSPEC_FRINTP:
6027       case UNSPEC_FRINTM:
6028       case UNSPEC_FRINTA:
6029       case UNSPEC_FRINTN:
6030       case UNSPEC_FRINTX:
6031       case UNSPEC_FRINTI:
6032         return true;
6033
6034       default:
6035         return false;
6036     }
6037 }
6038
6039 /* Return true iff X is an rtx that will match an extr instruction
6040    i.e. as described in the *extr<mode>5_insn family of patterns.
6041    OP0 and OP1 will be set to the operands of the shifts involved
6042    on success and will be NULL_RTX otherwise.  */
6043
6044 static bool
6045 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6046 {
6047   rtx op0, op1;
6048   machine_mode mode = GET_MODE (x);
6049
6050   *res_op0 = NULL_RTX;
6051   *res_op1 = NULL_RTX;
6052
6053   if (GET_CODE (x) != IOR)
6054     return false;
6055
6056   op0 = XEXP (x, 0);
6057   op1 = XEXP (x, 1);
6058
6059   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6060       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6061     {
6062      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
6063       if (GET_CODE (op1) == ASHIFT)
6064         std::swap (op0, op1);
6065
6066       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6067         return false;
6068
6069       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6070       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6071
6072       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6073           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6074         {
6075           *res_op0 = XEXP (op0, 0);
6076           *res_op1 = XEXP (op1, 0);
6077           return true;
6078         }
6079     }
6080
6081   return false;
6082 }
6083
6084 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6085    storing it in *COST.  Result is true if the total cost of the operation
6086    has now been calculated.  */
6087 static bool
6088 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6089 {
6090   rtx inner;
6091   rtx comparator;
6092   enum rtx_code cmpcode;
6093
6094   if (COMPARISON_P (op0))
6095     {
6096       inner = XEXP (op0, 0);
6097       comparator = XEXP (op0, 1);
6098       cmpcode = GET_CODE (op0);
6099     }
6100   else
6101     {
6102       inner = op0;
6103       comparator = const0_rtx;
6104       cmpcode = NE;
6105     }
6106
6107   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6108     {
6109       /* Conditional branch.  */
6110       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6111         return true;
6112       else
6113         {
6114           if (cmpcode == NE || cmpcode == EQ)
6115             {
6116               if (comparator == const0_rtx)
6117                 {
6118                   /* TBZ/TBNZ/CBZ/CBNZ.  */
6119                   if (GET_CODE (inner) == ZERO_EXTRACT)
6120                     /* TBZ/TBNZ.  */
6121                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6122                                        ZERO_EXTRACT, 0, speed);
6123                   else
6124                     /* CBZ/CBNZ.  */
6125                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6126
6127                 return true;
6128               }
6129             }
6130           else if (cmpcode == LT || cmpcode == GE)
6131             {
6132               /* TBZ/TBNZ.  */
6133               if (comparator == const0_rtx)
6134                 return true;
6135             }
6136         }
6137     }
6138   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6139     {
6140       /* It's a conditional operation based on the status flags,
6141          so it must be some flavor of CSEL.  */
6142
6143       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
6144       if (GET_CODE (op1) == NEG
6145           || GET_CODE (op1) == NOT
6146           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6147         op1 = XEXP (op1, 0);
6148       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6149         {
6150           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
6151           op1 = XEXP (op1, 0);
6152           op2 = XEXP (op2, 0);
6153         }
6154
6155       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6156       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6157       return true;
6158     }
6159
6160   /* We don't know what this is, cost all operands.  */
6161   return false;
6162 }
6163
6164 /* Check whether X is a bitfield operation of the form shift + extend that
6165    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
6166    operand to which the bitfield operation is applied.  Otherwise return
6167    NULL_RTX.  */
6168
6169 static rtx
6170 aarch64_extend_bitfield_pattern_p (rtx x)
6171 {
6172   rtx_code outer_code = GET_CODE (x);
6173   machine_mode outer_mode = GET_MODE (x);
6174
6175   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6176       && outer_mode != SImode && outer_mode != DImode)
6177     return NULL_RTX;
6178
6179   rtx inner = XEXP (x, 0);
6180   rtx_code inner_code = GET_CODE (inner);
6181   machine_mode inner_mode = GET_MODE (inner);
6182   rtx op = NULL_RTX;
6183
6184   switch (inner_code)
6185     {
6186       case ASHIFT:
6187         if (CONST_INT_P (XEXP (inner, 1))
6188             && (inner_mode == QImode || inner_mode == HImode))
6189           op = XEXP (inner, 0);
6190         break;
6191       case LSHIFTRT:
6192         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6193             && (inner_mode == QImode || inner_mode == HImode))
6194           op = XEXP (inner, 0);
6195         break;
6196       case ASHIFTRT:
6197         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6198             && (inner_mode == QImode || inner_mode == HImode))
6199           op = XEXP (inner, 0);
6200         break;
6201       default:
6202         break;
6203     }
6204
6205   return op;
6206 }
6207
6208 /* Calculate the cost of calculating X, storing it in *COST.  Result
6209    is true if the total cost of the operation has now been calculated.  */
6210 static bool
6211 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6212                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6213 {
6214   rtx op0, op1, op2;
6215   const struct cpu_cost_table *extra_cost
6216     = aarch64_tune_params.insn_extra_cost;
6217   int code = GET_CODE (x);
6218
6219   /* By default, assume that everything has equivalent cost to the
6220      cheapest instruction.  Any additional costs are applied as a delta
6221      above this default.  */
6222   *cost = COSTS_N_INSNS (1);
6223
6224   switch (code)
6225     {
6226     case SET:
6227       /* The cost depends entirely on the operands to SET.  */
6228       *cost = 0;
6229       op0 = SET_DEST (x);
6230       op1 = SET_SRC (x);
6231
6232       switch (GET_CODE (op0))
6233         {
6234         case MEM:
6235           if (speed)
6236             {
6237               rtx address = XEXP (op0, 0);
6238               if (VECTOR_MODE_P (mode))
6239                 *cost += extra_cost->ldst.storev;
6240               else if (GET_MODE_CLASS (mode) == MODE_INT)
6241                 *cost += extra_cost->ldst.store;
6242               else if (mode == SFmode)
6243                 *cost += extra_cost->ldst.storef;
6244               else if (mode == DFmode)
6245                 *cost += extra_cost->ldst.stored;
6246
6247               *cost +=
6248                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6249                                                      0, speed));
6250             }
6251
6252           *cost += rtx_cost (op1, mode, SET, 1, speed);
6253           return true;
6254
6255         case SUBREG:
6256           if (! REG_P (SUBREG_REG (op0)))
6257             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6258
6259           /* Fall through.  */
6260         case REG:
6261           /* The cost is one per vector-register copied.  */
6262           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6263             {
6264               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6265                               / GET_MODE_SIZE (V4SImode);
6266               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6267             }
6268           /* const0_rtx is in general free, but we will use an
6269              instruction to set a register to 0.  */
6270           else if (REG_P (op1) || op1 == const0_rtx)
6271             {
6272               /* The cost is 1 per register copied.  */
6273               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6274                               / UNITS_PER_WORD;
6275               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6276             }
6277           else
6278             /* Cost is just the cost of the RHS of the set.  */
6279             *cost += rtx_cost (op1, mode, SET, 1, speed);
6280           return true;
6281
6282         case ZERO_EXTRACT:
6283         case SIGN_EXTRACT:
6284           /* Bit-field insertion.  Strip any redundant widening of
6285              the RHS to meet the width of the target.  */
6286           if (GET_CODE (op1) == SUBREG)
6287             op1 = SUBREG_REG (op1);
6288           if ((GET_CODE (op1) == ZERO_EXTEND
6289                || GET_CODE (op1) == SIGN_EXTEND)
6290               && CONST_INT_P (XEXP (op0, 1))
6291               && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6292                   >= INTVAL (XEXP (op0, 1))))
6293             op1 = XEXP (op1, 0);
6294
6295           if (CONST_INT_P (op1))
6296             {
6297               /* MOV immediate is assumed to always be cheap.  */
6298               *cost = COSTS_N_INSNS (1);
6299             }
6300           else
6301             {
6302               /* BFM.  */
6303               if (speed)
6304                 *cost += extra_cost->alu.bfi;
6305               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6306             }
6307
6308           return true;
6309
6310         default:
6311           /* We can't make sense of this, assume default cost.  */
6312           *cost = COSTS_N_INSNS (1);
6313           return false;
6314         }
6315       return false;
6316
6317     case CONST_INT:
6318       /* If an instruction can incorporate a constant within the
6319          instruction, the instruction's expression avoids calling
6320          rtx_cost() on the constant.  If rtx_cost() is called on a
6321          constant, then it is usually because the constant must be
6322          moved into a register by one or more instructions.
6323
6324          The exception is constant 0, which can be expressed
6325          as XZR/WZR and is therefore free.  The exception to this is
6326          if we have (set (reg) (const0_rtx)) in which case we must cost
6327          the move.  However, we can catch that when we cost the SET, so
6328          we don't need to consider that here.  */
6329       if (x == const0_rtx)
6330         *cost = 0;
6331       else
6332         {
6333           /* To an approximation, building any other constant is
6334              proportionally expensive to the number of instructions
6335              required to build that constant.  This is true whether we
6336              are compiling for SPEED or otherwise.  */
6337           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6338                                  (NULL_RTX, x, false, mode));
6339         }
6340       return true;
6341
6342     case CONST_DOUBLE:
6343       if (speed)
6344         {
6345           /* mov[df,sf]_aarch64.  */
6346           if (aarch64_float_const_representable_p (x))
6347             /* FMOV (scalar immediate).  */
6348             *cost += extra_cost->fp[mode == DFmode].fpconst;
6349           else if (!aarch64_float_const_zero_rtx_p (x))
6350             {
6351               /* This will be a load from memory.  */
6352               if (mode == DFmode)
6353                 *cost += extra_cost->ldst.loadd;
6354               else
6355                 *cost += extra_cost->ldst.loadf;
6356             }
6357           else
6358             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
6359                or MOV v0.s[0], wzr - neither of which are modeled by the
6360                cost tables.  Just use the default cost.  */
6361             {
6362             }
6363         }
6364
6365       return true;
6366
6367     case MEM:
6368       if (speed)
6369         {
6370           /* For loads we want the base cost of a load, plus an
6371              approximation for the additional cost of the addressing
6372              mode.  */
6373           rtx address = XEXP (x, 0);
6374           if (VECTOR_MODE_P (mode))
6375             *cost += extra_cost->ldst.loadv;
6376           else if (GET_MODE_CLASS (mode) == MODE_INT)
6377             *cost += extra_cost->ldst.load;
6378           else if (mode == SFmode)
6379             *cost += extra_cost->ldst.loadf;
6380           else if (mode == DFmode)
6381             *cost += extra_cost->ldst.loadd;
6382
6383           *cost +=
6384                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6385                                                      0, speed));
6386         }
6387
6388       return true;
6389
6390     case NEG:
6391       op0 = XEXP (x, 0);
6392
6393       if (VECTOR_MODE_P (mode))
6394         {
6395           if (speed)
6396             {
6397               /* FNEG.  */
6398               *cost += extra_cost->vect.alu;
6399             }
6400           return false;
6401         }
6402
6403       if (GET_MODE_CLASS (mode) == MODE_INT)
6404         {
6405           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6406               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6407             {
6408               /* CSETM.  */
6409               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6410               return true;
6411             }
6412
6413           /* Cost this as SUB wzr, X.  */
6414           op0 = CONST0_RTX (mode);
6415           op1 = XEXP (x, 0);
6416           goto cost_minus;
6417         }
6418
6419       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6420         {
6421           /* Support (neg(fma...)) as a single instruction only if
6422              sign of zeros is unimportant.  This matches the decision
6423              making in aarch64.md.  */
6424           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6425             {
6426               /* FNMADD.  */
6427               *cost = rtx_cost (op0, mode, NEG, 0, speed);
6428               return true;
6429             }
6430           if (GET_CODE (op0) == MULT)
6431             {
6432               /* FNMUL.  */
6433               *cost = rtx_cost (op0, mode, NEG, 0, speed);
6434               return true;
6435             }
6436           if (speed)
6437             /* FNEG.  */
6438             *cost += extra_cost->fp[mode == DFmode].neg;
6439           return false;
6440         }
6441
6442       return false;
6443
6444     case CLRSB:
6445     case CLZ:
6446       if (speed)
6447         {
6448           if (VECTOR_MODE_P (mode))
6449             *cost += extra_cost->vect.alu;
6450           else
6451             *cost += extra_cost->alu.clz;
6452         }
6453
6454       return false;
6455
6456     case COMPARE:
6457       op0 = XEXP (x, 0);
6458       op1 = XEXP (x, 1);
6459
6460       if (op1 == const0_rtx
6461           && GET_CODE (op0) == AND)
6462         {
6463           x = op0;
6464           mode = GET_MODE (op0);
6465           goto cost_logic;
6466         }
6467
6468       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6469         {
6470           /* TODO: A write to the CC flags possibly costs extra, this
6471              needs encoding in the cost tables.  */
6472
6473           /* CC_ZESWPmode supports zero extend for free.  */
6474           if (mode == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
6475             op0 = XEXP (op0, 0);
6476
6477           mode = GET_MODE (op0);
6478           /* ANDS.  */
6479           if (GET_CODE (op0) == AND)
6480             {
6481               x = op0;
6482               goto cost_logic;
6483             }
6484
6485           if (GET_CODE (op0) == PLUS)
6486             {
6487               /* ADDS (and CMN alias).  */
6488               x = op0;
6489               goto cost_plus;
6490             }
6491
6492           if (GET_CODE (op0) == MINUS)
6493             {
6494               /* SUBS.  */
6495               x = op0;
6496               goto cost_minus;
6497             }
6498
6499           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6500               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6501               && CONST_INT_P (XEXP (op0, 2)))
6502             {
6503               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6504                  Handle it here directly rather than going to cost_logic
6505                  since we know the immediate generated for the TST is valid
6506                  so we can avoid creating an intermediate rtx for it only
6507                  for costing purposes.  */
6508               if (speed)
6509                 *cost += extra_cost->alu.logical;
6510
6511               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6512                                  ZERO_EXTRACT, 0, speed);
6513               return true;
6514             }
6515
6516           if (GET_CODE (op1) == NEG)
6517             {
6518               /* CMN.  */
6519               if (speed)
6520                 *cost += extra_cost->alu.arith;
6521
6522               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6523               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6524               return true;
6525             }
6526
6527           /* CMP.
6528
6529              Compare can freely swap the order of operands, and
6530              canonicalization puts the more complex operation first.
6531              But the integer MINUS logic expects the shift/extend
6532              operation in op1.  */
6533           if (! (REG_P (op0)
6534                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6535           {
6536             op0 = XEXP (x, 1);
6537             op1 = XEXP (x, 0);
6538           }
6539           goto cost_minus;
6540         }
6541
6542       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6543         {
6544           /* FCMP.  */
6545           if (speed)
6546             *cost += extra_cost->fp[mode == DFmode].compare;
6547
6548           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6549             {
6550               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6551               /* FCMP supports constant 0.0 for no extra cost. */
6552               return true;
6553             }
6554           return false;
6555         }
6556
6557       if (VECTOR_MODE_P (mode))
6558         {
6559           /* Vector compare.  */
6560           if (speed)
6561             *cost += extra_cost->vect.alu;
6562
6563           if (aarch64_float_const_zero_rtx_p (op1))
6564             {
6565               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6566                  cost.  */
6567               return true;
6568             }
6569           return false;
6570         }
6571       return false;
6572
6573     case MINUS:
6574       {
6575         op0 = XEXP (x, 0);
6576         op1 = XEXP (x, 1);
6577
6578 cost_minus:
6579         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6580
6581         /* Detect valid immediates.  */
6582         if ((GET_MODE_CLASS (mode) == MODE_INT
6583              || (GET_MODE_CLASS (mode) == MODE_CC
6584                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6585             && CONST_INT_P (op1)
6586             && aarch64_uimm12_shift (INTVAL (op1)))
6587           {
6588             if (speed)
6589               /* SUB(S) (immediate).  */
6590               *cost += extra_cost->alu.arith;
6591             return true;
6592           }
6593
6594         /* Look for SUB (extended register).  */
6595         if (aarch64_rtx_arith_op_extract_p (op1, mode))
6596           {
6597             if (speed)
6598               *cost += extra_cost->alu.extend_arith;
6599
6600             op1 = aarch64_strip_extend (op1);
6601             *cost += rtx_cost (op1, VOIDmode,
6602                                (enum rtx_code) GET_CODE (op1), 0, speed);
6603             return true;
6604           }
6605
6606         rtx new_op1 = aarch64_strip_extend (op1);
6607
6608         /* Cost this as an FMA-alike operation.  */
6609         if ((GET_CODE (new_op1) == MULT
6610              || aarch64_shift_p (GET_CODE (new_op1)))
6611             && code != COMPARE)
6612           {
6613             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6614                                             (enum rtx_code) code,
6615                                             speed);
6616             return true;
6617           }
6618
6619         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6620
6621         if (speed)
6622           {
6623             if (VECTOR_MODE_P (mode))
6624               {
6625                 /* Vector SUB.  */
6626                 *cost += extra_cost->vect.alu;
6627               }
6628             else if (GET_MODE_CLASS (mode) == MODE_INT)
6629               {
6630                 /* SUB(S).  */
6631                 *cost += extra_cost->alu.arith;
6632               }
6633             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6634               {
6635                 /* FSUB.  */
6636                 *cost += extra_cost->fp[mode == DFmode].addsub;
6637               }
6638           }
6639         return true;
6640       }
6641
6642     case PLUS:
6643       {
6644         rtx new_op0;
6645
6646         op0 = XEXP (x, 0);
6647         op1 = XEXP (x, 1);
6648
6649 cost_plus:
6650         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6651             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6652           {
6653             /* CSINC.  */
6654             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6655             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6656             return true;
6657           }
6658
6659         if (GET_MODE_CLASS (mode) == MODE_INT
6660             && CONST_INT_P (op1)
6661             && aarch64_uimm12_shift (INTVAL (op1)))
6662           {
6663             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6664
6665             if (speed)
6666               /* ADD (immediate).  */
6667               *cost += extra_cost->alu.arith;
6668             return true;
6669           }
6670
6671         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6672
6673         /* Look for ADD (extended register).  */
6674         if (aarch64_rtx_arith_op_extract_p (op0, mode))
6675           {
6676             if (speed)
6677               *cost += extra_cost->alu.extend_arith;
6678
6679             op0 = aarch64_strip_extend (op0);
6680             *cost += rtx_cost (op0, VOIDmode,
6681                                (enum rtx_code) GET_CODE (op0), 0, speed);
6682             return true;
6683           }
6684
6685         /* Strip any extend, leave shifts behind as we will
6686            cost them through mult_cost.  */
6687         new_op0 = aarch64_strip_extend (op0);
6688
6689         if (GET_CODE (new_op0) == MULT
6690             || aarch64_shift_p (GET_CODE (new_op0)))
6691           {
6692             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6693                                             speed);
6694             return true;
6695           }
6696
6697         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6698
6699         if (speed)
6700           {
6701             if (VECTOR_MODE_P (mode))
6702               {
6703                 /* Vector ADD.  */
6704                 *cost += extra_cost->vect.alu;
6705               }
6706             else if (GET_MODE_CLASS (mode) == MODE_INT)
6707               {
6708                 /* ADD.  */
6709                 *cost += extra_cost->alu.arith;
6710               }
6711             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6712               {
6713                 /* FADD.  */
6714                 *cost += extra_cost->fp[mode == DFmode].addsub;
6715               }
6716           }
6717         return true;
6718       }
6719
6720     case BSWAP:
6721       *cost = COSTS_N_INSNS (1);
6722
6723       if (speed)
6724         {
6725           if (VECTOR_MODE_P (mode))
6726             *cost += extra_cost->vect.alu;
6727           else
6728             *cost += extra_cost->alu.rev;
6729         }
6730       return false;
6731
6732     case IOR:
6733       if (aarch_rev16_p (x))
6734         {
6735           *cost = COSTS_N_INSNS (1);
6736
6737           if (speed)
6738             {
6739               if (VECTOR_MODE_P (mode))
6740                 *cost += extra_cost->vect.alu;
6741               else
6742                 *cost += extra_cost->alu.rev;
6743             }
6744           return true;
6745         }
6746
6747       if (aarch64_extr_rtx_p (x, &op0, &op1))
6748         {
6749           *cost += rtx_cost (op0, mode, IOR, 0, speed);
6750           *cost += rtx_cost (op1, mode, IOR, 1, speed);
6751           if (speed)
6752             *cost += extra_cost->alu.shift;
6753
6754           return true;
6755         }
6756     /* Fall through.  */
6757     case XOR:
6758     case AND:
6759     cost_logic:
6760       op0 = XEXP (x, 0);
6761       op1 = XEXP (x, 1);
6762
6763       if (VECTOR_MODE_P (mode))
6764         {
6765           if (speed)
6766             *cost += extra_cost->vect.alu;
6767           return true;
6768         }
6769
6770       if (code == AND
6771           && GET_CODE (op0) == MULT
6772           && CONST_INT_P (XEXP (op0, 1))
6773           && CONST_INT_P (op1)
6774           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6775                                INTVAL (op1)) != 0)
6776         {
6777           /* This is a UBFM/SBFM.  */
6778           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6779           if (speed)
6780             *cost += extra_cost->alu.bfx;
6781           return true;
6782         }
6783
6784       if (GET_MODE_CLASS (mode) == MODE_INT)
6785         {
6786           /* We possibly get the immediate for free, this is not
6787              modelled.  */
6788           if (CONST_INT_P (op1)
6789               && aarch64_bitmask_imm (INTVAL (op1), mode))
6790             {
6791               *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6792
6793               if (speed)
6794                 *cost += extra_cost->alu.logical;
6795
6796               return true;
6797             }
6798           else
6799             {
6800               rtx new_op0 = op0;
6801
6802               /* Handle ORN, EON, or BIC.  */
6803               if (GET_CODE (op0) == NOT)
6804                 op0 = XEXP (op0, 0);
6805
6806               new_op0 = aarch64_strip_shift (op0);
6807
6808               /* If we had a shift on op0 then this is a logical-shift-
6809                  by-register/immediate operation.  Otherwise, this is just
6810                  a logical operation.  */
6811               if (speed)
6812                 {
6813                   if (new_op0 != op0)
6814                     {
6815                       /* Shift by immediate.  */
6816                       if (CONST_INT_P (XEXP (op0, 1)))
6817                         *cost += extra_cost->alu.log_shift;
6818                       else
6819                         *cost += extra_cost->alu.log_shift_reg;
6820                     }
6821                   else
6822                     *cost += extra_cost->alu.logical;
6823                 }
6824
6825               /* In both cases we want to cost both operands.  */
6826               *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6827               *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6828
6829               return true;
6830             }
6831         }
6832       return false;
6833
6834     case NOT:
6835       x = XEXP (x, 0);
6836       op0 = aarch64_strip_shift (x);
6837
6838       if (VECTOR_MODE_P (mode))
6839         {
6840           /* Vector NOT.  */
6841           *cost += extra_cost->vect.alu;
6842           return false;
6843         }
6844
6845       /* MVN-shifted-reg.  */
6846       if (op0 != x)
6847         {
6848           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6849
6850           if (speed)
6851             *cost += extra_cost->alu.log_shift;
6852
6853           return true;
6854         }
6855       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6856          Handle the second form here taking care that 'a' in the above can
6857          be a shift.  */
6858       else if (GET_CODE (op0) == XOR)
6859         {
6860           rtx newop0 = XEXP (op0, 0);
6861           rtx newop1 = XEXP (op0, 1);
6862           rtx op0_stripped = aarch64_strip_shift (newop0);
6863
6864           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6865           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6866
6867           if (speed)
6868             {
6869               if (op0_stripped != newop0)
6870                 *cost += extra_cost->alu.log_shift;
6871               else
6872                 *cost += extra_cost->alu.logical;
6873             }
6874
6875           return true;
6876         }
6877       /* MVN.  */
6878       if (speed)
6879         *cost += extra_cost->alu.logical;
6880
6881       return false;
6882
6883     case ZERO_EXTEND:
6884
6885       op0 = XEXP (x, 0);
6886       /* If a value is written in SI mode, then zero extended to DI
6887          mode, the operation will in general be free as a write to
6888          a 'w' register implicitly zeroes the upper bits of an 'x'
6889          register.  However, if this is
6890
6891            (set (reg) (zero_extend (reg)))
6892
6893          we must cost the explicit register move.  */
6894       if (mode == DImode
6895           && GET_MODE (op0) == SImode
6896           && outer == SET)
6897         {
6898           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6899
6900           if (!op_cost && speed)
6901             /* MOV.  */
6902             *cost += extra_cost->alu.extend;
6903           else
6904             /* Free, the cost is that of the SI mode operation.  */
6905             *cost = op_cost;
6906
6907           return true;
6908         }
6909       else if (MEM_P (op0))
6910         {
6911           /* All loads can zero extend to any size for free.  */
6912           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6913           return true;
6914         }
6915
6916       op0 = aarch64_extend_bitfield_pattern_p (x);
6917       if (op0)
6918         {
6919           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
6920           if (speed)
6921             *cost += extra_cost->alu.bfx;
6922           return true;
6923         }
6924
6925       if (speed)
6926         {
6927           if (VECTOR_MODE_P (mode))
6928             {
6929               /* UMOV.  */
6930               *cost += extra_cost->vect.alu;
6931             }
6932           else
6933             {
6934               /* UXTB/UXTH.  */
6935               *cost += extra_cost->alu.extend;
6936             }
6937         }
6938       return false;
6939
6940     case SIGN_EXTEND:
6941       if (MEM_P (XEXP (x, 0)))
6942         {
6943           /* LDRSH.  */
6944           if (speed)
6945             {
6946               rtx address = XEXP (XEXP (x, 0), 0);
6947               *cost += extra_cost->ldst.load_sign_extend;
6948
6949               *cost +=
6950                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6951                                                      0, speed));
6952             }
6953           return true;
6954         }
6955
6956       op0 = aarch64_extend_bitfield_pattern_p (x);
6957       if (op0)
6958         {
6959           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
6960           if (speed)
6961             *cost += extra_cost->alu.bfx;
6962           return true;
6963         }
6964
6965       if (speed)
6966         {
6967           if (VECTOR_MODE_P (mode))
6968             *cost += extra_cost->vect.alu;
6969           else
6970             *cost += extra_cost->alu.extend;
6971         }
6972       return false;
6973
6974     case ASHIFT:
6975       op0 = XEXP (x, 0);
6976       op1 = XEXP (x, 1);
6977
6978       if (CONST_INT_P (op1))
6979         {
6980           if (speed)
6981             {
6982               if (VECTOR_MODE_P (mode))
6983                 {
6984                   /* Vector shift (immediate).  */
6985                   *cost += extra_cost->vect.alu;
6986                 }
6987               else
6988                 {
6989                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
6990                      aliases.  */
6991                   *cost += extra_cost->alu.shift;
6992                 }
6993             }
6994
6995           /* We can incorporate zero/sign extend for free.  */
6996           if (GET_CODE (op0) == ZERO_EXTEND
6997               || GET_CODE (op0) == SIGN_EXTEND)
6998             op0 = XEXP (op0, 0);
6999
7000           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7001           return true;
7002         }
7003       else
7004         {
7005           if (speed)
7006             {
7007               if (VECTOR_MODE_P (mode))
7008                 {
7009                   /* Vector shift (register).  */
7010                   *cost += extra_cost->vect.alu;
7011                 }
7012               else
7013                 {
7014                   /* LSLV.  */
7015                   *cost += extra_cost->alu.shift_reg;
7016                 }
7017             }
7018           return false;  /* All arguments need to be in registers.  */
7019         }
7020
7021     case ROTATE:
7022     case ROTATERT:
7023     case LSHIFTRT:
7024     case ASHIFTRT:
7025       op0 = XEXP (x, 0);
7026       op1 = XEXP (x, 1);
7027
7028       if (CONST_INT_P (op1))
7029         {
7030           /* ASR (immediate) and friends.  */
7031           if (speed)
7032             {
7033               if (VECTOR_MODE_P (mode))
7034                 *cost += extra_cost->vect.alu;
7035               else
7036                 *cost += extra_cost->alu.shift;
7037             }
7038
7039           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7040           return true;
7041         }
7042       else
7043         {
7044
7045           /* ASR (register) and friends.  */
7046           if (speed)
7047             {
7048               if (VECTOR_MODE_P (mode))
7049                 *cost += extra_cost->vect.alu;
7050               else
7051                 *cost += extra_cost->alu.shift_reg;
7052             }
7053           return false;  /* All arguments need to be in registers.  */
7054         }
7055
7056     case SYMBOL_REF:
7057
7058       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7059           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7060         {
7061           /* LDR.  */
7062           if (speed)
7063             *cost += extra_cost->ldst.load;
7064         }
7065       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7066                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7067         {
7068           /* ADRP, followed by ADD.  */
7069           *cost += COSTS_N_INSNS (1);
7070           if (speed)
7071             *cost += 2 * extra_cost->alu.arith;
7072         }
7073       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7074                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7075         {
7076           /* ADR.  */
7077           if (speed)
7078             *cost += extra_cost->alu.arith;
7079         }
7080
7081       if (flag_pic)
7082         {
7083           /* One extra load instruction, after accessing the GOT.  */
7084           *cost += COSTS_N_INSNS (1);
7085           if (speed)
7086             *cost += extra_cost->ldst.load;
7087         }
7088       return true;
7089
7090     case HIGH:
7091     case LO_SUM:
7092       /* ADRP/ADD (immediate).  */
7093       if (speed)
7094         *cost += extra_cost->alu.arith;
7095       return true;
7096
7097     case ZERO_EXTRACT:
7098     case SIGN_EXTRACT:
7099       /* UBFX/SBFX.  */
7100       if (speed)
7101         {
7102           if (VECTOR_MODE_P (mode))
7103             *cost += extra_cost->vect.alu;
7104           else
7105             *cost += extra_cost->alu.bfx;
7106         }
7107
7108       /* We can trust that the immediates used will be correct (there
7109          are no by-register forms), so we need only cost op0.  */
7110       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7111       return true;
7112
7113     case MULT:
7114       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7115       /* aarch64_rtx_mult_cost always handles recursion to its
7116          operands.  */
7117       return true;
7118
7119     case MOD:
7120     /* We can expand signed mod by power of 2 using a NEGS, two parallel
7121        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
7122        an unconditional negate.  This case should only ever be reached through
7123        the set_smod_pow2_cheap check in expmed.c.  */
7124       if (CONST_INT_P (XEXP (x, 1))
7125           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7126           && (mode == SImode || mode == DImode))
7127         {
7128           /* We expand to 4 instructions.  Reset the baseline.  */
7129           *cost = COSTS_N_INSNS (4);
7130
7131           if (speed)
7132             *cost += 2 * extra_cost->alu.logical
7133                      + 2 * extra_cost->alu.arith;
7134
7135           return true;
7136         }
7137
7138     /* Fall-through.  */
7139     case UMOD:
7140       if (speed)
7141         {
7142           if (VECTOR_MODE_P (mode))
7143             *cost += extra_cost->vect.alu;
7144           else if (GET_MODE_CLASS (mode) == MODE_INT)
7145             *cost += (extra_cost->mult[mode == DImode].add
7146                       + extra_cost->mult[mode == DImode].idiv);
7147           else if (mode == DFmode)
7148             *cost += (extra_cost->fp[1].mult
7149                       + extra_cost->fp[1].div);
7150           else if (mode == SFmode)
7151             *cost += (extra_cost->fp[0].mult
7152                       + extra_cost->fp[0].div);
7153         }
7154       return false;  /* All arguments need to be in registers.  */
7155
7156     case DIV:
7157     case UDIV:
7158     case SQRT:
7159       if (speed)
7160         {
7161           if (VECTOR_MODE_P (mode))
7162             *cost += extra_cost->vect.alu;
7163           else if (GET_MODE_CLASS (mode) == MODE_INT)
7164             /* There is no integer SQRT, so only DIV and UDIV can get
7165                here.  */
7166             *cost += extra_cost->mult[mode == DImode].idiv;
7167           else
7168             *cost += extra_cost->fp[mode == DFmode].div;
7169         }
7170       return false;  /* All arguments need to be in registers.  */
7171
7172     case IF_THEN_ELSE:
7173       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7174                                          XEXP (x, 2), cost, speed);
7175
7176     case EQ:
7177     case NE:
7178     case GT:
7179     case GTU:
7180     case LT:
7181     case LTU:
7182     case GE:
7183     case GEU:
7184     case LE:
7185     case LEU:
7186
7187       return false; /* All arguments must be in registers.  */
7188
7189     case FMA:
7190       op0 = XEXP (x, 0);
7191       op1 = XEXP (x, 1);
7192       op2 = XEXP (x, 2);
7193
7194       if (speed)
7195         {
7196           if (VECTOR_MODE_P (mode))
7197             *cost += extra_cost->vect.alu;
7198           else
7199             *cost += extra_cost->fp[mode == DFmode].fma;
7200         }
7201
7202       /* FMSUB, FNMADD, and FNMSUB are free.  */
7203       if (GET_CODE (op0) == NEG)
7204         op0 = XEXP (op0, 0);
7205
7206       if (GET_CODE (op2) == NEG)
7207         op2 = XEXP (op2, 0);
7208
7209       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7210          and the by-element operand as operand 0.  */
7211       if (GET_CODE (op1) == NEG)
7212         op1 = XEXP (op1, 0);
7213
7214       /* Catch vector-by-element operations.  The by-element operand can
7215          either be (vec_duplicate (vec_select (x))) or just
7216          (vec_select (x)), depending on whether we are multiplying by
7217          a vector or a scalar.
7218
7219          Canonicalization is not very good in these cases, FMA4 will put the
7220          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
7221       if (GET_CODE (op0) == VEC_DUPLICATE)
7222         op0 = XEXP (op0, 0);
7223       else if (GET_CODE (op1) == VEC_DUPLICATE)
7224         op1 = XEXP (op1, 0);
7225
7226       if (GET_CODE (op0) == VEC_SELECT)
7227         op0 = XEXP (op0, 0);
7228       else if (GET_CODE (op1) == VEC_SELECT)
7229         op1 = XEXP (op1, 0);
7230
7231       /* If the remaining parameters are not registers,
7232          get the cost to put them into registers.  */
7233       *cost += rtx_cost (op0, mode, FMA, 0, speed);
7234       *cost += rtx_cost (op1, mode, FMA, 1, speed);
7235       *cost += rtx_cost (op2, mode, FMA, 2, speed);
7236       return true;
7237
7238     case FLOAT:
7239     case UNSIGNED_FLOAT:
7240       if (speed)
7241         *cost += extra_cost->fp[mode == DFmode].fromint;
7242       return false;
7243
7244     case FLOAT_EXTEND:
7245       if (speed)
7246         {
7247           if (VECTOR_MODE_P (mode))
7248             {
7249               /*Vector truncate.  */
7250               *cost += extra_cost->vect.alu;
7251             }
7252           else
7253             *cost += extra_cost->fp[mode == DFmode].widen;
7254         }
7255       return false;
7256
7257     case FLOAT_TRUNCATE:
7258       if (speed)
7259         {
7260           if (VECTOR_MODE_P (mode))
7261             {
7262               /*Vector conversion.  */
7263               *cost += extra_cost->vect.alu;
7264             }
7265           else
7266             *cost += extra_cost->fp[mode == DFmode].narrow;
7267         }
7268       return false;
7269
7270     case FIX:
7271     case UNSIGNED_FIX:
7272       x = XEXP (x, 0);
7273       /* Strip the rounding part.  They will all be implemented
7274          by the fcvt* family of instructions anyway.  */
7275       if (GET_CODE (x) == UNSPEC)
7276         {
7277           unsigned int uns_code = XINT (x, 1);
7278
7279           if (uns_code == UNSPEC_FRINTA
7280               || uns_code == UNSPEC_FRINTM
7281               || uns_code == UNSPEC_FRINTN
7282               || uns_code == UNSPEC_FRINTP
7283               || uns_code == UNSPEC_FRINTZ)
7284             x = XVECEXP (x, 0, 0);
7285         }
7286
7287       if (speed)
7288         {
7289           if (VECTOR_MODE_P (mode))
7290             *cost += extra_cost->vect.alu;
7291           else
7292             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7293         }
7294
7295       /* We can combine fmul by a power of 2 followed by a fcvt into a single
7296          fixed-point fcvt.  */
7297       if (GET_CODE (x) == MULT
7298           && ((VECTOR_MODE_P (mode)
7299                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7300               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7301         {
7302           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7303                              0, speed);
7304           return true;
7305         }
7306
7307       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7308       return true;
7309
7310     case ABS:
7311       if (VECTOR_MODE_P (mode))
7312         {
7313           /* ABS (vector).  */
7314           if (speed)
7315             *cost += extra_cost->vect.alu;
7316         }
7317       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7318         {
7319           op0 = XEXP (x, 0);
7320
7321           /* FABD, which is analogous to FADD.  */
7322           if (GET_CODE (op0) == MINUS)
7323             {
7324               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7325               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7326               if (speed)
7327                 *cost += extra_cost->fp[mode == DFmode].addsub;
7328
7329               return true;
7330             }
7331           /* Simple FABS is analogous to FNEG.  */
7332           if (speed)
7333             *cost += extra_cost->fp[mode == DFmode].neg;
7334         }
7335       else
7336         {
7337           /* Integer ABS will either be split to
7338              two arithmetic instructions, or will be an ABS
7339              (scalar), which we don't model.  */
7340           *cost = COSTS_N_INSNS (2);
7341           if (speed)
7342             *cost += 2 * extra_cost->alu.arith;
7343         }
7344       return false;
7345
7346     case SMAX:
7347     case SMIN:
7348       if (speed)
7349         {
7350           if (VECTOR_MODE_P (mode))
7351             *cost += extra_cost->vect.alu;
7352           else
7353             {
7354               /* FMAXNM/FMINNM/FMAX/FMIN.
7355                  TODO: This may not be accurate for all implementations, but
7356                  we do not model this in the cost tables.  */
7357               *cost += extra_cost->fp[mode == DFmode].addsub;
7358             }
7359         }
7360       return false;
7361
7362     case UNSPEC:
7363       /* The floating point round to integer frint* instructions.  */
7364       if (aarch64_frint_unspec_p (XINT (x, 1)))
7365         {
7366           if (speed)
7367             *cost += extra_cost->fp[mode == DFmode].roundint;
7368
7369           return false;
7370         }
7371
7372       if (XINT (x, 1) == UNSPEC_RBIT)
7373         {
7374           if (speed)
7375             *cost += extra_cost->alu.rev;
7376
7377           return false;
7378         }
7379       break;
7380
7381     case TRUNCATE:
7382
7383       /* Decompose <su>muldi3_highpart.  */
7384       if (/* (truncate:DI  */
7385           mode == DImode
7386           /*   (lshiftrt:TI  */
7387           && GET_MODE (XEXP (x, 0)) == TImode
7388           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7389           /*      (mult:TI  */
7390           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7391           /*        (ANY_EXTEND:TI (reg:DI))
7392                     (ANY_EXTEND:TI (reg:DI)))  */
7393           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7394                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7395               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7396                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7397           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7398           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7399           /*     (const_int 64)  */
7400           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7401           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7402         {
7403           /* UMULH/SMULH.  */
7404           if (speed)
7405             *cost += extra_cost->mult[mode == DImode].extend;
7406           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7407                              mode, MULT, 0, speed);
7408           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7409                              mode, MULT, 1, speed);
7410           return true;
7411         }
7412
7413       /* Fall through.  */
7414     default:
7415       break;
7416     }
7417
7418   if (dump_file && (dump_flags & TDF_DETAILS))
7419     fprintf (dump_file,
7420       "\nFailed to cost RTX.  Assuming default cost.\n");
7421
7422   return true;
7423 }
7424
7425 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7426    calculated for X.  This cost is stored in *COST.  Returns true
7427    if the total cost of X was calculated.  */
7428 static bool
7429 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7430                    int param, int *cost, bool speed)
7431 {
7432   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7433
7434   if (dump_file && (dump_flags & TDF_DETAILS))
7435     {
7436       print_rtl_single (dump_file, x);
7437       fprintf (dump_file, "\n%s cost: %d (%s)\n",
7438                speed ? "Hot" : "Cold",
7439                *cost, result ? "final" : "partial");
7440     }
7441
7442   return result;
7443 }
7444
7445 static int
7446 aarch64_register_move_cost (machine_mode mode,
7447                             reg_class_t from_i, reg_class_t to_i)
7448 {
7449   enum reg_class from = (enum reg_class) from_i;
7450   enum reg_class to = (enum reg_class) to_i;
7451   const struct cpu_regmove_cost *regmove_cost
7452     = aarch64_tune_params.regmove_cost;
7453
7454   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
7455   if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7456     to = GENERAL_REGS;
7457
7458   if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7459     from = GENERAL_REGS;
7460
7461   /* Moving between GPR and stack cost is the same as GP2GP.  */
7462   if ((from == GENERAL_REGS && to == STACK_REG)
7463       || (to == GENERAL_REGS && from == STACK_REG))
7464     return regmove_cost->GP2GP;
7465
7466   /* To/From the stack register, we move via the gprs.  */
7467   if (to == STACK_REG || from == STACK_REG)
7468     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7469             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7470
7471   if (GET_MODE_SIZE (mode) == 16)
7472     {
7473       /* 128-bit operations on general registers require 2 instructions.  */
7474       if (from == GENERAL_REGS && to == GENERAL_REGS)
7475         return regmove_cost->GP2GP * 2;
7476       else if (from == GENERAL_REGS)
7477         return regmove_cost->GP2FP * 2;
7478       else if (to == GENERAL_REGS)
7479         return regmove_cost->FP2GP * 2;
7480
7481       /* When AdvSIMD instructions are disabled it is not possible to move
7482          a 128-bit value directly between Q registers.  This is handled in
7483          secondary reload.  A general register is used as a scratch to move
7484          the upper DI value and the lower DI value is moved directly,
7485          hence the cost is the sum of three moves. */
7486       if (! TARGET_SIMD)
7487         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7488
7489       return regmove_cost->FP2FP;
7490     }
7491
7492   if (from == GENERAL_REGS && to == GENERAL_REGS)
7493     return regmove_cost->GP2GP;
7494   else if (from == GENERAL_REGS)
7495     return regmove_cost->GP2FP;
7496   else if (to == GENERAL_REGS)
7497     return regmove_cost->FP2GP;
7498
7499   return regmove_cost->FP2FP;
7500 }
7501
7502 static int
7503 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7504                           reg_class_t rclass ATTRIBUTE_UNUSED,
7505                           bool in ATTRIBUTE_UNUSED)
7506 {
7507   return aarch64_tune_params.memmov_cost;
7508 }
7509
7510 /* Return true if it is safe and beneficial to use the rsqrt optabs to
7511    optimize 1.0/sqrt.  */
7512
7513 static bool
7514 use_rsqrt_p (void)
7515 {
7516   return (!flag_trapping_math
7517           && flag_unsafe_math_optimizations
7518           && (aarch64_tune_params.extra_tuning_flags
7519               & AARCH64_EXTRA_TUNE_RECIP_SQRT));
7520 }
7521
7522 /* Function to decide when to use
7523    reciprocal square root builtins.  */
7524
7525 static tree
7526 aarch64_builtin_reciprocal (tree fndecl)
7527 {
7528   if (!use_rsqrt_p ())
7529     return NULL_TREE;
7530   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7531 }
7532
7533 typedef rtx (*rsqrte_type) (rtx, rtx);
7534
7535 /* Select reciprocal square root initial estimate
7536    insn depending on machine mode.  */
7537
7538 rsqrte_type
7539 get_rsqrte_type (machine_mode mode)
7540 {
7541   switch (mode)
7542   {
7543     case DFmode:   return gen_aarch64_rsqrte_df2;
7544     case SFmode:   return gen_aarch64_rsqrte_sf2;
7545     case V2DFmode: return gen_aarch64_rsqrte_v2df2;
7546     case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
7547     case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
7548     default: gcc_unreachable ();
7549   }
7550 }
7551
7552 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7553
7554 /* Select reciprocal square root Newton-Raphson step
7555    insn depending on machine mode.  */
7556
7557 rsqrts_type
7558 get_rsqrts_type (machine_mode mode)
7559 {
7560   switch (mode)
7561   {
7562     case DFmode:   return gen_aarch64_rsqrts_df3;
7563     case SFmode:   return gen_aarch64_rsqrts_sf3;
7564     case V2DFmode: return gen_aarch64_rsqrts_v2df3;
7565     case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
7566     case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
7567     default: gcc_unreachable ();
7568   }
7569 }
7570
7571 /* Emit instruction sequence to compute
7572    reciprocal square root.  Use two Newton-Raphson steps
7573    for single precision and three for double precision.  */
7574
7575 void
7576 aarch64_emit_swrsqrt (rtx dst, rtx src)
7577 {
7578   machine_mode mode = GET_MODE (src);
7579   gcc_assert (
7580     mode == SFmode || mode == V2SFmode || mode == V4SFmode
7581         || mode == DFmode || mode == V2DFmode);
7582
7583   rtx xsrc = gen_reg_rtx (mode);
7584   emit_move_insn (xsrc, src);
7585   rtx x0 = gen_reg_rtx (mode);
7586
7587   emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
7588
7589   bool double_mode = (mode == DFmode || mode == V2DFmode);
7590
7591   int iterations = double_mode ? 3 : 2;
7592
7593   if (flag_mrecip_low_precision_sqrt)
7594     iterations--;
7595
7596   for (int i = 0; i < iterations; ++i)
7597     {
7598       rtx x1 = gen_reg_rtx (mode);
7599       rtx x2 = gen_reg_rtx (mode);
7600       rtx x3 = gen_reg_rtx (mode);
7601       emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
7602
7603       emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
7604
7605       emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
7606       x0 = x1;
7607     }
7608
7609   emit_move_insn (dst, x0);
7610 }
7611
7612 /* Return the number of instructions that can be issued per cycle.  */
7613 static int
7614 aarch64_sched_issue_rate (void)
7615 {
7616   return aarch64_tune_params.issue_rate;
7617 }
7618
7619 static int
7620 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7621 {
7622   int issue_rate = aarch64_sched_issue_rate ();
7623
7624   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7625 }
7626
7627
7628 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7629    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
7630    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
7631
7632 static int
7633 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7634                                                     int ready_index)
7635 {
7636   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7637 }
7638
7639
7640 /* Vectorizer cost model target hooks.  */
7641
7642 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
7643 static int
7644 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7645                                     tree vectype,
7646                                     int misalign ATTRIBUTE_UNUSED)
7647 {
7648   unsigned elements;
7649
7650   switch (type_of_cost)
7651     {
7652       case scalar_stmt:
7653         return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7654
7655       case scalar_load:
7656         return aarch64_tune_params.vec_costs->scalar_load_cost;
7657
7658       case scalar_store:
7659         return aarch64_tune_params.vec_costs->scalar_store_cost;
7660
7661       case vector_stmt:
7662         return aarch64_tune_params.vec_costs->vec_stmt_cost;
7663
7664       case vector_load:
7665         return aarch64_tune_params.vec_costs->vec_align_load_cost;
7666
7667       case vector_store:
7668         return aarch64_tune_params.vec_costs->vec_store_cost;
7669
7670       case vec_to_scalar:
7671         return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7672
7673       case scalar_to_vec:
7674         return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7675
7676       case unaligned_load:
7677         return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7678
7679       case unaligned_store:
7680         return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7681
7682       case cond_branch_taken:
7683         return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7684
7685       case cond_branch_not_taken:
7686         return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7687
7688       case vec_perm:
7689       case vec_promote_demote:
7690         return aarch64_tune_params.vec_costs->vec_stmt_cost;
7691
7692       case vec_construct:
7693         elements = TYPE_VECTOR_SUBPARTS (vectype);
7694         return elements / 2 + 1;
7695
7696       default:
7697         gcc_unreachable ();
7698     }
7699 }
7700
7701 /* Implement targetm.vectorize.add_stmt_cost.  */
7702 static unsigned
7703 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7704                        struct _stmt_vec_info *stmt_info, int misalign,
7705                        enum vect_cost_model_location where)
7706 {
7707   unsigned *cost = (unsigned *) data;
7708   unsigned retval = 0;
7709
7710   if (flag_vect_cost_model)
7711     {
7712       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7713       int stmt_cost =
7714             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7715
7716       /* Statements in an inner loop relative to the loop being
7717          vectorized are weighted more heavily.  The value here is
7718          arbitrary and could potentially be improved with analysis.  */
7719       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7720         count *= 50; /*  FIXME  */
7721
7722       retval = (unsigned) (count * stmt_cost);
7723       cost[where] += retval;
7724     }
7725
7726   return retval;
7727 }
7728
7729 static void initialize_aarch64_code_model (struct gcc_options *);
7730
7731 /* Enum describing the various ways that the
7732    aarch64_parse_{arch,tune,cpu,extension} functions can fail.
7733    This way their callers can choose what kind of error to give.  */
7734
7735 enum aarch64_parse_opt_result
7736 {
7737   AARCH64_PARSE_OK,                     /* Parsing was successful.  */
7738   AARCH64_PARSE_MISSING_ARG,            /* Missing argument.  */
7739   AARCH64_PARSE_INVALID_FEATURE,        /* Invalid feature modifier.  */
7740   AARCH64_PARSE_INVALID_ARG             /* Invalid arch, tune, cpu arg.  */
7741 };
7742
7743 /* Parse the architecture extension string STR and update ISA_FLAGS
7744    with the architecture features turned on or off.  Return a
7745    aarch64_parse_opt_result describing the result.  */
7746
7747 static enum aarch64_parse_opt_result
7748 aarch64_parse_extension (char *str, unsigned long *isa_flags)
7749 {
7750   /* The extension string is parsed left to right.  */
7751   const struct aarch64_option_extension *opt = NULL;
7752
7753   /* Flag to say whether we are adding or removing an extension.  */
7754   int adding_ext = -1;
7755
7756   while (str != NULL && *str != 0)
7757     {
7758       char *ext;
7759       size_t len;
7760
7761       str++;
7762       ext = strchr (str, '+');
7763
7764       if (ext != NULL)
7765         len = ext - str;
7766       else
7767         len = strlen (str);
7768
7769       if (len >= 2 && strncmp (str, "no", 2) == 0)
7770         {
7771           adding_ext = 0;
7772           len -= 2;
7773           str += 2;
7774         }
7775       else if (len > 0)
7776         adding_ext = 1;
7777
7778       if (len == 0)
7779         return AARCH64_PARSE_MISSING_ARG;
7780
7781
7782       /* Scan over the extensions table trying to find an exact match.  */
7783       for (opt = all_extensions; opt->name != NULL; opt++)
7784         {
7785           if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
7786             {
7787               /* Add or remove the extension.  */
7788               if (adding_ext)
7789                 *isa_flags |= opt->flags_on;
7790               else
7791                 *isa_flags &= ~(opt->flags_off);
7792               break;
7793             }
7794         }
7795
7796       if (opt->name == NULL)
7797         {
7798           /* Extension not found in list.  */
7799           return AARCH64_PARSE_INVALID_FEATURE;
7800         }
7801
7802       str = ext;
7803     };
7804
7805   return AARCH64_PARSE_OK;
7806 }
7807
7808 /* Parse the TO_PARSE string and put the architecture struct that it
7809    selects into RES and the architectural features into ISA_FLAGS.
7810    Return an aarch64_parse_opt_result describing the parse result.
7811    If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */
7812
7813 static enum aarch64_parse_opt_result
7814 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7815                     unsigned long *isa_flags)
7816 {
7817   char *ext;
7818   const struct processor *arch;
7819   char *str = (char *) alloca (strlen (to_parse) + 1);
7820   size_t len;
7821
7822   strcpy (str, to_parse);
7823
7824   ext = strchr (str, '+');
7825
7826   if (ext != NULL)
7827     len = ext - str;
7828   else
7829     len = strlen (str);
7830
7831   if (len == 0)
7832     return AARCH64_PARSE_MISSING_ARG;
7833
7834
7835   /* Loop through the list of supported ARCHes to find a match.  */
7836   for (arch = all_architectures; arch->name != NULL; arch++)
7837     {
7838       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7839         {
7840           unsigned long isa_temp = arch->flags;
7841
7842           if (ext != NULL)
7843             {
7844               /* TO_PARSE string contains at least one extension.  */
7845               enum aarch64_parse_opt_result ext_res
7846                 = aarch64_parse_extension (ext, &isa_temp);
7847
7848               if (ext_res != AARCH64_PARSE_OK)
7849                 return ext_res;
7850             }
7851           /* Extension parsing was successful.  Confirm the result
7852              arch and ISA flags.  */
7853           *res = arch;
7854           *isa_flags = isa_temp;
7855           return AARCH64_PARSE_OK;
7856         }
7857     }
7858
7859   /* ARCH name not found in list.  */
7860   return AARCH64_PARSE_INVALID_ARG;
7861 }
7862
7863 /* Parse the TO_PARSE string and put the result tuning in RES and the
7864    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
7865    describing the parse result.  If there is an error parsing, RES and
7866    ISA_FLAGS are left unchanged.  */
7867
7868 static enum aarch64_parse_opt_result
7869 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7870                    unsigned long *isa_flags)
7871 {
7872   char *ext;
7873   const struct processor *cpu;
7874   char *str = (char *) alloca (strlen (to_parse) + 1);
7875   size_t len;
7876
7877   strcpy (str, to_parse);
7878
7879   ext = strchr (str, '+');
7880
7881   if (ext != NULL)
7882     len = ext - str;
7883   else
7884     len = strlen (str);
7885
7886   if (len == 0)
7887     return AARCH64_PARSE_MISSING_ARG;
7888
7889
7890   /* Loop through the list of supported CPUs to find a match.  */
7891   for (cpu = all_cores; cpu->name != NULL; cpu++)
7892     {
7893       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7894         {
7895           unsigned long isa_temp = cpu->flags;
7896
7897
7898           if (ext != NULL)
7899             {
7900               /* TO_PARSE string contains at least one extension.  */
7901               enum aarch64_parse_opt_result ext_res
7902                 = aarch64_parse_extension (ext, &isa_temp);
7903
7904               if (ext_res != AARCH64_PARSE_OK)
7905                 return ext_res;
7906             }
7907           /* Extension parsing was successfull.  Confirm the result
7908              cpu and ISA flags.  */
7909           *res = cpu;
7910           *isa_flags = isa_temp;
7911           return AARCH64_PARSE_OK;
7912         }
7913     }
7914
7915   /* CPU name not found in list.  */
7916   return AARCH64_PARSE_INVALID_ARG;
7917 }
7918
7919 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7920    Return an aarch64_parse_opt_result describing the parse result.
7921    If the parsing fails the RES does not change.  */
7922
7923 static enum aarch64_parse_opt_result
7924 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7925 {
7926   const struct processor *cpu;
7927   char *str = (char *) alloca (strlen (to_parse) + 1);
7928
7929   strcpy (str, to_parse);
7930
7931   /* Loop through the list of supported CPUs to find a match.  */
7932   for (cpu = all_cores; cpu->name != NULL; cpu++)
7933     {
7934       if (strcmp (cpu->name, str) == 0)
7935         {
7936           *res = cpu;
7937           return AARCH64_PARSE_OK;
7938         }
7939     }
7940
7941   /* CPU name not found in list.  */
7942   return AARCH64_PARSE_INVALID_ARG;
7943 }
7944
7945 /* Parse TOKEN, which has length LENGTH to see if it is an option
7946    described in FLAG.  If it is, return the index bit for that fusion type.
7947    If not, error (printing OPTION_NAME) and return zero.  */
7948
7949 static unsigned int
7950 aarch64_parse_one_option_token (const char *token,
7951                                 size_t length,
7952                                 const struct aarch64_flag_desc *flag,
7953                                 const char *option_name)
7954 {
7955   for (; flag->name != NULL; flag++)
7956     {
7957       if (length == strlen (flag->name)
7958           && !strncmp (flag->name, token, length))
7959         return flag->flag;
7960     }
7961
7962   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7963   return 0;
7964 }
7965
7966 /* Parse OPTION which is a comma-separated list of flags to enable.
7967    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7968    default state we inherit from the CPU tuning structures.  OPTION_NAME
7969    gives the top-level option we are parsing in the -moverride string,
7970    for use in error messages.  */
7971
7972 static unsigned int
7973 aarch64_parse_boolean_options (const char *option,
7974                                const struct aarch64_flag_desc *flags,
7975                                unsigned int initial_state,
7976                                const char *option_name)
7977 {
7978   const char separator = '.';
7979   const char* specs = option;
7980   const char* ntoken = option;
7981   unsigned int found_flags = initial_state;
7982
7983   while ((ntoken = strchr (specs, separator)))
7984     {
7985       size_t token_length = ntoken - specs;
7986       unsigned token_ops = aarch64_parse_one_option_token (specs,
7987                                                            token_length,
7988                                                            flags,
7989                                                            option_name);
7990       /* If we find "none" (or, for simplicity's sake, an error) anywhere
7991          in the token stream, reset the supported operations.  So:
7992
7993            adrp+add.cmp+branch.none.adrp+add
7994
7995            would have the result of turning on only adrp+add fusion.  */
7996       if (!token_ops)
7997         found_flags = 0;
7998
7999       found_flags |= token_ops;
8000       specs = ++ntoken;
8001     }
8002
8003   /* We ended with a comma, print something.  */
8004   if (!(*specs))
8005     {
8006       error ("%s string ill-formed\n", option_name);
8007       return 0;
8008     }
8009
8010   /* We still have one more token to parse.  */
8011   size_t token_length = strlen (specs);
8012   unsigned token_ops = aarch64_parse_one_option_token (specs,
8013                                                        token_length,
8014                                                        flags,
8015                                                        option_name);
8016    if (!token_ops)
8017      found_flags = 0;
8018
8019   found_flags |= token_ops;
8020   return found_flags;
8021 }
8022
8023 /* Support for overriding instruction fusion.  */
8024
8025 static void
8026 aarch64_parse_fuse_string (const char *fuse_string,
8027                             struct tune_params *tune)
8028 {
8029   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8030                                                      aarch64_fusible_pairs,
8031                                                      tune->fusible_ops,
8032                                                      "fuse=");
8033 }
8034
8035 /* Support for overriding other tuning flags.  */
8036
8037 static void
8038 aarch64_parse_tune_string (const char *tune_string,
8039                             struct tune_params *tune)
8040 {
8041   tune->extra_tuning_flags
8042     = aarch64_parse_boolean_options (tune_string,
8043                                      aarch64_tuning_flags,
8044                                      tune->extra_tuning_flags,
8045                                      "tune=");
8046 }
8047
8048 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8049    we understand.  If it is, extract the option string and handoff to
8050    the appropriate function.  */
8051
8052 void
8053 aarch64_parse_one_override_token (const char* token,
8054                                   size_t length,
8055                                   struct tune_params *tune)
8056 {
8057   const struct aarch64_tuning_override_function *fn
8058     = aarch64_tuning_override_functions;
8059
8060   const char *option_part = strchr (token, '=');
8061   if (!option_part)
8062     {
8063       error ("tuning string missing in option (%s)", token);
8064       return;
8065     }
8066
8067   /* Get the length of the option name.  */
8068   length = option_part - token;
8069   /* Skip the '=' to get to the option string.  */
8070   option_part++;
8071
8072   for (; fn->name != NULL; fn++)
8073     {
8074       if (!strncmp (fn->name, token, length))
8075         {
8076           fn->parse_override (option_part, tune);
8077           return;
8078         }
8079     }
8080
8081   error ("unknown tuning option (%s)",token);
8082   return;
8083 }
8084
8085 /* A checking mechanism for the implementation of the tls size.  */
8086
8087 static void
8088 initialize_aarch64_tls_size (struct gcc_options *opts)
8089 {
8090   if (aarch64_tls_size == 0)
8091     aarch64_tls_size = 24;
8092
8093   switch (opts->x_aarch64_cmodel_var)
8094     {
8095     case AARCH64_CMODEL_TINY:
8096       /* Both the default and maximum TLS size allowed under tiny is 1M which
8097          needs two instructions to address, so we clamp the size to 24.  */
8098       if (aarch64_tls_size > 24)
8099         aarch64_tls_size = 24;
8100       break;
8101     case AARCH64_CMODEL_SMALL:
8102       /* The maximum TLS size allowed under small is 4G.  */
8103       if (aarch64_tls_size > 32)
8104         aarch64_tls_size = 32;
8105       break;
8106     case AARCH64_CMODEL_LARGE:
8107       /* The maximum TLS size allowed under large is 16E.
8108          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
8109       if (aarch64_tls_size > 48)
8110         aarch64_tls_size = 48;
8111       break;
8112     default:
8113       gcc_unreachable ();
8114     }
8115
8116   return;
8117 }
8118
8119 /* Parse STRING looking for options in the format:
8120      string     :: option:string
8121      option     :: name=substring
8122      name       :: {a-z}
8123      substring  :: defined by option.  */
8124
8125 static void
8126 aarch64_parse_override_string (const char* input_string,
8127                                struct tune_params* tune)
8128 {
8129   const char separator = ':';
8130   size_t string_length = strlen (input_string) + 1;
8131   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8132   char *string = string_root;
8133   strncpy (string, input_string, string_length);
8134   string[string_length - 1] = '\0';
8135
8136   char* ntoken = string;
8137
8138   while ((ntoken = strchr (string, separator)))
8139     {
8140       size_t token_length = ntoken - string;
8141       /* Make this substring look like a string.  */
8142       *ntoken = '\0';
8143       aarch64_parse_one_override_token (string, token_length, tune);
8144       string = ++ntoken;
8145     }
8146
8147   /* One last option to parse.  */
8148   aarch64_parse_one_override_token (string, strlen (string), tune);
8149   free (string_root);
8150 }
8151
8152
8153 static void
8154 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8155 {
8156   if (opts->x_flag_omit_frame_pointer)
8157     opts->x_flag_omit_leaf_frame_pointer = false;
8158   else if (opts->x_flag_omit_leaf_frame_pointer)
8159     opts->x_flag_omit_frame_pointer = true;
8160
8161   /* If not optimizing for size, set the default
8162      alignment to what the target wants.  */
8163   if (!opts->x_optimize_size)
8164     {
8165       if (opts->x_align_loops <= 0)
8166         opts->x_align_loops = aarch64_tune_params.loop_align;
8167       if (opts->x_align_jumps <= 0)
8168         opts->x_align_jumps = aarch64_tune_params.jump_align;
8169       if (opts->x_align_functions <= 0)
8170         opts->x_align_functions = aarch64_tune_params.function_align;
8171     }
8172
8173   /* If nopcrelative_literal_loads is set on the command line, this
8174      implies that the user asked for PC relative literal loads.  */
8175   if (opts->x_nopcrelative_literal_loads == 1)
8176     aarch64_nopcrelative_literal_loads = false;
8177
8178   /* If it is not set on the command line, we default to no
8179      pc relative literal loads.  */
8180   if (opts->x_nopcrelative_literal_loads == 2)
8181     aarch64_nopcrelative_literal_loads = true;
8182
8183   /* In the tiny memory model it makes no sense
8184      to disallow non PC relative literal pool loads
8185      as many other things will break anyway.  */
8186   if (opts->x_nopcrelative_literal_loads
8187       && (aarch64_cmodel == AARCH64_CMODEL_TINY
8188           || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
8189     aarch64_nopcrelative_literal_loads = false;
8190 }
8191
8192 /* 'Unpack' up the internal tuning structs and update the options
8193     in OPTS.  The caller must have set up selected_tune and selected_arch
8194     as all the other target-specific codegen decisions are
8195     derived from them.  */
8196
8197 void
8198 aarch64_override_options_internal (struct gcc_options *opts)
8199 {
8200   aarch64_tune_flags = selected_tune->flags;
8201   aarch64_tune = selected_tune->sched_core;
8202   /* Make a copy of the tuning parameters attached to the core, which
8203      we may later overwrite.  */
8204   aarch64_tune_params = *(selected_tune->tune);
8205   aarch64_architecture_version = selected_arch->architecture_version;
8206
8207   if (opts->x_aarch64_override_tune_string)
8208     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8209                                   &aarch64_tune_params);
8210
8211   /* This target defaults to strict volatile bitfields.  */
8212   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8213     opts->x_flag_strict_volatile_bitfields = 1;
8214
8215   initialize_aarch64_code_model (opts);
8216   initialize_aarch64_tls_size (opts);
8217
8218   int queue_depth = 0;
8219   switch (aarch64_tune_params.autoprefetcher_model)
8220     {
8221       case tune_params::AUTOPREFETCHER_OFF:
8222         queue_depth = -1;
8223         break;
8224       case tune_params::AUTOPREFETCHER_WEAK:
8225         queue_depth = 0;
8226         break;
8227       case tune_params::AUTOPREFETCHER_STRONG:
8228         queue_depth = max_insn_queue_index + 1;
8229         break;
8230       default:
8231         gcc_unreachable ();
8232     }
8233
8234   /* We don't mind passing in global_options_set here as we don't use
8235      the *options_set structs anyway.  */
8236   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8237                          queue_depth,
8238                          opts->x_param_values,
8239                          global_options_set.x_param_values);
8240
8241   /* Set the L1 cache line size.  */
8242   if (selected_cpu->tune->cache_line_size != 0)
8243     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8244                            selected_cpu->tune->cache_line_size,
8245                            opts->x_param_values,
8246                            global_options_set.x_param_values);
8247
8248   aarch64_override_options_after_change_1 (opts);
8249 }
8250
8251 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
8252    specified in STR and throw errors if appropriate.  Put the results if
8253    they are valid in RES and ISA_FLAGS.  Return whether the option is
8254    valid.  */
8255
8256 static bool
8257 aarch64_validate_mcpu (const char *str, const struct processor **res,
8258                        unsigned long *isa_flags)
8259 {
8260   enum aarch64_parse_opt_result parse_res
8261     = aarch64_parse_cpu (str, res, isa_flags);
8262
8263   if (parse_res == AARCH64_PARSE_OK)
8264     return true;
8265
8266   switch (parse_res)
8267     {
8268       case AARCH64_PARSE_MISSING_ARG:
8269         error ("missing cpu name in -mcpu=%qs", str);
8270         break;
8271       case AARCH64_PARSE_INVALID_ARG:
8272         error ("unknown value %qs for -mcpu", str);
8273         break;
8274       case AARCH64_PARSE_INVALID_FEATURE:
8275         error ("invalid feature modifier in -mcpu=%qs", str);
8276         break;
8277       default:
8278         gcc_unreachable ();
8279     }
8280
8281   return false;
8282 }
8283
8284 /* Validate a command-line -march option.  Parse the arch and extensions
8285    (if any) specified in STR and throw errors if appropriate.  Put the
8286    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
8287    option is valid.  */
8288
8289 static bool
8290 aarch64_validate_march (const char *str, const struct processor **res,
8291                        unsigned long *isa_flags)
8292 {
8293   enum aarch64_parse_opt_result parse_res
8294     = aarch64_parse_arch (str, res, isa_flags);
8295
8296   if (parse_res == AARCH64_PARSE_OK)
8297     return true;
8298
8299   switch (parse_res)
8300     {
8301       case AARCH64_PARSE_MISSING_ARG:
8302         error ("missing arch name in -march=%qs", str);
8303         break;
8304       case AARCH64_PARSE_INVALID_ARG:
8305         error ("unknown value %qs for -march", str);
8306         break;
8307       case AARCH64_PARSE_INVALID_FEATURE:
8308         error ("invalid feature modifier in -march=%qs", str);
8309         break;
8310       default:
8311         gcc_unreachable ();
8312     }
8313
8314   return false;
8315 }
8316
8317 /* Validate a command-line -mtune option.  Parse the cpu
8318    specified in STR and throw errors if appropriate.  Put the
8319    result, if it is valid, in RES.  Return whether the option is
8320    valid.  */
8321
8322 static bool
8323 aarch64_validate_mtune (const char *str, const struct processor **res)
8324 {
8325   enum aarch64_parse_opt_result parse_res
8326     = aarch64_parse_tune (str, res);
8327
8328   if (parse_res == AARCH64_PARSE_OK)
8329     return true;
8330
8331   switch (parse_res)
8332     {
8333       case AARCH64_PARSE_MISSING_ARG:
8334         error ("missing cpu name in -mtune=%qs", str);
8335         break;
8336       case AARCH64_PARSE_INVALID_ARG:
8337         error ("unknown value %qs for -mtune", str);
8338         break;
8339       default:
8340         gcc_unreachable ();
8341     }
8342   return false;
8343 }
8344
8345 /* Return the CPU corresponding to the enum CPU.
8346    If it doesn't specify a cpu, return the default.  */
8347
8348 static const struct processor *
8349 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8350 {
8351   if (cpu != aarch64_none)
8352     return &all_cores[cpu];
8353
8354   /* The & 0x3f is to extract the bottom 6 bits that encode the
8355      default cpu as selected by the --with-cpu GCC configure option
8356      in config.gcc.
8357      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8358      flags mechanism should be reworked to make it more sane.  */
8359   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8360 }
8361
8362 /* Return the architecture corresponding to the enum ARCH.
8363    If it doesn't specify a valid architecture, return the default.  */
8364
8365 static const struct processor *
8366 aarch64_get_arch (enum aarch64_arch arch)
8367 {
8368   if (arch != aarch64_no_arch)
8369     return &all_architectures[arch];
8370
8371   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8372
8373   return &all_architectures[cpu->arch];
8374 }
8375
8376 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
8377    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8378    tuning structs.  In particular it must set selected_tune and
8379    aarch64_isa_flags that define the available ISA features and tuning
8380    decisions.  It must also set selected_arch as this will be used to
8381    output the .arch asm tags for each function.  */
8382
8383 static void
8384 aarch64_override_options (void)
8385 {
8386   unsigned long cpu_isa = 0;
8387   unsigned long arch_isa = 0;
8388   aarch64_isa_flags = 0;
8389
8390   bool valid_cpu = true;
8391   bool valid_tune = true;
8392   bool valid_arch = true;
8393
8394   selected_cpu = NULL;
8395   selected_arch = NULL;
8396   selected_tune = NULL;
8397
8398   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8399      If either of -march or -mtune is given, they override their
8400      respective component of -mcpu.  */
8401   if (aarch64_cpu_string)
8402     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8403                                         &cpu_isa);
8404
8405   if (aarch64_arch_string)
8406     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8407                                           &arch_isa);
8408
8409   if (aarch64_tune_string)
8410     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8411
8412   /* If the user did not specify a processor, choose the default
8413      one for them.  This will be the CPU set during configuration using
8414      --with-cpu, otherwise it is "generic".  */
8415   if (!selected_cpu)
8416     {
8417       if (selected_arch)
8418         {
8419           selected_cpu = &all_cores[selected_arch->ident];
8420           aarch64_isa_flags = arch_isa;
8421           explicit_arch = selected_arch->arch;
8422         }
8423       else
8424         {
8425           /* Get default configure-time CPU.  */
8426           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8427           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8428         }
8429
8430       if (selected_tune)
8431         explicit_tune_core = selected_tune->ident;
8432     }
8433   /* If both -mcpu and -march are specified check that they are architecturally
8434      compatible, warn if they're not and prefer the -march ISA flags.  */
8435   else if (selected_arch)
8436     {
8437       if (selected_arch->arch != selected_cpu->arch)
8438         {
8439           warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8440                        all_architectures[selected_cpu->arch].name,
8441                        selected_arch->name);
8442         }
8443       aarch64_isa_flags = arch_isa;
8444       explicit_arch = selected_arch->arch;
8445       explicit_tune_core = selected_tune ? selected_tune->ident
8446                                           : selected_cpu->ident;
8447     }
8448   else
8449     {
8450       /* -mcpu but no -march.  */
8451       aarch64_isa_flags = cpu_isa;
8452       explicit_tune_core = selected_tune ? selected_tune->ident
8453                                           : selected_cpu->ident;
8454       gcc_assert (selected_cpu);
8455       selected_arch = &all_architectures[selected_cpu->arch];
8456       explicit_arch = selected_arch->arch;
8457     }
8458
8459   /* Set the arch as well as we will need it when outputing
8460      the .arch directive in assembly.  */
8461   if (!selected_arch)
8462     {
8463       gcc_assert (selected_cpu);
8464       selected_arch = &all_architectures[selected_cpu->arch];
8465     }
8466
8467   if (!selected_tune)
8468     selected_tune = selected_cpu;
8469
8470 #ifndef HAVE_AS_MABI_OPTION
8471   /* The compiler may have been configured with 2.23.* binutils, which does
8472      not have support for ILP32.  */
8473   if (TARGET_ILP32)
8474     error ("Assembler does not support -mabi=ilp32");
8475 #endif
8476
8477   /* Make sure we properly set up the explicit options.  */
8478   if ((aarch64_cpu_string && valid_cpu)
8479        || (aarch64_tune_string && valid_tune))
8480     gcc_assert (explicit_tune_core != aarch64_none);
8481
8482   if ((aarch64_cpu_string && valid_cpu)
8483        || (aarch64_arch_string && valid_arch))
8484     gcc_assert (explicit_arch != aarch64_no_arch);
8485
8486   aarch64_override_options_internal (&global_options);
8487
8488   /* Save these options as the default ones in case we push and pop them later
8489      while processing functions with potential target attributes.  */
8490   target_option_default_node = target_option_current_node
8491       = build_target_option_node (&global_options);
8492
8493   aarch64_register_fma_steering ();
8494
8495 }
8496
8497 /* Implement targetm.override_options_after_change.  */
8498
8499 static void
8500 aarch64_override_options_after_change (void)
8501 {
8502   aarch64_override_options_after_change_1 (&global_options);
8503 }
8504
8505 static struct machine_function *
8506 aarch64_init_machine_status (void)
8507 {
8508   struct machine_function *machine;
8509   machine = ggc_cleared_alloc<machine_function> ();
8510   return machine;
8511 }
8512
8513 void
8514 aarch64_init_expanders (void)
8515 {
8516   init_machine_status = aarch64_init_machine_status;
8517 }
8518
8519 /* A checking mechanism for the implementation of the various code models.  */
8520 static void
8521 initialize_aarch64_code_model (struct gcc_options *opts)
8522 {
8523    if (opts->x_flag_pic)
8524      {
8525        switch (opts->x_aarch64_cmodel_var)
8526          {
8527          case AARCH64_CMODEL_TINY:
8528            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8529            break;
8530          case AARCH64_CMODEL_SMALL:
8531 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8532            aarch64_cmodel = (flag_pic == 2
8533                              ? AARCH64_CMODEL_SMALL_PIC
8534                              : AARCH64_CMODEL_SMALL_SPIC);
8535 #else
8536            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8537 #endif
8538            break;
8539          case AARCH64_CMODEL_LARGE:
8540            sorry ("code model %qs with -f%s", "large",
8541                   opts->x_flag_pic > 1 ? "PIC" : "pic");
8542            break;
8543          default:
8544            gcc_unreachable ();
8545          }
8546      }
8547    else
8548      aarch64_cmodel = opts->x_aarch64_cmodel_var;
8549 }
8550
8551 /* Implement TARGET_OPTION_SAVE.  */
8552
8553 static void
8554 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8555 {
8556   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8557 }
8558
8559 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
8560    using the information saved in PTR.  */
8561
8562 static void
8563 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8564 {
8565   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8566   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8567   opts->x_explicit_arch = ptr->x_explicit_arch;
8568   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8569   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8570
8571   aarch64_override_options_internal (opts);
8572 }
8573
8574 /* Implement TARGET_OPTION_PRINT.  */
8575
8576 static void
8577 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8578 {
8579   const struct processor *cpu
8580     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8581   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8582   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8583   std::string extension
8584     = aarch64_get_extension_string_for_isa_flags (isa_flags);
8585
8586   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8587   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8588            arch->name, extension.c_str ());
8589 }
8590
8591 static GTY(()) tree aarch64_previous_fndecl;
8592
8593 void
8594 aarch64_reset_previous_fndecl (void)
8595 {
8596   aarch64_previous_fndecl = NULL;
8597 }
8598
8599 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
8600    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8601    of the function, if such exists.  This function may be called multiple
8602    times on a single function so use aarch64_previous_fndecl to avoid
8603    setting up identical state.  */
8604
8605 static void
8606 aarch64_set_current_function (tree fndecl)
8607 {
8608   tree old_tree = (aarch64_previous_fndecl
8609                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8610                    : NULL_TREE);
8611
8612   tree new_tree = (fndecl
8613                    ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
8614                    : NULL_TREE);
8615
8616
8617   if (fndecl && fndecl != aarch64_previous_fndecl)
8618     {
8619       aarch64_previous_fndecl = fndecl;
8620       if (old_tree == new_tree)
8621         ;
8622
8623       else if (new_tree && new_tree != target_option_default_node)
8624         {
8625           cl_target_option_restore (&global_options,
8626                                     TREE_TARGET_OPTION (new_tree));
8627           if (TREE_TARGET_GLOBALS (new_tree))
8628             restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8629           else
8630             TREE_TARGET_GLOBALS (new_tree)
8631               = save_target_globals_default_opts ();
8632         }
8633
8634       else if (old_tree && old_tree != target_option_default_node)
8635         {
8636           new_tree = target_option_current_node;
8637           cl_target_option_restore (&global_options,
8638                                     TREE_TARGET_OPTION (new_tree));
8639           if (TREE_TARGET_GLOBALS (new_tree))
8640             restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8641           else if (new_tree == target_option_default_node)
8642             restore_target_globals (&default_target_globals);
8643           else
8644             TREE_TARGET_GLOBALS (new_tree)
8645               = save_target_globals_default_opts ();
8646         }
8647     }
8648
8649   if (!fndecl)
8650     return;
8651
8652   /* If we turned on SIMD make sure that any vector parameters are re-laid out
8653      so that they use proper vector modes.  */
8654   if (TARGET_SIMD)
8655     {
8656       tree parms = DECL_ARGUMENTS (fndecl);
8657       for (; parms && parms != void_list_node; parms = TREE_CHAIN (parms))
8658         {
8659           if (TREE_CODE (parms) == PARM_DECL
8660               && VECTOR_TYPE_P (TREE_TYPE (parms))
8661               && DECL_MODE (parms) != TYPE_MODE (TREE_TYPE (parms)))
8662             relayout_decl (parms);
8663         }
8664     }
8665 }
8666
8667 /* Enum describing the various ways we can handle attributes.
8668    In many cases we can reuse the generic option handling machinery.  */
8669
8670 enum aarch64_attr_opt_type
8671 {
8672   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
8673   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
8674   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
8675   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
8676 };
8677
8678 /* All the information needed to handle a target attribute.
8679    NAME is the name of the attribute.
8680    ATTR_TYPE specifies the type of behaviour of the attribute as described
8681    in the definition of enum aarch64_attr_opt_type.
8682    ALLOW_NEG is true if the attribute supports a "no-" form.
8683    HANDLER is the function that takes the attribute string and whether
8684    it is a pragma or attribute and handles the option.  It is needed only
8685    when the ATTR_TYPE is aarch64_attr_custom.
8686    OPT_NUM is the enum specifying the option that the attribute modifies.
8687    This is needed for attributes that mirror the behaviour of a command-line
8688    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
8689    aarch64_attr_enum.  */
8690
8691 struct aarch64_attribute_info
8692 {
8693   const char *name;
8694   enum aarch64_attr_opt_type attr_type;
8695   bool allow_neg;
8696   bool (*handler) (const char *, const char *);
8697   enum opt_code opt_num;
8698 };
8699
8700 /* Handle the ARCH_STR argument to the arch= target attribute.
8701    PRAGMA_OR_ATTR is used in potential error messages.  */
8702
8703 static bool
8704 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8705 {
8706   const struct processor *tmp_arch = NULL;
8707   enum aarch64_parse_opt_result parse_res
8708     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8709
8710   if (parse_res == AARCH64_PARSE_OK)
8711     {
8712       gcc_assert (tmp_arch);
8713       selected_arch = tmp_arch;
8714       explicit_arch = selected_arch->arch;
8715       return true;
8716     }
8717
8718   switch (parse_res)
8719     {
8720       case AARCH64_PARSE_MISSING_ARG:
8721         error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8722         break;
8723       case AARCH64_PARSE_INVALID_ARG:
8724         error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8725         break;
8726       case AARCH64_PARSE_INVALID_FEATURE:
8727         error ("invalid feature modifier %qs for 'arch' target %s",
8728                str, pragma_or_attr);
8729         break;
8730       default:
8731         gcc_unreachable ();
8732     }
8733
8734   return false;
8735 }
8736
8737 /* Handle the argument CPU_STR to the cpu= target attribute.
8738    PRAGMA_OR_ATTR is used in potential error messages.  */
8739
8740 static bool
8741 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8742 {
8743   const struct processor *tmp_cpu = NULL;
8744   enum aarch64_parse_opt_result parse_res
8745     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8746
8747   if (parse_res == AARCH64_PARSE_OK)
8748     {
8749       gcc_assert (tmp_cpu);
8750       selected_tune = tmp_cpu;
8751       explicit_tune_core = selected_tune->ident;
8752
8753       selected_arch = &all_architectures[tmp_cpu->arch];
8754       explicit_arch = selected_arch->arch;
8755       return true;
8756     }
8757
8758   switch (parse_res)
8759     {
8760       case AARCH64_PARSE_MISSING_ARG:
8761         error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8762         break;
8763       case AARCH64_PARSE_INVALID_ARG:
8764         error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8765         break;
8766       case AARCH64_PARSE_INVALID_FEATURE:
8767         error ("invalid feature modifier %qs for 'cpu' target %s",
8768                str, pragma_or_attr);
8769         break;
8770       default:
8771         gcc_unreachable ();
8772     }
8773
8774   return false;
8775 }
8776
8777 /* Handle the argument STR to the tune= target attribute.
8778    PRAGMA_OR_ATTR is used in potential error messages.  */
8779
8780 static bool
8781 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8782 {
8783   const struct processor *tmp_tune = NULL;
8784   enum aarch64_parse_opt_result parse_res
8785     = aarch64_parse_tune (str, &tmp_tune);
8786
8787   if (parse_res == AARCH64_PARSE_OK)
8788     {
8789       gcc_assert (tmp_tune);
8790       selected_tune = tmp_tune;
8791       explicit_tune_core = selected_tune->ident;
8792       return true;
8793     }
8794
8795   switch (parse_res)
8796     {
8797       case AARCH64_PARSE_INVALID_ARG:
8798         error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8799         break;
8800       default:
8801         gcc_unreachable ();
8802     }
8803
8804   return false;
8805 }
8806
8807 /* Parse an architecture extensions target attribute string specified in STR.
8808    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
8809    if successful.  Update aarch64_isa_flags to reflect the ISA features
8810    modified.
8811    PRAGMA_OR_ATTR is used in potential error messages.  */
8812
8813 static bool
8814 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8815 {
8816   enum aarch64_parse_opt_result parse_res;
8817   unsigned long isa_flags = aarch64_isa_flags;
8818
8819   /* We allow "+nothing" in the beginning to clear out all architectural
8820      features if the user wants to handpick specific features.  */
8821   if (strncmp ("+nothing", str, 8) == 0)
8822     {
8823       isa_flags = 0;
8824       str += 8;
8825     }
8826
8827   parse_res = aarch64_parse_extension (str, &isa_flags);
8828
8829   if (parse_res == AARCH64_PARSE_OK)
8830     {
8831       aarch64_isa_flags = isa_flags;
8832       return true;
8833     }
8834
8835   switch (parse_res)
8836     {
8837       case AARCH64_PARSE_MISSING_ARG:
8838         error ("missing feature modifier in target %s %qs",
8839                pragma_or_attr, str);
8840         break;
8841
8842       case AARCH64_PARSE_INVALID_FEATURE:
8843         error ("invalid feature modifier in target %s %qs",
8844                pragma_or_attr, str);
8845         break;
8846
8847       default:
8848         gcc_unreachable ();
8849     }
8850
8851  return false;
8852 }
8853
8854 /* The target attributes that we support.  On top of these we also support just
8855    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
8856    handled explicitly in aarch64_process_one_target_attr.  */
8857
8858 static const struct aarch64_attribute_info aarch64_attributes[] =
8859 {
8860   { "general-regs-only", aarch64_attr_mask, false, NULL,
8861      OPT_mgeneral_regs_only },
8862   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8863      OPT_mfix_cortex_a53_835769 },
8864   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8865   { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8866   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8867      OPT_momit_leaf_frame_pointer },
8868   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8869   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8870      OPT_march_ },
8871   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8872   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8873      OPT_mtune_ },
8874   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
8875 };
8876
8877 /* Parse ARG_STR which contains the definition of one target attribute.
8878    Show appropriate errors if any or return true if the attribute is valid.
8879    PRAGMA_OR_ATTR holds the string to use in error messages about whether
8880    we're processing a target attribute or pragma.  */
8881
8882 static bool
8883 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8884 {
8885   bool invert = false;
8886
8887   size_t len = strlen (arg_str);
8888
8889   if (len == 0)
8890     {
8891       error ("malformed target %s", pragma_or_attr);
8892       return false;
8893     }
8894
8895   char *str_to_check = (char *) alloca (len + 1);
8896   strcpy (str_to_check, arg_str);
8897
8898   /* Skip leading whitespace.  */
8899   while (*str_to_check == ' ' || *str_to_check == '\t')
8900     str_to_check++;
8901
8902   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8903      It is easier to detect and handle it explicitly here rather than going
8904      through the machinery for the rest of the target attributes in this
8905      function.  */
8906   if (*str_to_check == '+')
8907     return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8908
8909   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
8910     {
8911       invert = true;
8912       str_to_check += 3;
8913     }
8914   char *arg = strchr (str_to_check, '=');
8915
8916   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
8917      and point ARG to "foo".  */
8918   if (arg)
8919     {
8920       *arg = '\0';
8921       arg++;
8922     }
8923   const struct aarch64_attribute_info *p_attr;
8924   bool found = false;
8925   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
8926     {
8927       /* If the names don't match up, or the user has given an argument
8928          to an attribute that doesn't accept one, or didn't give an argument
8929          to an attribute that expects one, fail to match.  */
8930       if (strcmp (str_to_check, p_attr->name) != 0)
8931         continue;
8932
8933       found = true;
8934       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
8935                               || p_attr->attr_type == aarch64_attr_enum;
8936
8937       if (attr_need_arg_p ^ (arg != NULL))
8938         {
8939           error ("target %s %qs does not accept an argument",
8940                   pragma_or_attr, str_to_check);
8941           return false;
8942         }
8943
8944       /* If the name matches but the attribute does not allow "no-" versions
8945          then we can't match.  */
8946       if (invert && !p_attr->allow_neg)
8947         {
8948           error ("target %s %qs does not allow a negated form",
8949                   pragma_or_attr, str_to_check);
8950           return false;
8951         }
8952
8953       switch (p_attr->attr_type)
8954         {
8955         /* Has a custom handler registered.
8956            For example, cpu=, arch=, tune=.  */
8957           case aarch64_attr_custom:
8958             gcc_assert (p_attr->handler);
8959             if (!p_attr->handler (arg, pragma_or_attr))
8960               return false;
8961             break;
8962
8963           /* Either set or unset a boolean option.  */
8964           case aarch64_attr_bool:
8965             {
8966               struct cl_decoded_option decoded;
8967
8968               generate_option (p_attr->opt_num, NULL, !invert,
8969                                CL_TARGET, &decoded);
8970               aarch64_handle_option (&global_options, &global_options_set,
8971                                       &decoded, input_location);
8972               break;
8973             }
8974           /* Set or unset a bit in the target_flags.  aarch64_handle_option
8975              should know what mask to apply given the option number.  */
8976           case aarch64_attr_mask:
8977             {
8978               struct cl_decoded_option decoded;
8979               /* We only need to specify the option number.
8980                  aarch64_handle_option will know which mask to apply.  */
8981               decoded.opt_index = p_attr->opt_num;
8982               decoded.value = !invert;
8983               aarch64_handle_option (&global_options, &global_options_set,
8984                                       &decoded, input_location);
8985               break;
8986             }
8987           /* Use the option setting machinery to set an option to an enum.  */
8988           case aarch64_attr_enum:
8989             {
8990               gcc_assert (arg);
8991               bool valid;
8992               int value;
8993               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
8994                                               &value, CL_TARGET);
8995               if (valid)
8996                 {
8997                   set_option (&global_options, NULL, p_attr->opt_num, value,
8998                               NULL, DK_UNSPECIFIED, input_location,
8999                               global_dc);
9000                 }
9001               else
9002                 {
9003                   error ("target %s %s=%s is not valid",
9004                          pragma_or_attr, str_to_check, arg);
9005                 }
9006               break;
9007             }
9008           default:
9009             gcc_unreachable ();
9010         }
9011     }
9012
9013   /* If we reached here we either have found an attribute and validated
9014      it or didn't match any.  If we matched an attribute but its arguments
9015      were malformed we will have returned false already.  */
9016   return found;
9017 }
9018
9019 /* Count how many times the character C appears in
9020    NULL-terminated string STR.  */
9021
9022 static unsigned int
9023 num_occurences_in_str (char c, char *str)
9024 {
9025   unsigned int res = 0;
9026   while (*str != '\0')
9027     {
9028       if (*str == c)
9029         res++;
9030
9031       str++;
9032     }
9033
9034   return res;
9035 }
9036
9037 /* Parse the tree in ARGS that contains the target attribute information
9038    and update the global target options space.  PRAGMA_OR_ATTR is a string
9039    to be used in error messages, specifying whether this is processing
9040    a target attribute or a target pragma.  */
9041
9042 bool
9043 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9044 {
9045   if (TREE_CODE (args) == TREE_LIST)
9046     {
9047       do
9048         {
9049           tree head = TREE_VALUE (args);
9050           if (head)
9051             {
9052               if (!aarch64_process_target_attr (head, pragma_or_attr))
9053                 return false;
9054             }
9055           args = TREE_CHAIN (args);
9056         } while (args);
9057
9058       return true;
9059     }
9060   /* We expect to find a string to parse.  */
9061   gcc_assert (TREE_CODE (args) == STRING_CST);
9062
9063   size_t len = strlen (TREE_STRING_POINTER (args));
9064   char *str_to_check = (char *) alloca (len + 1);
9065   strcpy (str_to_check, TREE_STRING_POINTER (args));
9066
9067   if (len == 0)
9068     {
9069       error ("malformed target %s value", pragma_or_attr);
9070       return false;
9071     }
9072
9073   /* Used to catch empty spaces between commas i.e.
9074      attribute ((target ("attr1,,attr2"))).  */
9075   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9076
9077   /* Handle multiple target attributes separated by ','.  */
9078   char *token = strtok (str_to_check, ",");
9079
9080   unsigned int num_attrs = 0;
9081   while (token)
9082     {
9083       num_attrs++;
9084       if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9085         {
9086           error ("target %s %qs is invalid", pragma_or_attr, token);
9087           return false;
9088         }
9089
9090       token = strtok (NULL, ",");
9091     }
9092
9093   if (num_attrs != num_commas + 1)
9094     {
9095       error ("malformed target %s list %qs",
9096               pragma_or_attr, TREE_STRING_POINTER (args));
9097       return false;
9098     }
9099
9100   return true;
9101 }
9102
9103 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
9104    process attribute ((target ("..."))).  */
9105
9106 static bool
9107 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9108 {
9109   struct cl_target_option cur_target;
9110   bool ret;
9111   tree old_optimize;
9112   tree new_target, new_optimize;
9113   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9114
9115   /* If what we're processing is the current pragma string then the
9116      target option node is already stored in target_option_current_node
9117      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
9118      having to re-parse the string.  This is especially useful to keep
9119      arm_neon.h compile times down since that header contains a lot
9120      of intrinsics enclosed in pragmas.  */
9121   if (!existing_target && args == current_target_pragma)
9122     {
9123       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9124       return true;
9125     }
9126   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9127
9128   old_optimize = build_optimization_node (&global_options);
9129   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9130
9131   /* If the function changed the optimization levels as well as setting
9132      target options, start with the optimizations specified.  */
9133   if (func_optimize && func_optimize != old_optimize)
9134     cl_optimization_restore (&global_options,
9135                              TREE_OPTIMIZATION (func_optimize));
9136
9137   /* Save the current target options to restore at the end.  */
9138   cl_target_option_save (&cur_target, &global_options);
9139
9140   /* If fndecl already has some target attributes applied to it, unpack
9141      them so that we add this attribute on top of them, rather than
9142      overwriting them.  */
9143   if (existing_target)
9144     {
9145       struct cl_target_option *existing_options
9146         = TREE_TARGET_OPTION (existing_target);
9147
9148       if (existing_options)
9149         cl_target_option_restore (&global_options, existing_options);
9150     }
9151   else
9152     cl_target_option_restore (&global_options,
9153                         TREE_TARGET_OPTION (target_option_current_node));
9154
9155
9156   ret = aarch64_process_target_attr (args, "attribute");
9157
9158   /* Set up any additional state.  */
9159   if (ret)
9160     {
9161       aarch64_override_options_internal (&global_options);
9162       /* Initialize SIMD builtins if we haven't already.
9163          Set current_target_pragma to NULL for the duration so that
9164          the builtin initialization code doesn't try to tag the functions
9165          being built with the attributes specified by any current pragma, thus
9166          going into an infinite recursion.  */
9167       if (TARGET_SIMD)
9168         {
9169           tree saved_current_target_pragma = current_target_pragma;
9170           current_target_pragma = NULL;
9171           aarch64_init_simd_builtins ();
9172           current_target_pragma = saved_current_target_pragma;
9173         }
9174       new_target = build_target_option_node (&global_options);
9175     }
9176   else
9177     new_target = NULL;
9178
9179   new_optimize = build_optimization_node (&global_options);
9180
9181   if (fndecl && ret)
9182     {
9183       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9184
9185       if (old_optimize != new_optimize)
9186         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9187     }
9188
9189   cl_target_option_restore (&global_options, &cur_target);
9190
9191   if (old_optimize != new_optimize)
9192     cl_optimization_restore (&global_options,
9193                              TREE_OPTIMIZATION (old_optimize));
9194   return ret;
9195 }
9196
9197 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
9198    tri-bool options (yes, no, don't care) and the default value is
9199    DEF, determine whether to reject inlining.  */
9200
9201 static bool
9202 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9203                                      int dont_care, int def)
9204 {
9205   /* If the callee doesn't care, always allow inlining.  */
9206   if (callee == dont_care)
9207     return true;
9208
9209   /* If the caller doesn't care, always allow inlining.  */
9210   if (caller == dont_care)
9211     return true;
9212
9213   /* Otherwise, allow inlining if either the callee and caller values
9214      agree, or if the callee is using the default value.  */
9215   return (callee == caller || callee == def);
9216 }
9217
9218 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
9219    to inline CALLEE into CALLER based on target-specific info.
9220    Make sure that the caller and callee have compatible architectural
9221    features.  Then go through the other possible target attributes
9222    and see if they can block inlining.  Try not to reject always_inline
9223    callees unless they are incompatible architecturally.  */
9224
9225 static bool
9226 aarch64_can_inline_p (tree caller, tree callee)
9227 {
9228   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9229   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9230
9231   /* If callee has no option attributes, then it is ok to inline.  */
9232   if (!callee_tree)
9233     return true;
9234
9235   struct cl_target_option *caller_opts
9236         = TREE_TARGET_OPTION (caller_tree ? caller_tree
9237                                            : target_option_default_node);
9238
9239   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9240
9241
9242   /* Callee's ISA flags should be a subset of the caller's.  */
9243   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9244        != callee_opts->x_aarch64_isa_flags)
9245     return false;
9246
9247   /* Allow non-strict aligned functions inlining into strict
9248      aligned ones.  */
9249   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9250        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9251       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9252            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9253     return false;
9254
9255   bool always_inline = lookup_attribute ("always_inline",
9256                                           DECL_ATTRIBUTES (callee));
9257
9258   /* If the architectural features match up and the callee is always_inline
9259      then the other attributes don't matter.  */
9260   if (always_inline)
9261     return true;
9262
9263   if (caller_opts->x_aarch64_cmodel_var
9264       != callee_opts->x_aarch64_cmodel_var)
9265     return false;
9266
9267   if (caller_opts->x_aarch64_tls_dialect
9268       != callee_opts->x_aarch64_tls_dialect)
9269     return false;
9270
9271   /* Honour explicit requests to workaround errata.  */
9272   if (!aarch64_tribools_ok_for_inlining_p (
9273           caller_opts->x_aarch64_fix_a53_err835769,
9274           callee_opts->x_aarch64_fix_a53_err835769,
9275           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9276     return false;
9277
9278   /* If the user explicitly specified -momit-leaf-frame-pointer for the
9279      caller and calle and they don't match up, reject inlining.  */
9280   if (!aarch64_tribools_ok_for_inlining_p (
9281           caller_opts->x_flag_omit_leaf_frame_pointer,
9282           callee_opts->x_flag_omit_leaf_frame_pointer,
9283           2, 1))
9284     return false;
9285
9286   /* If the callee has specific tuning overrides, respect them.  */
9287   if (callee_opts->x_aarch64_override_tune_string != NULL
9288       && caller_opts->x_aarch64_override_tune_string == NULL)
9289     return false;
9290
9291   /* If the user specified tuning override strings for the
9292      caller and callee and they don't match up, reject inlining.
9293      We just do a string compare here, we don't analyze the meaning
9294      of the string, as it would be too costly for little gain.  */
9295   if (callee_opts->x_aarch64_override_tune_string
9296       && caller_opts->x_aarch64_override_tune_string
9297       && (strcmp (callee_opts->x_aarch64_override_tune_string,
9298                   caller_opts->x_aarch64_override_tune_string) != 0))
9299     return false;
9300
9301   return true;
9302 }
9303
9304 /* Return true if SYMBOL_REF X binds locally.  */
9305
9306 static bool
9307 aarch64_symbol_binds_local_p (const_rtx x)
9308 {
9309   return (SYMBOL_REF_DECL (x)
9310           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9311           : SYMBOL_REF_LOCAL_P (x));
9312 }
9313
9314 /* Return true if SYMBOL_REF X is thread local */
9315 static bool
9316 aarch64_tls_symbol_p (rtx x)
9317 {
9318   if (! TARGET_HAVE_TLS)
9319     return false;
9320
9321   if (GET_CODE (x) != SYMBOL_REF)
9322     return false;
9323
9324   return SYMBOL_REF_TLS_MODEL (x) != 0;
9325 }
9326
9327 /* Classify a TLS symbol into one of the TLS kinds.  */
9328 enum aarch64_symbol_type
9329 aarch64_classify_tls_symbol (rtx x)
9330 {
9331   enum tls_model tls_kind = tls_symbolic_operand_type (x);
9332
9333   switch (tls_kind)
9334     {
9335     case TLS_MODEL_GLOBAL_DYNAMIC:
9336     case TLS_MODEL_LOCAL_DYNAMIC:
9337       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9338
9339     case TLS_MODEL_INITIAL_EXEC:
9340       switch (aarch64_cmodel)
9341         {
9342         case AARCH64_CMODEL_TINY:
9343         case AARCH64_CMODEL_TINY_PIC:
9344           return SYMBOL_TINY_TLSIE;
9345         default:
9346           return SYMBOL_SMALL_TLSIE;
9347         }
9348
9349     case TLS_MODEL_LOCAL_EXEC:
9350       if (aarch64_tls_size == 12)
9351         return SYMBOL_TLSLE12;
9352       else if (aarch64_tls_size == 24)
9353         return SYMBOL_TLSLE24;
9354       else if (aarch64_tls_size == 32)
9355         return SYMBOL_TLSLE32;
9356       else if (aarch64_tls_size == 48)
9357         return SYMBOL_TLSLE48;
9358       else
9359         gcc_unreachable ();
9360
9361     case TLS_MODEL_EMULATED:
9362     case TLS_MODEL_NONE:
9363       return SYMBOL_FORCE_TO_MEM;
9364
9365     default:
9366       gcc_unreachable ();
9367     }
9368 }
9369
9370 /* Return the method that should be used to access SYMBOL_REF or
9371    LABEL_REF X.  */
9372
9373 enum aarch64_symbol_type
9374 aarch64_classify_symbol (rtx x, rtx offset)
9375 {
9376   if (GET_CODE (x) == LABEL_REF)
9377     {
9378       switch (aarch64_cmodel)
9379         {
9380         case AARCH64_CMODEL_LARGE:
9381           return SYMBOL_FORCE_TO_MEM;
9382
9383         case AARCH64_CMODEL_TINY_PIC:
9384         case AARCH64_CMODEL_TINY:
9385           return SYMBOL_TINY_ABSOLUTE;
9386
9387         case AARCH64_CMODEL_SMALL_SPIC:
9388         case AARCH64_CMODEL_SMALL_PIC:
9389         case AARCH64_CMODEL_SMALL:
9390           return SYMBOL_SMALL_ABSOLUTE;
9391
9392         default:
9393           gcc_unreachable ();
9394         }
9395     }
9396
9397   if (GET_CODE (x) == SYMBOL_REF)
9398     {
9399       if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
9400         {
9401           /* This is alright even in PIC code as the constant
9402              pool reference is always PC relative and within
9403              the same translation unit.  */
9404           if (nopcrelative_literal_loads
9405               && CONSTANT_POOL_ADDRESS_P (x))
9406             return SYMBOL_SMALL_ABSOLUTE;
9407           else
9408             return SYMBOL_FORCE_TO_MEM;
9409         }
9410
9411       if (aarch64_tls_symbol_p (x))
9412         return aarch64_classify_tls_symbol (x);
9413
9414       switch (aarch64_cmodel)
9415         {
9416         case AARCH64_CMODEL_TINY:
9417           /* When we retreive symbol + offset address, we have to make sure
9418              the offset does not cause overflow of the final address.  But
9419              we have no way of knowing the address of symbol at compile time
9420              so we can't accurately say if the distance between the PC and
9421              symbol + offset is outside the addressible range of +/-1M in the
9422              TINY code model.  So we rely on images not being greater than
9423              1M and cap the offset at 1M and anything beyond 1M will have to
9424              be loaded using an alternative mechanism.  */
9425           if (SYMBOL_REF_WEAK (x)
9426               || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9427             return SYMBOL_FORCE_TO_MEM;
9428           return SYMBOL_TINY_ABSOLUTE;
9429
9430         case AARCH64_CMODEL_SMALL:
9431           /* Same reasoning as the tiny code model, but the offset cap here is
9432              4G.  */
9433           if (SYMBOL_REF_WEAK (x)
9434               || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9435                             HOST_WIDE_INT_C (4294967264)))
9436             return SYMBOL_FORCE_TO_MEM;
9437           return SYMBOL_SMALL_ABSOLUTE;
9438
9439         case AARCH64_CMODEL_TINY_PIC:
9440           if (!aarch64_symbol_binds_local_p (x))
9441             return SYMBOL_TINY_GOT;
9442           return SYMBOL_TINY_ABSOLUTE;
9443
9444         case AARCH64_CMODEL_SMALL_SPIC:
9445         case AARCH64_CMODEL_SMALL_PIC:
9446           if (!aarch64_symbol_binds_local_p (x))
9447             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9448                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9449           return SYMBOL_SMALL_ABSOLUTE;
9450
9451         default:
9452           gcc_unreachable ();
9453         }
9454     }
9455
9456   /* By default push everything into the constant pool.  */
9457   return SYMBOL_FORCE_TO_MEM;
9458 }
9459
9460 bool
9461 aarch64_constant_address_p (rtx x)
9462 {
9463   return (CONSTANT_P (x) && memory_address_p (DImode, x));
9464 }
9465
9466 bool
9467 aarch64_legitimate_pic_operand_p (rtx x)
9468 {
9469   if (GET_CODE (x) == SYMBOL_REF
9470       || (GET_CODE (x) == CONST
9471           && GET_CODE (XEXP (x, 0)) == PLUS
9472           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9473      return false;
9474
9475   return true;
9476 }
9477
9478 /* Return true if X holds either a quarter-precision or
9479      floating-point +0.0 constant.  */
9480 static bool
9481 aarch64_valid_floating_const (machine_mode mode, rtx x)
9482 {
9483   if (!CONST_DOUBLE_P (x))
9484     return false;
9485
9486   if (aarch64_float_const_zero_rtx_p (x))
9487     return true;
9488
9489   /* We only handle moving 0.0 to a TFmode register.  */
9490   if (!(mode == SFmode || mode == DFmode))
9491     return false;
9492
9493   return aarch64_float_const_representable_p (x);
9494 }
9495
9496 static bool
9497 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9498 {
9499   /* Do not allow vector struct mode constants.  We could support
9500      0 and -1 easily, but they need support in aarch64-simd.md.  */
9501   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9502     return false;
9503
9504   /* This could probably go away because
9505      we now decompose CONST_INTs according to expand_mov_immediate.  */
9506   if ((GET_CODE (x) == CONST_VECTOR
9507        && aarch64_simd_valid_immediate (x, mode, false, NULL))
9508       || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9509         return !targetm.cannot_force_const_mem (mode, x);
9510
9511   if (GET_CODE (x) == HIGH
9512       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9513     return true;
9514
9515   return aarch64_constant_address_p (x);
9516 }
9517
9518 rtx
9519 aarch64_load_tp (rtx target)
9520 {
9521   if (!target
9522       || GET_MODE (target) != Pmode
9523       || !register_operand (target, Pmode))
9524     target = gen_reg_rtx (Pmode);
9525
9526   /* Can return in any reg.  */
9527   emit_insn (gen_aarch64_load_tp_hard (target));
9528   return target;
9529 }
9530
9531 /* On AAPCS systems, this is the "struct __va_list".  */
9532 static GTY(()) tree va_list_type;
9533
9534 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9535    Return the type to use as __builtin_va_list.
9536
9537    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9538
9539    struct __va_list
9540    {
9541      void *__stack;
9542      void *__gr_top;
9543      void *__vr_top;
9544      int   __gr_offs;
9545      int   __vr_offs;
9546    };  */
9547
9548 static tree
9549 aarch64_build_builtin_va_list (void)
9550 {
9551   tree va_list_name;
9552   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9553
9554   /* Create the type.  */
9555   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9556   /* Give it the required name.  */
9557   va_list_name = build_decl (BUILTINS_LOCATION,
9558                              TYPE_DECL,
9559                              get_identifier ("__va_list"),
9560                              va_list_type);
9561   DECL_ARTIFICIAL (va_list_name) = 1;
9562   TYPE_NAME (va_list_type) = va_list_name;
9563   TYPE_STUB_DECL (va_list_type) = va_list_name;
9564
9565   /* Create the fields.  */
9566   f_stack = build_decl (BUILTINS_LOCATION,
9567                         FIELD_DECL, get_identifier ("__stack"),
9568                         ptr_type_node);
9569   f_grtop = build_decl (BUILTINS_LOCATION,
9570                         FIELD_DECL, get_identifier ("__gr_top"),
9571                         ptr_type_node);
9572   f_vrtop = build_decl (BUILTINS_LOCATION,
9573                         FIELD_DECL, get_identifier ("__vr_top"),
9574                         ptr_type_node);
9575   f_groff = build_decl (BUILTINS_LOCATION,
9576                         FIELD_DECL, get_identifier ("__gr_offs"),
9577                         integer_type_node);
9578   f_vroff = build_decl (BUILTINS_LOCATION,
9579                         FIELD_DECL, get_identifier ("__vr_offs"),
9580                         integer_type_node);
9581
9582   DECL_ARTIFICIAL (f_stack) = 1;
9583   DECL_ARTIFICIAL (f_grtop) = 1;
9584   DECL_ARTIFICIAL (f_vrtop) = 1;
9585   DECL_ARTIFICIAL (f_groff) = 1;
9586   DECL_ARTIFICIAL (f_vroff) = 1;
9587
9588   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9589   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9590   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9591   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9592   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9593
9594   TYPE_FIELDS (va_list_type) = f_stack;
9595   DECL_CHAIN (f_stack) = f_grtop;
9596   DECL_CHAIN (f_grtop) = f_vrtop;
9597   DECL_CHAIN (f_vrtop) = f_groff;
9598   DECL_CHAIN (f_groff) = f_vroff;
9599
9600   /* Compute its layout.  */
9601   layout_type (va_list_type);
9602
9603   return va_list_type;
9604 }
9605
9606 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
9607 static void
9608 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9609 {
9610   const CUMULATIVE_ARGS *cum;
9611   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9612   tree stack, grtop, vrtop, groff, vroff;
9613   tree t;
9614   int gr_save_area_size;
9615   int vr_save_area_size;
9616   int vr_offset;
9617
9618   cum = &crtl->args.info;
9619   gr_save_area_size
9620     = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
9621   vr_save_area_size
9622     = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
9623
9624   if (!TARGET_FLOAT)
9625     {
9626       gcc_assert (cum->aapcs_nvrn == 0);
9627       vr_save_area_size = 0;
9628     }
9629
9630   f_stack = TYPE_FIELDS (va_list_type_node);
9631   f_grtop = DECL_CHAIN (f_stack);
9632   f_vrtop = DECL_CHAIN (f_grtop);
9633   f_groff = DECL_CHAIN (f_vrtop);
9634   f_vroff = DECL_CHAIN (f_groff);
9635
9636   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9637                   NULL_TREE);
9638   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9639                   NULL_TREE);
9640   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9641                   NULL_TREE);
9642   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9643                   NULL_TREE);
9644   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9645                   NULL_TREE);
9646
9647   /* Emit code to initialize STACK, which points to the next varargs stack
9648      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
9649      by named arguments.  STACK is 8-byte aligned.  */
9650   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9651   if (cum->aapcs_stack_size > 0)
9652     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9653   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9654   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9655
9656   /* Emit code to initialize GRTOP, the top of the GR save area.
9657      virtual_incoming_args_rtx should have been 16 byte aligned.  */
9658   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9659   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9660   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9661
9662   /* Emit code to initialize VRTOP, the top of the VR save area.
9663      This address is gr_save_area_bytes below GRTOP, rounded
9664      down to the next 16-byte boundary.  */
9665   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9666   vr_offset = ROUND_UP (gr_save_area_size,
9667                         STACK_BOUNDARY / BITS_PER_UNIT);
9668
9669   if (vr_offset)
9670     t = fold_build_pointer_plus_hwi (t, -vr_offset);
9671   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9672   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9673
9674   /* Emit code to initialize GROFF, the offset from GRTOP of the
9675      next GPR argument.  */
9676   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9677               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9678   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9679
9680   /* Likewise emit code to initialize VROFF, the offset from FTOP
9681      of the next VR argument.  */
9682   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9683               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9684   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9685 }
9686
9687 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
9688
9689 static tree
9690 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9691                               gimple_seq *post_p ATTRIBUTE_UNUSED)
9692 {
9693   tree addr;
9694   bool indirect_p;
9695   bool is_ha;           /* is HFA or HVA.  */
9696   bool dw_align;        /* double-word align.  */
9697   machine_mode ag_mode = VOIDmode;
9698   int nregs;
9699   machine_mode mode;
9700
9701   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9702   tree stack, f_top, f_off, off, arg, roundup, on_stack;
9703   HOST_WIDE_INT size, rsize, adjust, align;
9704   tree t, u, cond1, cond2;
9705
9706   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9707   if (indirect_p)
9708     type = build_pointer_type (type);
9709
9710   mode = TYPE_MODE (type);
9711
9712   f_stack = TYPE_FIELDS (va_list_type_node);
9713   f_grtop = DECL_CHAIN (f_stack);
9714   f_vrtop = DECL_CHAIN (f_grtop);
9715   f_groff = DECL_CHAIN (f_vrtop);
9716   f_vroff = DECL_CHAIN (f_groff);
9717
9718   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9719                   f_stack, NULL_TREE);
9720   size = int_size_in_bytes (type);
9721   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9722
9723   dw_align = false;
9724   adjust = 0;
9725   if (aarch64_vfp_is_call_or_return_candidate (mode,
9726                                                type,
9727                                                &ag_mode,
9728                                                &nregs,
9729                                                &is_ha))
9730     {
9731       /* TYPE passed in fp/simd registers.  */
9732       if (!TARGET_FLOAT)
9733         aarch64_err_no_fpadvsimd (mode, "varargs");
9734
9735       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9736                       unshare_expr (valist), f_vrtop, NULL_TREE);
9737       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9738                       unshare_expr (valist), f_vroff, NULL_TREE);
9739
9740       rsize = nregs * UNITS_PER_VREG;
9741
9742       if (is_ha)
9743         {
9744           if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9745             adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9746         }
9747       else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9748                && size < UNITS_PER_VREG)
9749         {
9750           adjust = UNITS_PER_VREG - size;
9751         }
9752     }
9753   else
9754     {
9755       /* TYPE passed in general registers.  */
9756       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9757                       unshare_expr (valist), f_grtop, NULL_TREE);
9758       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9759                       unshare_expr (valist), f_groff, NULL_TREE);
9760       rsize = ROUND_UP (size, UNITS_PER_WORD);
9761       nregs = rsize / UNITS_PER_WORD;
9762
9763       if (align > 8)
9764         dw_align = true;
9765
9766       if (BLOCK_REG_PADDING (mode, type, 1) == downward
9767           && size < UNITS_PER_WORD)
9768         {
9769           adjust = UNITS_PER_WORD  - size;
9770         }
9771     }
9772
9773   /* Get a local temporary for the field value.  */
9774   off = get_initialized_tmp_var (f_off, pre_p, NULL);
9775
9776   /* Emit code to branch if off >= 0.  */
9777   t = build2 (GE_EXPR, boolean_type_node, off,
9778               build_int_cst (TREE_TYPE (off), 0));
9779   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9780
9781   if (dw_align)
9782     {
9783       /* Emit: offs = (offs + 15) & -16.  */
9784       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9785                   build_int_cst (TREE_TYPE (off), 15));
9786       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9787                   build_int_cst (TREE_TYPE (off), -16));
9788       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9789     }
9790   else
9791     roundup = NULL;
9792
9793   /* Update ap.__[g|v]r_offs  */
9794   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9795               build_int_cst (TREE_TYPE (off), rsize));
9796   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9797
9798   /* String up.  */
9799   if (roundup)
9800     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9801
9802   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
9803   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9804               build_int_cst (TREE_TYPE (f_off), 0));
9805   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9806
9807   /* String up: make sure the assignment happens before the use.  */
9808   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9809   COND_EXPR_ELSE (cond1) = t;
9810
9811   /* Prepare the trees handling the argument that is passed on the stack;
9812      the top level node will store in ON_STACK.  */
9813   arg = get_initialized_tmp_var (stack, pre_p, NULL);
9814   if (align > 8)
9815     {
9816       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
9817       t = fold_convert (intDI_type_node, arg);
9818       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9819                   build_int_cst (TREE_TYPE (t), 15));
9820       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9821                   build_int_cst (TREE_TYPE (t), -16));
9822       t = fold_convert (TREE_TYPE (arg), t);
9823       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9824     }
9825   else
9826     roundup = NULL;
9827   /* Advance ap.__stack  */
9828   t = fold_convert (intDI_type_node, arg);
9829   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9830               build_int_cst (TREE_TYPE (t), size + 7));
9831   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9832               build_int_cst (TREE_TYPE (t), -8));
9833   t = fold_convert (TREE_TYPE (arg), t);
9834   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9835   /* String up roundup and advance.  */
9836   if (roundup)
9837     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9838   /* String up with arg */
9839   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9840   /* Big-endianness related address adjustment.  */
9841   if (BLOCK_REG_PADDING (mode, type, 1) == downward
9842       && size < UNITS_PER_WORD)
9843   {
9844     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9845                 size_int (UNITS_PER_WORD - size));
9846     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9847   }
9848
9849   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9850   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9851
9852   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
9853   t = off;
9854   if (adjust)
9855     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9856                 build_int_cst (TREE_TYPE (off), adjust));
9857
9858   t = fold_convert (sizetype, t);
9859   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9860
9861   if (is_ha)
9862     {
9863       /* type ha; // treat as "struct {ftype field[n];}"
9864          ... [computing offs]
9865          for (i = 0; i <nregs; ++i, offs += 16)
9866            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9867          return ha;  */
9868       int i;
9869       tree tmp_ha, field_t, field_ptr_t;
9870
9871       /* Declare a local variable.  */
9872       tmp_ha = create_tmp_var_raw (type, "ha");
9873       gimple_add_tmp_var (tmp_ha);
9874
9875       /* Establish the base type.  */
9876       switch (ag_mode)
9877         {
9878         case SFmode:
9879           field_t = float_type_node;
9880           field_ptr_t = float_ptr_type_node;
9881           break;
9882         case DFmode:
9883           field_t = double_type_node;
9884           field_ptr_t = double_ptr_type_node;
9885           break;
9886         case TFmode:
9887           field_t = long_double_type_node;
9888           field_ptr_t = long_double_ptr_type_node;
9889           break;
9890 /* The half precision and quad precision are not fully supported yet.  Enable
9891    the following code after the support is complete.  Need to find the correct
9892    type node for __fp16 *.  */
9893 #if 0
9894         case HFmode:
9895           field_t = float_type_node;
9896           field_ptr_t = float_ptr_type_node;
9897           break;
9898 #endif
9899         case V2SImode:
9900         case V4SImode:
9901             {
9902               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
9903               field_t = build_vector_type_for_mode (innertype, ag_mode);
9904               field_ptr_t = build_pointer_type (field_t);
9905             }
9906           break;
9907         default:
9908           gcc_assert (0);
9909         }
9910
9911       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
9912       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
9913       addr = t;
9914       t = fold_convert (field_ptr_t, addr);
9915       t = build2 (MODIFY_EXPR, field_t,
9916                   build1 (INDIRECT_REF, field_t, tmp_ha),
9917                   build1 (INDIRECT_REF, field_t, t));
9918
9919       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
9920       for (i = 1; i < nregs; ++i)
9921         {
9922           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
9923           u = fold_convert (field_ptr_t, addr);
9924           u = build2 (MODIFY_EXPR, field_t,
9925                       build2 (MEM_REF, field_t, tmp_ha,
9926                               build_int_cst (field_ptr_t,
9927                                              (i *
9928                                               int_size_in_bytes (field_t)))),
9929                       build1 (INDIRECT_REF, field_t, u));
9930           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
9931         }
9932
9933       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
9934       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
9935     }
9936
9937   COND_EXPR_ELSE (cond2) = t;
9938   addr = fold_convert (build_pointer_type (type), cond1);
9939   addr = build_va_arg_indirect_ref (addr);
9940
9941   if (indirect_p)
9942     addr = build_va_arg_indirect_ref (addr);
9943
9944   return addr;
9945 }
9946
9947 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
9948
9949 static void
9950 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9951                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
9952                                 int no_rtl)
9953 {
9954   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9955   CUMULATIVE_ARGS local_cum;
9956   int gr_saved, vr_saved;
9957
9958   /* The caller has advanced CUM up to, but not beyond, the last named
9959      argument.  Advance a local copy of CUM past the last "real" named
9960      argument, to find out how many registers are left over.  */
9961   local_cum = *cum;
9962   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
9963
9964   /* Found out how many registers we need to save.  */
9965   gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
9966   vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
9967
9968   if (!TARGET_FLOAT)
9969     {
9970       gcc_assert (local_cum.aapcs_nvrn == 0);
9971       vr_saved = 0;
9972     }
9973
9974   if (!no_rtl)
9975     {
9976       if (gr_saved > 0)
9977         {
9978           rtx ptr, mem;
9979
9980           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
9981           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
9982                                - gr_saved * UNITS_PER_WORD);
9983           mem = gen_frame_mem (BLKmode, ptr);
9984           set_mem_alias_set (mem, get_varargs_alias_set ());
9985
9986           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
9987                                mem, gr_saved);
9988         }
9989       if (vr_saved > 0)
9990         {
9991           /* We can't use move_block_from_reg, because it will use
9992              the wrong mode, storing D regs only.  */
9993           machine_mode mode = TImode;
9994           int off, i;
9995
9996           /* Set OFF to the offset from virtual_incoming_args_rtx of
9997              the first vector register.  The VR save area lies below
9998              the GR one, and is aligned to 16 bytes.  */
9999           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10000                            STACK_BOUNDARY / BITS_PER_UNIT);
10001           off -= vr_saved * UNITS_PER_VREG;
10002
10003           for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
10004             {
10005               rtx ptr, mem;
10006
10007               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10008               mem = gen_frame_mem (mode, ptr);
10009               set_mem_alias_set (mem, get_varargs_alias_set ());
10010               aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
10011               off += UNITS_PER_VREG;
10012             }
10013         }
10014     }
10015
10016   /* We don't save the size into *PRETEND_SIZE because we want to avoid
10017      any complication of having crtl->args.pretend_args_size changed.  */
10018   cfun->machine->frame.saved_varargs_size
10019     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10020                  STACK_BOUNDARY / BITS_PER_UNIT)
10021        + vr_saved * UNITS_PER_VREG);
10022 }
10023
10024 static void
10025 aarch64_conditional_register_usage (void)
10026 {
10027   int i;
10028   if (!TARGET_FLOAT)
10029     {
10030       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10031         {
10032           fixed_regs[i] = 1;
10033           call_used_regs[i] = 1;
10034         }
10035     }
10036 }
10037
10038 /* Walk down the type tree of TYPE counting consecutive base elements.
10039    If *MODEP is VOIDmode, then set it to the first valid floating point
10040    type.  If a non-floating point type is found, or if a floating point
10041    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10042    otherwise return the count in the sub-tree.  */
10043 static int
10044 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10045 {
10046   machine_mode mode;
10047   HOST_WIDE_INT size;
10048
10049   switch (TREE_CODE (type))
10050     {
10051     case REAL_TYPE:
10052       mode = TYPE_MODE (type);
10053       if (mode != DFmode && mode != SFmode && mode != TFmode)
10054         return -1;
10055
10056       if (*modep == VOIDmode)
10057         *modep = mode;
10058
10059       if (*modep == mode)
10060         return 1;
10061
10062       break;
10063
10064     case COMPLEX_TYPE:
10065       mode = TYPE_MODE (TREE_TYPE (type));
10066       if (mode != DFmode && mode != SFmode && mode != TFmode)
10067         return -1;
10068
10069       if (*modep == VOIDmode)
10070         *modep = mode;
10071
10072       if (*modep == mode)
10073         return 2;
10074
10075       break;
10076
10077     case VECTOR_TYPE:
10078       /* Use V2SImode and V4SImode as representatives of all 64-bit
10079          and 128-bit vector types.  */
10080       size = int_size_in_bytes (type);
10081       switch (size)
10082         {
10083         case 8:
10084           mode = V2SImode;
10085           break;
10086         case 16:
10087           mode = V4SImode;
10088           break;
10089         default:
10090           return -1;
10091         }
10092
10093       if (*modep == VOIDmode)
10094         *modep = mode;
10095
10096       /* Vector modes are considered to be opaque: two vectors are
10097          equivalent for the purposes of being homogeneous aggregates
10098          if they are the same size.  */
10099       if (*modep == mode)
10100         return 1;
10101
10102       break;
10103
10104     case ARRAY_TYPE:
10105       {
10106         int count;
10107         tree index = TYPE_DOMAIN (type);
10108
10109         /* Can't handle incomplete types nor sizes that are not
10110            fixed.  */
10111         if (!COMPLETE_TYPE_P (type)
10112             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10113           return -1;
10114
10115         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10116         if (count == -1
10117             || !index
10118             || !TYPE_MAX_VALUE (index)
10119             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10120             || !TYPE_MIN_VALUE (index)
10121             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10122             || count < 0)
10123           return -1;
10124
10125         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10126                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10127
10128         /* There must be no padding.  */
10129         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10130           return -1;
10131
10132         return count;
10133       }
10134
10135     case RECORD_TYPE:
10136       {
10137         int count = 0;
10138         int sub_count;
10139         tree field;
10140
10141         /* Can't handle incomplete types nor sizes that are not
10142            fixed.  */
10143         if (!COMPLETE_TYPE_P (type)
10144             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10145           return -1;
10146
10147         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10148           {
10149             if (TREE_CODE (field) != FIELD_DECL)
10150               continue;
10151
10152             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10153             if (sub_count < 0)
10154               return -1;
10155             count += sub_count;
10156           }
10157
10158         /* There must be no padding.  */
10159         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10160           return -1;
10161
10162         return count;
10163       }
10164
10165     case UNION_TYPE:
10166     case QUAL_UNION_TYPE:
10167       {
10168         /* These aren't very interesting except in a degenerate case.  */
10169         int count = 0;
10170         int sub_count;
10171         tree field;
10172
10173         /* Can't handle incomplete types nor sizes that are not
10174            fixed.  */
10175         if (!COMPLETE_TYPE_P (type)
10176             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10177           return -1;
10178
10179         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10180           {
10181             if (TREE_CODE (field) != FIELD_DECL)
10182               continue;
10183
10184             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10185             if (sub_count < 0)
10186               return -1;
10187             count = count > sub_count ? count : sub_count;
10188           }
10189
10190         /* There must be no padding.  */
10191         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10192           return -1;
10193
10194         return count;
10195       }
10196
10197     default:
10198       break;
10199     }
10200
10201   return -1;
10202 }
10203
10204 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10205    type as described in AAPCS64 \S 4.1.2.
10206
10207    See the comment above aarch64_composite_type_p for the notes on MODE.  */
10208
10209 static bool
10210 aarch64_short_vector_p (const_tree type,
10211                         machine_mode mode)
10212 {
10213   HOST_WIDE_INT size = -1;
10214
10215   if (type && TREE_CODE (type) == VECTOR_TYPE)
10216     size = int_size_in_bytes (type);
10217   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10218             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10219     size = GET_MODE_SIZE (mode);
10220
10221   return (size == 8 || size == 16);
10222 }
10223
10224 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10225    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
10226    array types.  The C99 floating-point complex types are also considered
10227    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
10228    types, which are GCC extensions and out of the scope of AAPCS64, are
10229    treated as composite types here as well.
10230
10231    Note that MODE itself is not sufficient in determining whether a type
10232    is such a composite type or not.  This is because
10233    stor-layout.c:compute_record_mode may have already changed the MODE
10234    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
10235    structure with only one field may have its MODE set to the mode of the
10236    field.  Also an integer mode whose size matches the size of the
10237    RECORD_TYPE type may be used to substitute the original mode
10238    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
10239    solely relied on.  */
10240
10241 static bool
10242 aarch64_composite_type_p (const_tree type,
10243                           machine_mode mode)
10244 {
10245   if (aarch64_short_vector_p (type, mode))
10246     return false;
10247
10248   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10249     return true;
10250
10251   if (mode == BLKmode
10252       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10253       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10254     return true;
10255
10256   return false;
10257 }
10258
10259 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10260    shall be passed or returned in simd/fp register(s) (providing these
10261    parameter passing registers are available).
10262
10263    Upon successful return, *COUNT returns the number of needed registers,
10264    *BASE_MODE returns the mode of the individual register and when IS_HAF
10265    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10266    floating-point aggregate or a homogeneous short-vector aggregate.  */
10267
10268 static bool
10269 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10270                                          const_tree type,
10271                                          machine_mode *base_mode,
10272                                          int *count,
10273                                          bool *is_ha)
10274 {
10275   machine_mode new_mode = VOIDmode;
10276   bool composite_p = aarch64_composite_type_p (type, mode);
10277
10278   if (is_ha != NULL) *is_ha = false;
10279
10280   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10281       || aarch64_short_vector_p (type, mode))
10282     {
10283       *count = 1;
10284       new_mode = mode;
10285     }
10286   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10287     {
10288       if (is_ha != NULL) *is_ha = true;
10289       *count = 2;
10290       new_mode = GET_MODE_INNER (mode);
10291     }
10292   else if (type && composite_p)
10293     {
10294       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10295
10296       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10297         {
10298           if (is_ha != NULL) *is_ha = true;
10299           *count = ag_count;
10300         }
10301       else
10302         return false;
10303     }
10304   else
10305     return false;
10306
10307   *base_mode = new_mode;
10308   return true;
10309 }
10310
10311 /* Implement TARGET_STRUCT_VALUE_RTX.  */
10312
10313 static rtx
10314 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10315                           int incoming ATTRIBUTE_UNUSED)
10316 {
10317   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10318 }
10319
10320 /* Implements target hook vector_mode_supported_p.  */
10321 static bool
10322 aarch64_vector_mode_supported_p (machine_mode mode)
10323 {
10324   if (TARGET_SIMD
10325       && (mode == V4SImode  || mode == V8HImode
10326           || mode == V16QImode || mode == V2DImode
10327           || mode == V2SImode  || mode == V4HImode
10328           || mode == V8QImode || mode == V2SFmode
10329           || mode == V4SFmode || mode == V2DFmode
10330           || mode == V4HFmode || mode == V8HFmode
10331           || mode == V1DFmode))
10332     return true;
10333
10334   return false;
10335 }
10336
10337 /* Return appropriate SIMD container
10338    for MODE within a vector of WIDTH bits.  */
10339 static machine_mode
10340 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10341 {
10342   gcc_assert (width == 64 || width == 128);
10343   if (TARGET_SIMD)
10344     {
10345       if (width == 128)
10346         switch (mode)
10347           {
10348           case DFmode:
10349             return V2DFmode;
10350           case SFmode:
10351             return V4SFmode;
10352           case SImode:
10353             return V4SImode;
10354           case HImode:
10355             return V8HImode;
10356           case QImode:
10357             return V16QImode;
10358           case DImode:
10359             return V2DImode;
10360           default:
10361             break;
10362           }
10363       else
10364         switch (mode)
10365           {
10366           case SFmode:
10367             return V2SFmode;
10368           case SImode:
10369             return V2SImode;
10370           case HImode:
10371             return V4HImode;
10372           case QImode:
10373             return V8QImode;
10374           default:
10375             break;
10376           }
10377     }
10378   return word_mode;
10379 }
10380
10381 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
10382 static machine_mode
10383 aarch64_preferred_simd_mode (machine_mode mode)
10384 {
10385   return aarch64_simd_container_mode (mode, 128);
10386 }
10387
10388 /* Return the bitmask of possible vector sizes for the vectorizer
10389    to iterate over.  */
10390 static unsigned int
10391 aarch64_autovectorize_vector_sizes (void)
10392 {
10393   return (16 | 8);
10394 }
10395
10396 /* Implement TARGET_MANGLE_TYPE.  */
10397
10398 static const char *
10399 aarch64_mangle_type (const_tree type)
10400 {
10401   /* The AArch64 ABI documents say that "__va_list" has to be
10402      managled as if it is in the "std" namespace.  */
10403   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10404     return "St9__va_list";
10405
10406   /* Half-precision float.  */
10407   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10408     return "Dh";
10409
10410   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
10411      builtin types.  */
10412   if (TYPE_NAME (type) != NULL)
10413     return aarch64_mangle_builtin_type (type);
10414
10415   /* Use the default mangling.  */
10416   return NULL;
10417 }
10418
10419
10420 /* Return true if the rtx_insn contains a MEM RTX somewhere
10421    in it.  */
10422
10423 static bool
10424 has_memory_op (rtx_insn *mem_insn)
10425 {
10426   subrtx_iterator::array_type array;
10427   FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10428     if (MEM_P (*iter))
10429       return true;
10430
10431   return false;
10432 }
10433
10434 /* Find the first rtx_insn before insn that will generate an assembly
10435    instruction.  */
10436
10437 static rtx_insn *
10438 aarch64_prev_real_insn (rtx_insn *insn)
10439 {
10440   if (!insn)
10441     return NULL;
10442
10443   do
10444     {
10445       insn = prev_real_insn (insn);
10446     }
10447   while (insn && recog_memoized (insn) < 0);
10448
10449   return insn;
10450 }
10451
10452 static bool
10453 is_madd_op (enum attr_type t1)
10454 {
10455   unsigned int i;
10456   /* A number of these may be AArch32 only.  */
10457   enum attr_type mlatypes[] = {
10458     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10459     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10460     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10461   };
10462
10463   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10464     {
10465       if (t1 == mlatypes[i])
10466         return true;
10467     }
10468
10469   return false;
10470 }
10471
10472 /* Check if there is a register dependency between a load and the insn
10473    for which we hold recog_data.  */
10474
10475 static bool
10476 dep_between_memop_and_curr (rtx memop)
10477 {
10478   rtx load_reg;
10479   int opno;
10480
10481   gcc_assert (GET_CODE (memop) == SET);
10482
10483   if (!REG_P (SET_DEST (memop)))
10484     return false;
10485
10486   load_reg = SET_DEST (memop);
10487   for (opno = 1; opno < recog_data.n_operands; opno++)
10488     {
10489       rtx operand = recog_data.operand[opno];
10490       if (REG_P (operand)
10491           && reg_overlap_mentioned_p (load_reg, operand))
10492         return true;
10493
10494     }
10495   return false;
10496 }
10497
10498
10499 /* When working around the Cortex-A53 erratum 835769,
10500    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10501    instruction and has a preceding memory instruction such that a NOP
10502    should be inserted between them.  */
10503
10504 bool
10505 aarch64_madd_needs_nop (rtx_insn* insn)
10506 {
10507   enum attr_type attr_type;
10508   rtx_insn *prev;
10509   rtx body;
10510
10511   if (!TARGET_FIX_ERR_A53_835769)
10512     return false;
10513
10514   if (!INSN_P (insn) || recog_memoized (insn) < 0)
10515     return false;
10516
10517   attr_type = get_attr_type (insn);
10518   if (!is_madd_op (attr_type))
10519     return false;
10520
10521   prev = aarch64_prev_real_insn (insn);
10522   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10523      Restore recog state to INSN to avoid state corruption.  */
10524   extract_constrain_insn_cached (insn);
10525
10526   if (!prev || !has_memory_op (prev))
10527     return false;
10528
10529   body = single_set (prev);
10530
10531   /* If the previous insn is a memory op and there is no dependency between
10532      it and the DImode madd, emit a NOP between them.  If body is NULL then we
10533      have a complex memory operation, probably a load/store pair.
10534      Be conservative for now and emit a NOP.  */
10535   if (GET_MODE (recog_data.operand[0]) == DImode
10536       && (!body || !dep_between_memop_and_curr (body)))
10537     return true;
10538
10539   return false;
10540
10541 }
10542
10543
10544 /* Implement FINAL_PRESCAN_INSN.  */
10545
10546 void
10547 aarch64_final_prescan_insn (rtx_insn *insn)
10548 {
10549   if (aarch64_madd_needs_nop (insn))
10550     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
10551 }
10552
10553
10554 /* Return the equivalent letter for size.  */
10555 static char
10556 sizetochar (int size)
10557 {
10558   switch (size)
10559     {
10560     case 64: return 'd';
10561     case 32: return 's';
10562     case 16: return 'h';
10563     case 8 : return 'b';
10564     default: gcc_unreachable ();
10565     }
10566 }
10567
10568 /* Return true iff x is a uniform vector of floating-point
10569    constants, and the constant can be represented in
10570    quarter-precision form.  Note, as aarch64_float_const_representable
10571    rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
10572 static bool
10573 aarch64_vect_float_const_representable_p (rtx x)
10574 {
10575   rtx elt;
10576   return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10577           && const_vec_duplicate_p (x, &elt)
10578           && aarch64_float_const_representable_p (elt));
10579 }
10580
10581 /* Return true for valid and false for invalid.  */
10582 bool
10583 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10584                               struct simd_immediate_info *info)
10585 {
10586 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)  \
10587   matches = 1;                                          \
10588   for (i = 0; i < idx; i += (STRIDE))                   \
10589     if (!(TEST))                                        \
10590       matches = 0;                                      \
10591   if (matches)                                          \
10592     {                                                   \
10593       immtype = (CLASS);                                \
10594       elsize = (ELSIZE);                                \
10595       eshift = (SHIFT);                                 \
10596       emvn = (NEG);                                     \
10597       break;                                            \
10598     }
10599
10600   unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10601   unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10602   unsigned char bytes[16];
10603   int immtype = -1, matches;
10604   unsigned int invmask = inverse ? 0xff : 0;
10605   int eshift, emvn;
10606
10607   if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10608     {
10609       if (! (aarch64_simd_imm_zero_p (op, mode)
10610              || aarch64_vect_float_const_representable_p (op)))
10611         return false;
10612
10613       if (info)
10614         {
10615           info->value = CONST_VECTOR_ELT (op, 0);
10616           info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10617           info->mvn = false;
10618           info->shift = 0;
10619         }
10620
10621       return true;
10622     }
10623
10624   /* Splat vector constant out into a byte vector.  */
10625   for (i = 0; i < n_elts; i++)
10626     {
10627       /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
10628          it must be laid out in the vector register in reverse order.  */
10629       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10630       unsigned HOST_WIDE_INT elpart;
10631
10632       gcc_assert (CONST_INT_P (el));
10633       elpart = INTVAL (el);
10634
10635       for (unsigned int byte = 0; byte < innersize; byte++)
10636         {
10637           bytes[idx++] = (elpart & 0xff) ^ invmask;
10638           elpart >>= BITS_PER_UNIT;
10639         }
10640
10641     }
10642
10643   /* Sanity check.  */
10644   gcc_assert (idx == GET_MODE_SIZE (mode));
10645
10646   do
10647     {
10648       CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10649              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10650
10651       CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10652              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10653
10654       CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10655              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10656
10657       CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10658              && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10659
10660       CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10661
10662       CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10663
10664       CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10665              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10666
10667       CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10668              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10669
10670       CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10671              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10672
10673       CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10674              && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10675
10676       CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10677
10678       CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10679
10680       CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10681              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10682
10683       CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10684              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10685
10686       CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10687              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10688
10689       CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10690              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10691
10692       CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10693
10694       CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10695              && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10696     }
10697   while (0);
10698
10699   if (immtype == -1)
10700     return false;
10701
10702   if (info)
10703     {
10704       info->element_width = elsize;
10705       info->mvn = emvn != 0;
10706       info->shift = eshift;
10707
10708       unsigned HOST_WIDE_INT imm = 0;
10709
10710       if (immtype >= 12 && immtype <= 15)
10711         info->msl = true;
10712
10713       /* Un-invert bytes of recognized vector, if necessary.  */
10714       if (invmask != 0)
10715         for (i = 0; i < idx; i++)
10716           bytes[i] ^= invmask;
10717
10718       if (immtype == 17)
10719         {
10720           /* FIXME: Broken on 32-bit H_W_I hosts.  */
10721           gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10722
10723           for (i = 0; i < 8; i++)
10724             imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10725               << (i * BITS_PER_UNIT);
10726
10727
10728           info->value = GEN_INT (imm);
10729         }
10730       else
10731         {
10732           for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10733             imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10734
10735           /* Construct 'abcdefgh' because the assembler cannot handle
10736              generic constants.  */
10737           if (info->mvn)
10738             imm = ~imm;
10739           imm = (imm >> info->shift) & 0xff;
10740           info->value = GEN_INT (imm);
10741         }
10742     }
10743
10744   return true;
10745 #undef CHECK
10746 }
10747
10748 /* Check of immediate shift constants are within range.  */
10749 bool
10750 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10751 {
10752   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10753   if (left)
10754     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10755   else
10756     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
10757 }
10758
10759 /* Return true if X is a uniform vector where all elements
10760    are either the floating-point constant 0.0 or the
10761    integer constant 0.  */
10762 bool
10763 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10764 {
10765   return x == CONST0_RTX (mode);
10766 }
10767
10768
10769 /* Return the bitmask CONST_INT to select the bits required by a zero extract
10770    operation of width WIDTH at bit position POS.  */
10771
10772 rtx
10773 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
10774 {
10775   gcc_assert (CONST_INT_P (width));
10776   gcc_assert (CONST_INT_P (pos));
10777
10778   unsigned HOST_WIDE_INT mask
10779     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
10780   return GEN_INT (mask << UINTVAL (pos));
10781 }
10782
10783 bool
10784 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10785 {
10786   HOST_WIDE_INT imm = INTVAL (x);
10787   int i;
10788
10789   for (i = 0; i < 8; i++)
10790     {
10791       unsigned int byte = imm & 0xff;
10792       if (byte != 0xff && byte != 0)
10793        return false;
10794       imm >>= 8;
10795     }
10796
10797   return true;
10798 }
10799
10800 bool
10801 aarch64_mov_operand_p (rtx x, machine_mode mode)
10802 {
10803   if (GET_CODE (x) == HIGH
10804       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10805     return true;
10806
10807   if (CONST_INT_P (x))
10808     return true;
10809
10810   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10811     return true;
10812
10813   return aarch64_classify_symbolic_expression (x)
10814     == SYMBOL_TINY_ABSOLUTE;
10815 }
10816
10817 /* Return a const_int vector of VAL.  */
10818 rtx
10819 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10820 {
10821   int nunits = GET_MODE_NUNITS (mode);
10822   rtvec v = rtvec_alloc (nunits);
10823   int i;
10824
10825   for (i=0; i < nunits; i++)
10826     RTVEC_ELT (v, i) = GEN_INT (val);
10827
10828   return gen_rtx_CONST_VECTOR (mode, v);
10829 }
10830
10831 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
10832
10833 bool
10834 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10835 {
10836   machine_mode vmode;
10837
10838   gcc_assert (!VECTOR_MODE_P (mode));
10839   vmode = aarch64_preferred_simd_mode (mode);
10840   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10841   return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10842 }
10843
10844 /* Construct and return a PARALLEL RTX vector with elements numbering the
10845    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10846    the vector - from the perspective of the architecture.  This does not
10847    line up with GCC's perspective on lane numbers, so we end up with
10848    different masks depending on our target endian-ness.  The diagram
10849    below may help.  We must draw the distinction when building masks
10850    which select one half of the vector.  An instruction selecting
10851    architectural low-lanes for a big-endian target, must be described using
10852    a mask selecting GCC high-lanes.
10853
10854                  Big-Endian             Little-Endian
10855
10856 GCC             0   1   2   3           3   2   1   0
10857               | x | x | x | x |       | x | x | x | x |
10858 Architecture    3   2   1   0           3   2   1   0
10859
10860 Low Mask:         { 2, 3 }                { 0, 1 }
10861 High Mask:        { 0, 1 }                { 2, 3 }
10862 */
10863
10864 rtx
10865 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10866 {
10867   int nunits = GET_MODE_NUNITS (mode);
10868   rtvec v = rtvec_alloc (nunits / 2);
10869   int high_base = nunits / 2;
10870   int low_base = 0;
10871   int base;
10872   rtx t1;
10873   int i;
10874
10875   if (BYTES_BIG_ENDIAN)
10876     base = high ? low_base : high_base;
10877   else
10878     base = high ? high_base : low_base;
10879
10880   for (i = 0; i < nunits / 2; i++)
10881     RTVEC_ELT (v, i) = GEN_INT (base + i);
10882
10883   t1 = gen_rtx_PARALLEL (mode, v);
10884   return t1;
10885 }
10886
10887 /* Check OP for validity as a PARALLEL RTX vector with elements
10888    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
10889    from the perspective of the architecture.  See the diagram above
10890    aarch64_simd_vect_par_cnst_half for more details.  */
10891
10892 bool
10893 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
10894                                        bool high)
10895 {
10896   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
10897   HOST_WIDE_INT count_op = XVECLEN (op, 0);
10898   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
10899   int i = 0;
10900
10901   if (!VECTOR_MODE_P (mode))
10902     return false;
10903
10904   if (count_op != count_ideal)
10905     return false;
10906
10907   for (i = 0; i < count_ideal; i++)
10908     {
10909       rtx elt_op = XVECEXP (op, 0, i);
10910       rtx elt_ideal = XVECEXP (ideal, 0, i);
10911
10912       if (!CONST_INT_P (elt_op)
10913           || INTVAL (elt_ideal) != INTVAL (elt_op))
10914         return false;
10915     }
10916   return true;
10917 }
10918
10919 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
10920    HIGH (exclusive).  */
10921 void
10922 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
10923                           const_tree exp)
10924 {
10925   HOST_WIDE_INT lane;
10926   gcc_assert (CONST_INT_P (operand));
10927   lane = INTVAL (operand);
10928
10929   if (lane < low || lane >= high)
10930   {
10931     if (exp)
10932       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
10933     else
10934       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
10935   }
10936 }
10937
10938 /* Return TRUE if OP is a valid vector addressing mode.  */
10939 bool
10940 aarch64_simd_mem_operand_p (rtx op)
10941 {
10942   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
10943                         || REG_P (XEXP (op, 0)));
10944 }
10945
10946 /* Emit a register copy from operand to operand, taking care not to
10947    early-clobber source registers in the process.
10948
10949    COUNT is the number of components into which the copy needs to be
10950    decomposed.  */
10951 void
10952 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
10953                                 unsigned int count)
10954 {
10955   unsigned int i;
10956   int rdest = REGNO (operands[0]);
10957   int rsrc = REGNO (operands[1]);
10958
10959   if (!reg_overlap_mentioned_p (operands[0], operands[1])
10960       || rdest < rsrc)
10961     for (i = 0; i < count; i++)
10962       emit_move_insn (gen_rtx_REG (mode, rdest + i),
10963                       gen_rtx_REG (mode, rsrc + i));
10964   else
10965     for (i = 0; i < count; i++)
10966       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
10967                       gen_rtx_REG (mode, rsrc + count - i - 1));
10968 }
10969
10970 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
10971    one of VSTRUCT modes: OI, CI or XI.  */
10972 int
10973 aarch64_simd_attr_length_move (rtx_insn *insn)
10974 {
10975   machine_mode mode;
10976
10977   extract_insn_cached (insn);
10978
10979   if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
10980     {
10981       mode = GET_MODE (recog_data.operand[0]);
10982       switch (mode)
10983         {
10984         case OImode:
10985           return 8;
10986         case CImode:
10987           return 12;
10988         case XImode:
10989           return 16;
10990         default:
10991           gcc_unreachable ();
10992         }
10993     }
10994   return 4;
10995 }
10996
10997 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
10998    one of VSTRUCT modes: OI, CI, or XI.  */
10999 int
11000 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11001 {
11002   return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11003 }
11004
11005 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
11006    alignment of a vector to 128 bits.  */
11007 static HOST_WIDE_INT
11008 aarch64_simd_vector_alignment (const_tree type)
11009 {
11010   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11011   return MIN (align, 128);
11012 }
11013
11014 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
11015 static bool
11016 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11017 {
11018   if (is_packed)
11019     return false;
11020
11021   /* We guarantee alignment for vectors up to 128-bits.  */
11022   if (tree_int_cst_compare (TYPE_SIZE (type),
11023                             bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11024     return false;
11025
11026   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
11027   return true;
11028 }
11029
11030 /* If VALS is a vector constant that can be loaded into a register
11031    using DUP, generate instructions to do so and return an RTX to
11032    assign to the register.  Otherwise return NULL_RTX.  */
11033 static rtx
11034 aarch64_simd_dup_constant (rtx vals)
11035 {
11036   machine_mode mode = GET_MODE (vals);
11037   machine_mode inner_mode = GET_MODE_INNER (mode);
11038   rtx x;
11039
11040   if (!const_vec_duplicate_p (vals, &x))
11041     return NULL_RTX;
11042
11043   /* We can load this constant by using DUP and a constant in a
11044      single ARM register.  This will be cheaper than a vector
11045      load.  */
11046   x = copy_to_mode_reg (inner_mode, x);
11047   return gen_rtx_VEC_DUPLICATE (mode, x);
11048 }
11049
11050
11051 /* Generate code to load VALS, which is a PARALLEL containing only
11052    constants (for vec_init) or CONST_VECTOR, efficiently into a
11053    register.  Returns an RTX to copy into the register, or NULL_RTX
11054    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
11055 static rtx
11056 aarch64_simd_make_constant (rtx vals)
11057 {
11058   machine_mode mode = GET_MODE (vals);
11059   rtx const_dup;
11060   rtx const_vec = NULL_RTX;
11061   int n_elts = GET_MODE_NUNITS (mode);
11062   int n_const = 0;
11063   int i;
11064
11065   if (GET_CODE (vals) == CONST_VECTOR)
11066     const_vec = vals;
11067   else if (GET_CODE (vals) == PARALLEL)
11068     {
11069       /* A CONST_VECTOR must contain only CONST_INTs and
11070          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11071          Only store valid constants in a CONST_VECTOR.  */
11072       for (i = 0; i < n_elts; ++i)
11073         {
11074           rtx x = XVECEXP (vals, 0, i);
11075           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11076             n_const++;
11077         }
11078       if (n_const == n_elts)
11079         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11080     }
11081   else
11082     gcc_unreachable ();
11083
11084   if (const_vec != NULL_RTX
11085       && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11086     /* Load using MOVI/MVNI.  */
11087     return const_vec;
11088   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11089     /* Loaded using DUP.  */
11090     return const_dup;
11091   else if (const_vec != NULL_RTX)
11092     /* Load from constant pool. We can not take advantage of single-cycle
11093        LD1 because we need a PC-relative addressing mode.  */
11094     return const_vec;
11095   else
11096     /* A PARALLEL containing something not valid inside CONST_VECTOR.
11097        We can not construct an initializer.  */
11098     return NULL_RTX;
11099 }
11100
11101 void
11102 aarch64_expand_vector_init (rtx target, rtx vals)
11103 {
11104   machine_mode mode = GET_MODE (target);
11105   machine_mode inner_mode = GET_MODE_INNER (mode);
11106   int n_elts = GET_MODE_NUNITS (mode);
11107   int n_var = 0;
11108   rtx any_const = NULL_RTX;
11109   bool all_same = true;
11110
11111   for (int i = 0; i < n_elts; ++i)
11112     {
11113       rtx x = XVECEXP (vals, 0, i);
11114       if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
11115         ++n_var;
11116       else
11117         any_const = x;
11118
11119       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
11120         all_same = false;
11121     }
11122
11123   if (n_var == 0)
11124     {
11125       rtx constant = aarch64_simd_make_constant (vals);
11126       if (constant != NULL_RTX)
11127         {
11128           emit_move_insn (target, constant);
11129           return;
11130         }
11131     }
11132
11133   /* Splat a single non-constant element if we can.  */
11134   if (all_same)
11135     {
11136       rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
11137       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11138       return;
11139     }
11140
11141   /* Half the fields (or less) are non-constant.  Load constant then overwrite
11142      varying fields.  Hope that this is more efficient than using the stack.  */
11143   if (n_var <= n_elts/2)
11144     {
11145       rtx copy = copy_rtx (vals);
11146
11147       /* Load constant part of vector.  We really don't care what goes into the
11148          parts we will overwrite, but we're more likely to be able to load the
11149          constant efficiently if it has fewer, larger, repeating parts
11150          (see aarch64_simd_valid_immediate).  */
11151       for (int i = 0; i < n_elts; i++)
11152         {
11153           rtx x = XVECEXP (vals, 0, i);
11154           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11155             continue;
11156           rtx subst = any_const;
11157           for (int bit = n_elts / 2; bit > 0; bit /= 2)
11158             {
11159               /* Look in the copied vector, as more elements are const.  */
11160               rtx test = XVECEXP (copy, 0, i ^ bit);
11161               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11162                 {
11163                   subst = test;
11164                   break;
11165                 }
11166             }
11167           XVECEXP (copy, 0, i) = subst;
11168         }
11169       aarch64_expand_vector_init (target, copy);
11170
11171       /* Insert variables.  */
11172       enum insn_code icode = optab_handler (vec_set_optab, mode);
11173       gcc_assert (icode != CODE_FOR_nothing);
11174
11175       for (int i = 0; i < n_elts; i++)
11176         {
11177           rtx x = XVECEXP (vals, 0, i);
11178           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11179             continue;
11180           x = copy_to_mode_reg (inner_mode, x);
11181           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11182         }
11183       return;
11184     }
11185
11186   /* Construct the vector in memory one field at a time
11187      and load the whole vector.  */
11188   rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
11189   for (int i = 0; i < n_elts; i++)
11190     emit_move_insn (adjust_address_nv (mem, inner_mode,
11191                                     i * GET_MODE_SIZE (inner_mode)),
11192                     XVECEXP (vals, 0, i));
11193   emit_move_insn (target, mem);
11194
11195 }
11196
11197 static unsigned HOST_WIDE_INT
11198 aarch64_shift_truncation_mask (machine_mode mode)
11199 {
11200   return
11201     (aarch64_vector_mode_supported_p (mode)
11202      || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11203 }
11204
11205 /* Select a format to encode pointers in exception handling data.  */
11206 int
11207 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11208 {
11209    int type;
11210    switch (aarch64_cmodel)
11211      {
11212      case AARCH64_CMODEL_TINY:
11213      case AARCH64_CMODEL_TINY_PIC:
11214      case AARCH64_CMODEL_SMALL:
11215      case AARCH64_CMODEL_SMALL_PIC:
11216      case AARCH64_CMODEL_SMALL_SPIC:
11217        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
11218           for everything.  */
11219        type = DW_EH_PE_sdata4;
11220        break;
11221      default:
11222        /* No assumptions here.  8-byte relocs required.  */
11223        type = DW_EH_PE_sdata8;
11224        break;
11225      }
11226    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11227 }
11228
11229 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
11230    by the function fndecl.  */
11231
11232 void
11233 aarch64_declare_function_name (FILE *stream, const char* name,
11234                                 tree fndecl)
11235 {
11236   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11237
11238   struct cl_target_option *targ_options;
11239   if (target_parts)
11240     targ_options = TREE_TARGET_OPTION (target_parts);
11241   else
11242     targ_options = TREE_TARGET_OPTION (target_option_current_node);
11243   gcc_assert (targ_options);
11244
11245   const struct processor *this_arch
11246     = aarch64_get_arch (targ_options->x_explicit_arch);
11247
11248   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11249   std::string extension
11250     = aarch64_get_extension_string_for_isa_flags (isa_flags);
11251   asm_fprintf (asm_out_file, "\t.arch %s%s\n",
11252                this_arch->name, extension.c_str ());
11253
11254   /* Print the cpu name we're tuning for in the comments, might be
11255      useful to readers of the generated asm.  */
11256
11257   const struct processor *this_tune
11258     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11259
11260   asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11261                this_tune->name);
11262
11263   /* Don't forget the type directive for ELF.  */
11264   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11265   ASM_OUTPUT_LABEL (stream, name);
11266 }
11267
11268 /* Emit load exclusive.  */
11269
11270 static void
11271 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11272                              rtx mem, rtx model_rtx)
11273 {
11274   rtx (*gen) (rtx, rtx, rtx);
11275
11276   switch (mode)
11277     {
11278     case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11279     case HImode: gen = gen_aarch64_load_exclusivehi; break;
11280     case SImode: gen = gen_aarch64_load_exclusivesi; break;
11281     case DImode: gen = gen_aarch64_load_exclusivedi; break;
11282     default:
11283       gcc_unreachable ();
11284     }
11285
11286   emit_insn (gen (rval, mem, model_rtx));
11287 }
11288
11289 /* Emit store exclusive.  */
11290
11291 static void
11292 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11293                               rtx rval, rtx mem, rtx model_rtx)
11294 {
11295   rtx (*gen) (rtx, rtx, rtx, rtx);
11296
11297   switch (mode)
11298     {
11299     case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11300     case HImode: gen = gen_aarch64_store_exclusivehi; break;
11301     case SImode: gen = gen_aarch64_store_exclusivesi; break;
11302     case DImode: gen = gen_aarch64_store_exclusivedi; break;
11303     default:
11304       gcc_unreachable ();
11305     }
11306
11307   emit_insn (gen (bval, rval, mem, model_rtx));
11308 }
11309
11310 /* Mark the previous jump instruction as unlikely.  */
11311
11312 static void
11313 aarch64_emit_unlikely_jump (rtx insn)
11314 {
11315   int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11316
11317   insn = emit_jump_insn (insn);
11318   add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
11319 }
11320
11321 /* Expand a compare and swap pattern.  */
11322
11323 void
11324 aarch64_expand_compare_and_swap (rtx operands[])
11325 {
11326   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11327   machine_mode mode, cmp_mode;
11328   typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11329   int idx;
11330   gen_cas_fn gen;
11331   const gen_cas_fn split_cas[] =
11332   {
11333     gen_aarch64_compare_and_swapqi,
11334     gen_aarch64_compare_and_swaphi,
11335     gen_aarch64_compare_and_swapsi,
11336     gen_aarch64_compare_and_swapdi
11337   };
11338   const gen_cas_fn atomic_cas[] =
11339   {
11340     gen_aarch64_compare_and_swapqi_lse,
11341     gen_aarch64_compare_and_swaphi_lse,
11342     gen_aarch64_compare_and_swapsi_lse,
11343     gen_aarch64_compare_and_swapdi_lse
11344   };
11345
11346   bval = operands[0];
11347   rval = operands[1];
11348   mem = operands[2];
11349   oldval = operands[3];
11350   newval = operands[4];
11351   is_weak = operands[5];
11352   mod_s = operands[6];
11353   mod_f = operands[7];
11354   mode = GET_MODE (mem);
11355   cmp_mode = mode;
11356
11357   /* Normally the succ memory model must be stronger than fail, but in the
11358      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11359      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
11360
11361   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11362       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11363     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11364
11365   switch (mode)
11366     {
11367     case QImode:
11368     case HImode:
11369       /* For short modes, we're going to perform the comparison in SImode,
11370          so do the zero-extension now.  */
11371       cmp_mode = SImode;
11372       rval = gen_reg_rtx (SImode);
11373       oldval = convert_modes (SImode, mode, oldval, true);
11374       /* Fall through.  */
11375
11376     case SImode:
11377     case DImode:
11378       /* Force the value into a register if needed.  */
11379       if (!aarch64_plus_operand (oldval, mode))
11380         oldval = force_reg (cmp_mode, oldval);
11381       break;
11382
11383     default:
11384       gcc_unreachable ();
11385     }
11386
11387   switch (mode)
11388     {
11389     case QImode: idx = 0; break;
11390     case HImode: idx = 1; break;
11391     case SImode: idx = 2; break;
11392     case DImode: idx = 3; break;
11393     default:
11394       gcc_unreachable ();
11395     }
11396   if (TARGET_LSE)
11397     gen = atomic_cas[idx];
11398   else
11399     gen = split_cas[idx];
11400
11401   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11402
11403   if (mode == QImode || mode == HImode)
11404     emit_move_insn (operands[1], gen_lowpart (mode, rval));
11405
11406   x = gen_rtx_REG (CCmode, CC_REGNUM);
11407   x = gen_rtx_EQ (SImode, x, const0_rtx);
11408   emit_insn (gen_rtx_SET (bval, x));
11409 }
11410
11411 /* Test whether the target supports using a atomic load-operate instruction.
11412    CODE is the operation and AFTER is TRUE if the data in memory after the
11413    operation should be returned and FALSE if the data before the operation
11414    should be returned.  Returns FALSE if the operation isn't supported by the
11415    architecture.  */
11416
11417 bool
11418 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11419 {
11420   if (!TARGET_LSE)
11421     return false;
11422
11423   switch (code)
11424     {
11425     case SET:
11426     case AND:
11427     case IOR:
11428     case XOR:
11429     case MINUS:
11430     case PLUS:
11431       return true;
11432     default:
11433       return false;
11434     }
11435 }
11436
11437 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
11438    sequence implementing an atomic operation.  */
11439
11440 static void
11441 aarch64_emit_post_barrier (enum memmodel model)
11442 {
11443   const enum memmodel base_model = memmodel_base (model);
11444
11445   if (is_mm_sync (model)
11446       && (base_model == MEMMODEL_ACQUIRE
11447           || base_model == MEMMODEL_ACQ_REL
11448           || base_model == MEMMODEL_SEQ_CST))
11449     {
11450       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11451     }
11452 }
11453
11454 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
11455    for the data in memory.  EXPECTED is the value expected to be in memory.
11456    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
11457    is the memory ordering to use.  */
11458
11459 void
11460 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11461                         rtx expected, rtx desired,
11462                         rtx model)
11463 {
11464   rtx (*gen) (rtx, rtx, rtx, rtx);
11465   machine_mode mode;
11466
11467   mode = GET_MODE (mem);
11468
11469   switch (mode)
11470     {
11471     case QImode: gen = gen_aarch64_atomic_casqi; break;
11472     case HImode: gen = gen_aarch64_atomic_cashi; break;
11473     case SImode: gen = gen_aarch64_atomic_cassi; break;
11474     case DImode: gen = gen_aarch64_atomic_casdi; break;
11475     default:
11476       gcc_unreachable ();
11477     }
11478
11479   /* Move the expected value into the CAS destination register.  */
11480   emit_insn (gen_rtx_SET (rval, expected));
11481
11482   /* Emit the CAS.  */
11483   emit_insn (gen (rval, mem, desired, model));
11484
11485   /* Compare the expected value with the value loaded by the CAS, to establish
11486      whether the swap was made.  */
11487   aarch64_gen_compare_reg (EQ, rval, expected);
11488 }
11489
11490 /* Split a compare and swap pattern.  */
11491
11492 void
11493 aarch64_split_compare_and_swap (rtx operands[])
11494 {
11495   rtx rval, mem, oldval, newval, scratch;
11496   machine_mode mode;
11497   bool is_weak;
11498   rtx_code_label *label1, *label2;
11499   rtx x, cond;
11500   enum memmodel model;
11501   rtx model_rtx;
11502
11503   rval = operands[0];
11504   mem = operands[1];
11505   oldval = operands[2];
11506   newval = operands[3];
11507   is_weak = (operands[4] != const0_rtx);
11508   model_rtx = operands[5];
11509   scratch = operands[7];
11510   mode = GET_MODE (mem);
11511   model = memmodel_from_int (INTVAL (model_rtx));
11512
11513   label1 = NULL;
11514   if (!is_weak)
11515     {
11516       label1 = gen_label_rtx ();
11517       emit_label (label1);
11518     }
11519   label2 = gen_label_rtx ();
11520
11521   /* The initial load can be relaxed for a __sync operation since a final
11522      barrier will be emitted to stop code hoisting.  */
11523   if (is_mm_sync (model))
11524     aarch64_emit_load_exclusive (mode, rval, mem,
11525                                  GEN_INT (MEMMODEL_RELAXED));
11526   else
11527     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11528
11529   cond = aarch64_gen_compare_reg (NE, rval, oldval);
11530   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11531   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11532                             gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11533   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11534
11535   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11536
11537   if (!is_weak)
11538     {
11539       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11540       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11541                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11542       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11543     }
11544   else
11545     {
11546       cond = gen_rtx_REG (CCmode, CC_REGNUM);
11547       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11548       emit_insn (gen_rtx_SET (cond, x));
11549     }
11550
11551   emit_label (label2);
11552
11553   /* Emit any final barrier needed for a __sync operation.  */
11554   if (is_mm_sync (model))
11555     aarch64_emit_post_barrier (model);
11556 }
11557
11558 /* Emit a BIC instruction.  */
11559
11560 static void
11561 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11562 {
11563   rtx shift_rtx = GEN_INT (shift);
11564   rtx (*gen) (rtx, rtx, rtx, rtx);
11565
11566   switch (mode)
11567     {
11568     case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11569     case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11570     default:
11571       gcc_unreachable ();
11572     }
11573
11574   emit_insn (gen (dst, s2, shift_rtx, s1));
11575 }
11576
11577 /* Emit an atomic swap.  */
11578
11579 static void
11580 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11581                           rtx mem, rtx model)
11582 {
11583   rtx (*gen) (rtx, rtx, rtx, rtx);
11584
11585   switch (mode)
11586     {
11587     case QImode: gen = gen_aarch64_atomic_swpqi; break;
11588     case HImode: gen = gen_aarch64_atomic_swphi; break;
11589     case SImode: gen = gen_aarch64_atomic_swpsi; break;
11590     case DImode: gen = gen_aarch64_atomic_swpdi; break;
11591     default:
11592       gcc_unreachable ();
11593     }
11594
11595   emit_insn (gen (dst, mem, value, model));
11596 }
11597
11598 /* Operations supported by aarch64_emit_atomic_load_op.  */
11599
11600 enum aarch64_atomic_load_op_code
11601 {
11602   AARCH64_LDOP_PLUS,    /* A + B  */
11603   AARCH64_LDOP_XOR,     /* A ^ B  */
11604   AARCH64_LDOP_OR,      /* A | B  */
11605   AARCH64_LDOP_BIC      /* A & ~B  */
11606 };
11607
11608 /* Emit an atomic load-operate.  */
11609
11610 static void
11611 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11612                              machine_mode mode, rtx dst, rtx src,
11613                              rtx mem, rtx model)
11614 {
11615   typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11616   const aarch64_atomic_load_op_fn plus[] =
11617   {
11618     gen_aarch64_atomic_loadaddqi,
11619     gen_aarch64_atomic_loadaddhi,
11620     gen_aarch64_atomic_loadaddsi,
11621     gen_aarch64_atomic_loadadddi
11622   };
11623   const aarch64_atomic_load_op_fn eor[] =
11624   {
11625     gen_aarch64_atomic_loadeorqi,
11626     gen_aarch64_atomic_loadeorhi,
11627     gen_aarch64_atomic_loadeorsi,
11628     gen_aarch64_atomic_loadeordi
11629   };
11630   const aarch64_atomic_load_op_fn ior[] =
11631   {
11632     gen_aarch64_atomic_loadsetqi,
11633     gen_aarch64_atomic_loadsethi,
11634     gen_aarch64_atomic_loadsetsi,
11635     gen_aarch64_atomic_loadsetdi
11636   };
11637   const aarch64_atomic_load_op_fn bic[] =
11638   {
11639     gen_aarch64_atomic_loadclrqi,
11640     gen_aarch64_atomic_loadclrhi,
11641     gen_aarch64_atomic_loadclrsi,
11642     gen_aarch64_atomic_loadclrdi
11643   };
11644   aarch64_atomic_load_op_fn gen;
11645   int idx = 0;
11646
11647   switch (mode)
11648     {
11649     case QImode: idx = 0; break;
11650     case HImode: idx = 1; break;
11651     case SImode: idx = 2; break;
11652     case DImode: idx = 3; break;
11653     default:
11654       gcc_unreachable ();
11655     }
11656
11657   switch (code)
11658     {
11659     case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11660     case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11661     case AARCH64_LDOP_OR: gen = ior[idx]; break;
11662     case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11663     default:
11664       gcc_unreachable ();
11665     }
11666
11667   emit_insn (gen (dst, mem, src, model));
11668 }
11669
11670 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
11671    location to store the data read from memory.  OUT_RESULT is the location to
11672    store the result of the operation.  MEM is the memory location to read and
11673    modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
11674    operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
11675    be NULL.  */
11676
11677 void
11678 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11679                          rtx mem, rtx value, rtx model_rtx)
11680 {
11681   machine_mode mode = GET_MODE (mem);
11682   machine_mode wmode = (mode == DImode ? DImode : SImode);
11683   const bool short_mode = (mode < SImode);
11684   aarch64_atomic_load_op_code ldop_code;
11685   rtx src;
11686   rtx x;
11687
11688   if (out_data)
11689     out_data = gen_lowpart (mode, out_data);
11690
11691   if (out_result)
11692     out_result = gen_lowpart (mode, out_result);
11693
11694   /* Make sure the value is in a register, putting it into a destination
11695      register if it needs to be manipulated.  */
11696   if (!register_operand (value, mode)
11697       || code == AND || code == MINUS)
11698     {
11699       src = out_result ? out_result : out_data;
11700       emit_move_insn (src, gen_lowpart (mode, value));
11701     }
11702   else
11703     src = value;
11704   gcc_assert (register_operand (src, mode));
11705
11706   /* Preprocess the data for the operation as necessary.  If the operation is
11707      a SET then emit a swap instruction and finish.  */
11708   switch (code)
11709     {
11710     case SET:
11711       aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11712       return;
11713
11714     case MINUS:
11715       /* Negate the value and treat it as a PLUS.  */
11716       {
11717         rtx neg_src;
11718
11719         /* Resize the value if necessary.  */
11720         if (short_mode)
11721           src = gen_lowpart (wmode, src);
11722
11723         neg_src = gen_rtx_NEG (wmode, src);
11724         emit_insn (gen_rtx_SET (src, neg_src));
11725
11726         if (short_mode)
11727           src = gen_lowpart (mode, src);
11728       }
11729       /* Fall-through.  */
11730     case PLUS:
11731       ldop_code = AARCH64_LDOP_PLUS;
11732       break;
11733
11734     case IOR:
11735       ldop_code = AARCH64_LDOP_OR;
11736       break;
11737
11738     case XOR:
11739       ldop_code = AARCH64_LDOP_XOR;
11740       break;
11741
11742     case AND:
11743       {
11744         rtx not_src;
11745
11746         /* Resize the value if necessary.  */
11747         if (short_mode)
11748           src = gen_lowpart (wmode, src);
11749
11750         not_src = gen_rtx_NOT (wmode, src);
11751         emit_insn (gen_rtx_SET (src, not_src));
11752
11753         if (short_mode)
11754           src = gen_lowpart (mode, src);
11755       }
11756       ldop_code = AARCH64_LDOP_BIC;
11757       break;
11758
11759     default:
11760       /* The operation can't be done with atomic instructions.  */
11761       gcc_unreachable ();
11762     }
11763
11764   aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11765
11766   /* If necessary, calculate the data in memory after the update by redoing the
11767      operation from values in registers.  */
11768   if (!out_result)
11769     return;
11770
11771   if (short_mode)
11772     {
11773       src = gen_lowpart (wmode, src);
11774       out_data = gen_lowpart (wmode, out_data);
11775       out_result = gen_lowpart (wmode, out_result);
11776     }
11777
11778   x = NULL_RTX;
11779
11780   switch (code)
11781     {
11782     case MINUS:
11783     case PLUS:
11784       x = gen_rtx_PLUS (wmode, out_data, src);
11785       break;
11786     case IOR:
11787       x = gen_rtx_IOR (wmode, out_data, src);
11788       break;
11789     case XOR:
11790       x = gen_rtx_XOR (wmode, out_data, src);
11791       break;
11792     case AND:
11793       aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11794       return;
11795     default:
11796       gcc_unreachable ();
11797     }
11798
11799   emit_set_insn (out_result, x);
11800
11801   return;
11802 }
11803
11804 /* Split an atomic operation.  */
11805
11806 void
11807 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11808                          rtx value, rtx model_rtx, rtx cond)
11809 {
11810   machine_mode mode = GET_MODE (mem);
11811   machine_mode wmode = (mode == DImode ? DImode : SImode);
11812   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11813   const bool is_sync = is_mm_sync (model);
11814   rtx_code_label *label;
11815   rtx x;
11816
11817   /* Split the atomic operation into a sequence.  */
11818   label = gen_label_rtx ();
11819   emit_label (label);
11820
11821   if (new_out)
11822     new_out = gen_lowpart (wmode, new_out);
11823   if (old_out)
11824     old_out = gen_lowpart (wmode, old_out);
11825   else
11826     old_out = new_out;
11827   value = simplify_gen_subreg (wmode, value, mode, 0);
11828
11829   /* The initial load can be relaxed for a __sync operation since a final
11830      barrier will be emitted to stop code hoisting.  */
11831  if (is_sync)
11832     aarch64_emit_load_exclusive (mode, old_out, mem,
11833                                  GEN_INT (MEMMODEL_RELAXED));
11834   else
11835     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
11836
11837   switch (code)
11838     {
11839     case SET:
11840       new_out = value;
11841       break;
11842
11843     case NOT:
11844       x = gen_rtx_AND (wmode, old_out, value);
11845       emit_insn (gen_rtx_SET (new_out, x));
11846       x = gen_rtx_NOT (wmode, new_out);
11847       emit_insn (gen_rtx_SET (new_out, x));
11848       break;
11849
11850     case MINUS:
11851       if (CONST_INT_P (value))
11852         {
11853           value = GEN_INT (-INTVAL (value));
11854           code = PLUS;
11855         }
11856       /* Fall through.  */
11857
11858     default:
11859       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
11860       emit_insn (gen_rtx_SET (new_out, x));
11861       break;
11862     }
11863
11864   aarch64_emit_store_exclusive (mode, cond, mem,
11865                                 gen_lowpart (mode, new_out), model_rtx);
11866
11867   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11868   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11869                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
11870   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11871
11872   /* Emit any final barrier needed for a __sync operation.  */
11873   if (is_sync)
11874     aarch64_emit_post_barrier (model);
11875 }
11876
11877 static void
11878 aarch64_init_libfuncs (void)
11879 {
11880    /* Half-precision float operations.  The compiler handles all operations
11881      with NULL libfuncs by converting to SFmode.  */
11882
11883   /* Conversions.  */
11884   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
11885   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
11886
11887   /* Arithmetic.  */
11888   set_optab_libfunc (add_optab, HFmode, NULL);
11889   set_optab_libfunc (sdiv_optab, HFmode, NULL);
11890   set_optab_libfunc (smul_optab, HFmode, NULL);
11891   set_optab_libfunc (neg_optab, HFmode, NULL);
11892   set_optab_libfunc (sub_optab, HFmode, NULL);
11893
11894   /* Comparisons.  */
11895   set_optab_libfunc (eq_optab, HFmode, NULL);
11896   set_optab_libfunc (ne_optab, HFmode, NULL);
11897   set_optab_libfunc (lt_optab, HFmode, NULL);
11898   set_optab_libfunc (le_optab, HFmode, NULL);
11899   set_optab_libfunc (ge_optab, HFmode, NULL);
11900   set_optab_libfunc (gt_optab, HFmode, NULL);
11901   set_optab_libfunc (unord_optab, HFmode, NULL);
11902 }
11903
11904 /* Target hook for c_mode_for_suffix.  */
11905 static machine_mode
11906 aarch64_c_mode_for_suffix (char suffix)
11907 {
11908   if (suffix == 'q')
11909     return TFmode;
11910
11911   return VOIDmode;
11912 }
11913
11914 /* We can only represent floating point constants which will fit in
11915    "quarter-precision" values.  These values are characterised by
11916    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
11917    by:
11918
11919    (-1)^s * (n/16) * 2^r
11920
11921    Where:
11922      's' is the sign bit.
11923      'n' is an integer in the range 16 <= n <= 31.
11924      'r' is an integer in the range -3 <= r <= 4.  */
11925
11926 /* Return true iff X can be represented by a quarter-precision
11927    floating point immediate operand X.  Note, we cannot represent 0.0.  */
11928 bool
11929 aarch64_float_const_representable_p (rtx x)
11930 {
11931   /* This represents our current view of how many bits
11932      make up the mantissa.  */
11933   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
11934   int exponent;
11935   unsigned HOST_WIDE_INT mantissa, mask;
11936   REAL_VALUE_TYPE r, m;
11937   bool fail;
11938
11939   if (!CONST_DOUBLE_P (x))
11940     return false;
11941
11942   /* We don't support HFmode constants yet.  */
11943   if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
11944     return false;
11945
11946   r = *CONST_DOUBLE_REAL_VALUE (x);
11947
11948   /* We cannot represent infinities, NaNs or +/-zero.  We won't
11949      know if we have +zero until we analyse the mantissa, but we
11950      can reject the other invalid values.  */
11951   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
11952       || REAL_VALUE_MINUS_ZERO (r))
11953     return false;
11954
11955   /* Extract exponent.  */
11956   r = real_value_abs (&r);
11957   exponent = REAL_EXP (&r);
11958
11959   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
11960      highest (sign) bit, with a fixed binary point at bit point_pos.
11961      m1 holds the low part of the mantissa, m2 the high part.
11962      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
11963      bits for the mantissa, this can fail (low bits will be lost).  */
11964   real_ldexp (&m, &r, point_pos - exponent);
11965   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
11966
11967   /* If the low part of the mantissa has bits set we cannot represent
11968      the value.  */
11969   if (w.elt (0) != 0)
11970     return false;
11971   /* We have rejected the lower HOST_WIDE_INT, so update our
11972      understanding of how many bits lie in the mantissa and
11973      look only at the high HOST_WIDE_INT.  */
11974   mantissa = w.elt (1);
11975   point_pos -= HOST_BITS_PER_WIDE_INT;
11976
11977   /* We can only represent values with a mantissa of the form 1.xxxx.  */
11978   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
11979   if ((mantissa & mask) != 0)
11980     return false;
11981
11982   /* Having filtered unrepresentable values, we may now remove all
11983      but the highest 5 bits.  */
11984   mantissa >>= point_pos - 5;
11985
11986   /* We cannot represent the value 0.0, so reject it.  This is handled
11987      elsewhere.  */
11988   if (mantissa == 0)
11989     return false;
11990
11991   /* Then, as bit 4 is always set, we can mask it off, leaving
11992      the mantissa in the range [0, 15].  */
11993   mantissa &= ~(1 << 4);
11994   gcc_assert (mantissa <= 15);
11995
11996   /* GCC internally does not use IEEE754-like encoding (where normalized
11997      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
11998      Our mantissa values are shifted 4 places to the left relative to
11999      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12000      by 5 places to correct for GCC's representation.  */
12001   exponent = 5 - exponent;
12002
12003   return (exponent >= 0 && exponent <= 7);
12004 }
12005
12006 char*
12007 aarch64_output_simd_mov_immediate (rtx const_vector,
12008                                    machine_mode mode,
12009                                    unsigned width)
12010 {
12011   bool is_valid;
12012   static char templ[40];
12013   const char *mnemonic;
12014   const char *shift_op;
12015   unsigned int lane_count = 0;
12016   char element_char;
12017
12018   struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12019
12020   /* This will return true to show const_vector is legal for use as either
12021      a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
12022      also update INFO to show how the immediate should be generated.  */
12023   is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12024   gcc_assert (is_valid);
12025
12026   element_char = sizetochar (info.element_width);
12027   lane_count = width / info.element_width;
12028
12029   mode = GET_MODE_INNER (mode);
12030   if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12031     {
12032       gcc_assert (info.shift == 0 && ! info.mvn);
12033       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12034          move immediate path.  */
12035       if (aarch64_float_const_zero_rtx_p (info.value))
12036         info.value = GEN_INT (0);
12037       else
12038         {
12039 #define buf_size 20
12040           char float_buf[buf_size] = {'\0'};
12041           real_to_decimal_for_mode (float_buf,
12042                                     CONST_DOUBLE_REAL_VALUE (info.value),
12043                                     buf_size, buf_size, 1, mode);
12044 #undef buf_size
12045
12046           if (lane_count == 1)
12047             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12048           else
12049             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12050                       lane_count, element_char, float_buf);
12051           return templ;
12052         }
12053     }
12054
12055   mnemonic = info.mvn ? "mvni" : "movi";
12056   shift_op = info.msl ? "msl" : "lsl";
12057
12058   gcc_assert (CONST_INT_P (info.value));
12059   if (lane_count == 1)
12060     snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12061               mnemonic, UINTVAL (info.value));
12062   else if (info.shift)
12063     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12064               ", %s %d", mnemonic, lane_count, element_char,
12065               UINTVAL (info.value), shift_op, info.shift);
12066   else
12067     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12068               mnemonic, lane_count, element_char, UINTVAL (info.value));
12069   return templ;
12070 }
12071
12072 char*
12073 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12074                                           machine_mode mode)
12075 {
12076   machine_mode vmode;
12077
12078   gcc_assert (!VECTOR_MODE_P (mode));
12079   vmode = aarch64_simd_container_mode (mode, 64);
12080   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12081   return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12082 }
12083
12084 /* Split operands into moves from op[1] + op[2] into op[0].  */
12085
12086 void
12087 aarch64_split_combinev16qi (rtx operands[3])
12088 {
12089   unsigned int dest = REGNO (operands[0]);
12090   unsigned int src1 = REGNO (operands[1]);
12091   unsigned int src2 = REGNO (operands[2]);
12092   machine_mode halfmode = GET_MODE (operands[1]);
12093   unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12094   rtx destlo, desthi;
12095
12096   gcc_assert (halfmode == V16QImode);
12097
12098   if (src1 == dest && src2 == dest + halfregs)
12099     {
12100       /* No-op move.  Can't split to nothing; emit something.  */
12101       emit_note (NOTE_INSN_DELETED);
12102       return;
12103     }
12104
12105   /* Preserve register attributes for variable tracking.  */
12106   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12107   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12108                                GET_MODE_SIZE (halfmode));
12109
12110   /* Special case of reversed high/low parts.  */
12111   if (reg_overlap_mentioned_p (operands[2], destlo)
12112       && reg_overlap_mentioned_p (operands[1], desthi))
12113     {
12114       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12115       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12116       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12117     }
12118   else if (!reg_overlap_mentioned_p (operands[2], destlo))
12119     {
12120       /* Try to avoid unnecessary moves if part of the result
12121          is in the right place already.  */
12122       if (src1 != dest)
12123         emit_move_insn (destlo, operands[1]);
12124       if (src2 != dest + halfregs)
12125         emit_move_insn (desthi, operands[2]);
12126     }
12127   else
12128     {
12129       if (src2 != dest + halfregs)
12130         emit_move_insn (desthi, operands[2]);
12131       if (src1 != dest)
12132         emit_move_insn (destlo, operands[1]);
12133     }
12134 }
12135
12136 /* vec_perm support.  */
12137
12138 #define MAX_VECT_LEN 16
12139
12140 struct expand_vec_perm_d
12141 {
12142   rtx target, op0, op1;
12143   unsigned char perm[MAX_VECT_LEN];
12144   machine_mode vmode;
12145   unsigned char nelt;
12146   bool one_vector_p;
12147   bool testing_p;
12148 };
12149
12150 /* Generate a variable permutation.  */
12151
12152 static void
12153 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12154 {
12155   machine_mode vmode = GET_MODE (target);
12156   bool one_vector_p = rtx_equal_p (op0, op1);
12157
12158   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12159   gcc_checking_assert (GET_MODE (op0) == vmode);
12160   gcc_checking_assert (GET_MODE (op1) == vmode);
12161   gcc_checking_assert (GET_MODE (sel) == vmode);
12162   gcc_checking_assert (TARGET_SIMD);
12163
12164   if (one_vector_p)
12165     {
12166       if (vmode == V8QImode)
12167         {
12168           /* Expand the argument to a V16QI mode by duplicating it.  */
12169           rtx pair = gen_reg_rtx (V16QImode);
12170           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12171           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12172         }
12173       else
12174         {
12175           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12176         }
12177     }
12178   else
12179     {
12180       rtx pair;
12181
12182       if (vmode == V8QImode)
12183         {
12184           pair = gen_reg_rtx (V16QImode);
12185           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12186           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12187         }
12188       else
12189         {
12190           pair = gen_reg_rtx (OImode);
12191           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12192           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12193         }
12194     }
12195 }
12196
12197 void
12198 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12199 {
12200   machine_mode vmode = GET_MODE (target);
12201   unsigned int nelt = GET_MODE_NUNITS (vmode);
12202   bool one_vector_p = rtx_equal_p (op0, op1);
12203   rtx mask;
12204
12205   /* The TBL instruction does not use a modulo index, so we must take care
12206      of that ourselves.  */
12207   mask = aarch64_simd_gen_const_vector_dup (vmode,
12208       one_vector_p ? nelt - 1 : 2 * nelt - 1);
12209   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12210
12211   /* For big-endian, we also need to reverse the index within the vector
12212      (but not which vector).  */
12213   if (BYTES_BIG_ENDIAN)
12214     {
12215       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
12216       if (!one_vector_p)
12217         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12218       sel = expand_simple_binop (vmode, XOR, sel, mask,
12219                                  NULL, 0, OPTAB_LIB_WIDEN);
12220     }
12221   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12222 }
12223
12224 /* Recognize patterns suitable for the TRN instructions.  */
12225 static bool
12226 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12227 {
12228   unsigned int i, odd, mask, nelt = d->nelt;
12229   rtx out, in0, in1, x;
12230   rtx (*gen) (rtx, rtx, rtx);
12231   machine_mode vmode = d->vmode;
12232
12233   if (GET_MODE_UNIT_SIZE (vmode) > 8)
12234     return false;
12235
12236   /* Note that these are little-endian tests.
12237      We correct for big-endian later.  */
12238   if (d->perm[0] == 0)
12239     odd = 0;
12240   else if (d->perm[0] == 1)
12241     odd = 1;
12242   else
12243     return false;
12244   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12245
12246   for (i = 0; i < nelt; i += 2)
12247     {
12248       if (d->perm[i] != i + odd)
12249         return false;
12250       if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12251         return false;
12252     }
12253
12254   /* Success!  */
12255   if (d->testing_p)
12256     return true;
12257
12258   in0 = d->op0;
12259   in1 = d->op1;
12260   if (BYTES_BIG_ENDIAN)
12261     {
12262       x = in0, in0 = in1, in1 = x;
12263       odd = !odd;
12264     }
12265   out = d->target;
12266
12267   if (odd)
12268     {
12269       switch (vmode)
12270         {
12271         case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12272         case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12273         case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12274         case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12275         case V4SImode: gen = gen_aarch64_trn2v4si; break;
12276         case V2SImode: gen = gen_aarch64_trn2v2si; break;
12277         case V2DImode: gen = gen_aarch64_trn2v2di; break;
12278         case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12279         case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12280         case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12281         default:
12282           return false;
12283         }
12284     }
12285   else
12286     {
12287       switch (vmode)
12288         {
12289         case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12290         case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12291         case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12292         case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12293         case V4SImode: gen = gen_aarch64_trn1v4si; break;
12294         case V2SImode: gen = gen_aarch64_trn1v2si; break;
12295         case V2DImode: gen = gen_aarch64_trn1v2di; break;
12296         case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12297         case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12298         case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12299         default:
12300           return false;
12301         }
12302     }
12303
12304   emit_insn (gen (out, in0, in1));
12305   return true;
12306 }
12307
12308 /* Recognize patterns suitable for the UZP instructions.  */
12309 static bool
12310 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12311 {
12312   unsigned int i, odd, mask, nelt = d->nelt;
12313   rtx out, in0, in1, x;
12314   rtx (*gen) (rtx, rtx, rtx);
12315   machine_mode vmode = d->vmode;
12316
12317   if (GET_MODE_UNIT_SIZE (vmode) > 8)
12318     return false;
12319
12320   /* Note that these are little-endian tests.
12321      We correct for big-endian later.  */
12322   if (d->perm[0] == 0)
12323     odd = 0;
12324   else if (d->perm[0] == 1)
12325     odd = 1;
12326   else
12327     return false;
12328   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12329
12330   for (i = 0; i < nelt; i++)
12331     {
12332       unsigned elt = (i * 2 + odd) & mask;
12333       if (d->perm[i] != elt)
12334         return false;
12335     }
12336
12337   /* Success!  */
12338   if (d->testing_p)
12339     return true;
12340
12341   in0 = d->op0;
12342   in1 = d->op1;
12343   if (BYTES_BIG_ENDIAN)
12344     {
12345       x = in0, in0 = in1, in1 = x;
12346       odd = !odd;
12347     }
12348   out = d->target;
12349
12350   if (odd)
12351     {
12352       switch (vmode)
12353         {
12354         case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12355         case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12356         case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12357         case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12358         case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12359         case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12360         case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12361         case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12362         case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12363         case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12364         default:
12365           return false;
12366         }
12367     }
12368   else
12369     {
12370       switch (vmode)
12371         {
12372         case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12373         case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12374         case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12375         case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12376         case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12377         case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12378         case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12379         case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12380         case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12381         case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12382         default:
12383           return false;
12384         }
12385     }
12386
12387   emit_insn (gen (out, in0, in1));
12388   return true;
12389 }
12390
12391 /* Recognize patterns suitable for the ZIP instructions.  */
12392 static bool
12393 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12394 {
12395   unsigned int i, high, mask, nelt = d->nelt;
12396   rtx out, in0, in1, x;
12397   rtx (*gen) (rtx, rtx, rtx);
12398   machine_mode vmode = d->vmode;
12399
12400   if (GET_MODE_UNIT_SIZE (vmode) > 8)
12401     return false;
12402
12403   /* Note that these are little-endian tests.
12404      We correct for big-endian later.  */
12405   high = nelt / 2;
12406   if (d->perm[0] == high)
12407     /* Do Nothing.  */
12408     ;
12409   else if (d->perm[0] == 0)
12410     high = 0;
12411   else
12412     return false;
12413   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12414
12415   for (i = 0; i < nelt / 2; i++)
12416     {
12417       unsigned elt = (i + high) & mask;
12418       if (d->perm[i * 2] != elt)
12419         return false;
12420       elt = (elt + nelt) & mask;
12421       if (d->perm[i * 2 + 1] != elt)
12422         return false;
12423     }
12424
12425   /* Success!  */
12426   if (d->testing_p)
12427     return true;
12428
12429   in0 = d->op0;
12430   in1 = d->op1;
12431   if (BYTES_BIG_ENDIAN)
12432     {
12433       x = in0, in0 = in1, in1 = x;
12434       high = !high;
12435     }
12436   out = d->target;
12437
12438   if (high)
12439     {
12440       switch (vmode)
12441         {
12442         case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12443         case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12444         case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12445         case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12446         case V4SImode: gen = gen_aarch64_zip2v4si; break;
12447         case V2SImode: gen = gen_aarch64_zip2v2si; break;
12448         case V2DImode: gen = gen_aarch64_zip2v2di; break;
12449         case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12450         case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12451         case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12452         default:
12453           return false;
12454         }
12455     }
12456   else
12457     {
12458       switch (vmode)
12459         {
12460         case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12461         case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12462         case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12463         case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12464         case V4SImode: gen = gen_aarch64_zip1v4si; break;
12465         case V2SImode: gen = gen_aarch64_zip1v2si; break;
12466         case V2DImode: gen = gen_aarch64_zip1v2di; break;
12467         case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12468         case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12469         case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12470         default:
12471           return false;
12472         }
12473     }
12474
12475   emit_insn (gen (out, in0, in1));
12476   return true;
12477 }
12478
12479 /* Recognize patterns for the EXT insn.  */
12480
12481 static bool
12482 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12483 {
12484   unsigned int i, nelt = d->nelt;
12485   rtx (*gen) (rtx, rtx, rtx, rtx);
12486   rtx offset;
12487
12488   unsigned int location = d->perm[0]; /* Always < nelt.  */
12489
12490   /* Check if the extracted indices are increasing by one.  */
12491   for (i = 1; i < nelt; i++)
12492     {
12493       unsigned int required = location + i;
12494       if (d->one_vector_p)
12495         {
12496           /* We'll pass the same vector in twice, so allow indices to wrap.  */
12497           required &= (nelt - 1);
12498         }
12499       if (d->perm[i] != required)
12500         return false;
12501     }
12502
12503   switch (d->vmode)
12504     {
12505     case V16QImode: gen = gen_aarch64_extv16qi; break;
12506     case V8QImode: gen = gen_aarch64_extv8qi; break;
12507     case V4HImode: gen = gen_aarch64_extv4hi; break;
12508     case V8HImode: gen = gen_aarch64_extv8hi; break;
12509     case V2SImode: gen = gen_aarch64_extv2si; break;
12510     case V4SImode: gen = gen_aarch64_extv4si; break;
12511     case V2SFmode: gen = gen_aarch64_extv2sf; break;
12512     case V4SFmode: gen = gen_aarch64_extv4sf; break;
12513     case V2DImode: gen = gen_aarch64_extv2di; break;
12514     case V2DFmode: gen = gen_aarch64_extv2df; break;
12515     default:
12516       return false;
12517     }
12518
12519   /* Success! */
12520   if (d->testing_p)
12521     return true;
12522
12523   /* The case where (location == 0) is a no-op for both big- and little-endian,
12524      and is removed by the mid-end at optimization levels -O1 and higher.  */
12525
12526   if (BYTES_BIG_ENDIAN && (location != 0))
12527     {
12528       /* After setup, we want the high elements of the first vector (stored
12529          at the LSB end of the register), and the low elements of the second
12530          vector (stored at the MSB end of the register). So swap.  */
12531       std::swap (d->op0, d->op1);
12532       /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
12533       location = nelt - location;
12534     }
12535
12536   offset = GEN_INT (location);
12537   emit_insn (gen (d->target, d->op0, d->op1, offset));
12538   return true;
12539 }
12540
12541 /* Recognize patterns for the REV insns.  */
12542
12543 static bool
12544 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12545 {
12546   unsigned int i, j, diff, nelt = d->nelt;
12547   rtx (*gen) (rtx, rtx);
12548
12549   if (!d->one_vector_p)
12550     return false;
12551
12552   diff = d->perm[0];
12553   switch (diff)
12554     {
12555     case 7:
12556       switch (d->vmode)
12557         {
12558         case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12559         case V8QImode: gen = gen_aarch64_rev64v8qi;  break;
12560         default:
12561           return false;
12562         }
12563       break;
12564     case 3:
12565       switch (d->vmode)
12566         {
12567         case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12568         case V8QImode: gen = gen_aarch64_rev32v8qi;  break;
12569         case V8HImode: gen = gen_aarch64_rev64v8hi;  break;
12570         case V4HImode: gen = gen_aarch64_rev64v4hi;  break;
12571         default:
12572           return false;
12573         }
12574       break;
12575     case 1:
12576       switch (d->vmode)
12577         {
12578         case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12579         case V8QImode: gen = gen_aarch64_rev16v8qi;  break;
12580         case V8HImode: gen = gen_aarch64_rev32v8hi;  break;
12581         case V4HImode: gen = gen_aarch64_rev32v4hi;  break;
12582         case V4SImode: gen = gen_aarch64_rev64v4si;  break;
12583         case V2SImode: gen = gen_aarch64_rev64v2si;  break;
12584         case V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
12585         case V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
12586         default:
12587           return false;
12588         }
12589       break;
12590     default:
12591       return false;
12592     }
12593
12594   for (i = 0; i < nelt ; i += diff + 1)
12595     for (j = 0; j <= diff; j += 1)
12596       {
12597         /* This is guaranteed to be true as the value of diff
12598            is 7, 3, 1 and we should have enough elements in the
12599            queue to generate this.  Getting a vector mask with a
12600            value of diff other than these values implies that
12601            something is wrong by the time we get here.  */
12602         gcc_assert (i + j < nelt);
12603         if (d->perm[i + j] != i + diff - j)
12604           return false;
12605       }
12606
12607   /* Success! */
12608   if (d->testing_p)
12609     return true;
12610
12611   emit_insn (gen (d->target, d->op0));
12612   return true;
12613 }
12614
12615 static bool
12616 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12617 {
12618   rtx (*gen) (rtx, rtx, rtx);
12619   rtx out = d->target;
12620   rtx in0;
12621   machine_mode vmode = d->vmode;
12622   unsigned int i, elt, nelt = d->nelt;
12623   rtx lane;
12624
12625   elt = d->perm[0];
12626   for (i = 1; i < nelt; i++)
12627     {
12628       if (elt != d->perm[i])
12629         return false;
12630     }
12631
12632   /* The generic preparation in aarch64_expand_vec_perm_const_1
12633      swaps the operand order and the permute indices if it finds
12634      d->perm[0] to be in the second operand.  Thus, we can always
12635      use d->op0 and need not do any extra arithmetic to get the
12636      correct lane number.  */
12637   in0 = d->op0;
12638   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
12639
12640   switch (vmode)
12641     {
12642     case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12643     case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12644     case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12645     case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12646     case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12647     case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12648     case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12649     case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12650     case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12651     case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12652     case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12653     case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12654     default:
12655       return false;
12656     }
12657
12658   emit_insn (gen (out, in0, lane));
12659   return true;
12660 }
12661
12662 static bool
12663 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12664 {
12665   rtx rperm[MAX_VECT_LEN], sel;
12666   machine_mode vmode = d->vmode;
12667   unsigned int i, nelt = d->nelt;
12668
12669   if (d->testing_p)
12670     return true;
12671
12672   /* Generic code will try constant permutation twice.  Once with the
12673      original mode and again with the elements lowered to QImode.
12674      So wait and don't do the selector expansion ourselves.  */
12675   if (vmode != V8QImode && vmode != V16QImode)
12676     return false;
12677
12678   for (i = 0; i < nelt; ++i)
12679     {
12680       int nunits = GET_MODE_NUNITS (vmode);
12681
12682       /* If big-endian and two vectors we end up with a weird mixed-endian
12683          mode on NEON.  Reverse the index within each word but not the word
12684          itself.  */
12685       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12686                                            : d->perm[i]);
12687     }
12688   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12689   sel = force_reg (vmode, sel);
12690
12691   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12692   return true;
12693 }
12694
12695 static bool
12696 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12697 {
12698   /* The pattern matching functions above are written to look for a small
12699      number to begin the sequence (0, 1, N/2).  If we begin with an index
12700      from the second operand, we can swap the operands.  */
12701   if (d->perm[0] >= d->nelt)
12702     {
12703       unsigned i, nelt = d->nelt;
12704
12705       gcc_assert (nelt == (nelt & -nelt));
12706       for (i = 0; i < nelt; ++i)
12707         d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
12708
12709       std::swap (d->op0, d->op1);
12710     }
12711
12712   if (TARGET_SIMD)
12713     {
12714       if (aarch64_evpc_rev (d))
12715         return true;
12716       else if (aarch64_evpc_ext (d))
12717         return true;
12718       else if (aarch64_evpc_dup (d))
12719         return true;
12720       else if (aarch64_evpc_zip (d))
12721         return true;
12722       else if (aarch64_evpc_uzp (d))
12723         return true;
12724       else if (aarch64_evpc_trn (d))
12725         return true;
12726       return aarch64_evpc_tbl (d);
12727     }
12728   return false;
12729 }
12730
12731 /* Expand a vec_perm_const pattern.  */
12732
12733 bool
12734 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12735 {
12736   struct expand_vec_perm_d d;
12737   int i, nelt, which;
12738
12739   d.target = target;
12740   d.op0 = op0;
12741   d.op1 = op1;
12742
12743   d.vmode = GET_MODE (target);
12744   gcc_assert (VECTOR_MODE_P (d.vmode));
12745   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12746   d.testing_p = false;
12747
12748   for (i = which = 0; i < nelt; ++i)
12749     {
12750       rtx e = XVECEXP (sel, 0, i);
12751       int ei = INTVAL (e) & (2 * nelt - 1);
12752       which |= (ei < nelt ? 1 : 2);
12753       d.perm[i] = ei;
12754     }
12755
12756   switch (which)
12757     {
12758     default:
12759       gcc_unreachable ();
12760
12761     case 3:
12762       d.one_vector_p = false;
12763       if (!rtx_equal_p (op0, op1))
12764         break;
12765
12766       /* The elements of PERM do not suggest that only the first operand
12767          is used, but both operands are identical.  Allow easier matching
12768          of the permutation by folding the permutation into the single
12769          input vector.  */
12770       /* Fall Through.  */
12771     case 2:
12772       for (i = 0; i < nelt; ++i)
12773         d.perm[i] &= nelt - 1;
12774       d.op0 = op1;
12775       d.one_vector_p = true;
12776       break;
12777
12778     case 1:
12779       d.op1 = op0;
12780       d.one_vector_p = true;
12781       break;
12782     }
12783
12784   return aarch64_expand_vec_perm_const_1 (&d);
12785 }
12786
12787 static bool
12788 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12789                                      const unsigned char *sel)
12790 {
12791   struct expand_vec_perm_d d;
12792   unsigned int i, nelt, which;
12793   bool ret;
12794
12795   d.vmode = vmode;
12796   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12797   d.testing_p = true;
12798   memcpy (d.perm, sel, nelt);
12799
12800   /* Calculate whether all elements are in one vector.  */
12801   for (i = which = 0; i < nelt; ++i)
12802     {
12803       unsigned char e = d.perm[i];
12804       gcc_assert (e < 2 * nelt);
12805       which |= (e < nelt ? 1 : 2);
12806     }
12807
12808   /* If all elements are from the second vector, reindex as if from the
12809      first vector.  */
12810   if (which == 2)
12811     for (i = 0; i < nelt; ++i)
12812       d.perm[i] -= nelt;
12813
12814   /* Check whether the mask can be applied to a single vector.  */
12815   d.one_vector_p = (which != 3);
12816
12817   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
12818   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
12819   if (!d.one_vector_p)
12820     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
12821
12822   start_sequence ();
12823   ret = aarch64_expand_vec_perm_const_1 (&d);
12824   end_sequence ();
12825
12826   return ret;
12827 }
12828
12829 /* Implement target hook CANNOT_CHANGE_MODE_CLASS.  */
12830 bool
12831 aarch64_cannot_change_mode_class (machine_mode from,
12832                                   machine_mode to,
12833                                   enum reg_class rclass)
12834 {
12835   /* We cannot allow word_mode subregs of full vector modes.
12836      Otherwise the middle-end will assume it's ok to store to
12837      (subreg:DI (reg:TI 100) 0) in order to modify only the low 64 bits
12838      of the 128-bit register.  However, after reload the subreg will
12839      be dropped leaving a plain DImode store.  See PR67609 for a more
12840      detailed dicussion.  In all other cases, we want to be permissive
12841      and return false.  */
12842   return (reg_classes_intersect_p (FP_REGS, rclass)
12843           && GET_MODE_SIZE (to) == UNITS_PER_WORD
12844           && GET_MODE_SIZE (from) > UNITS_PER_WORD);
12845 }
12846
12847 rtx
12848 aarch64_reverse_mask (enum machine_mode mode)
12849 {
12850   /* We have to reverse each vector because we dont have
12851      a permuted load that can reverse-load according to ABI rules.  */
12852   rtx mask;
12853   rtvec v = rtvec_alloc (16);
12854   int i, j;
12855   int nunits = GET_MODE_NUNITS (mode);
12856   int usize = GET_MODE_UNIT_SIZE (mode);
12857
12858   gcc_assert (BYTES_BIG_ENDIAN);
12859   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
12860
12861   for (i = 0; i < nunits; i++)
12862     for (j = 0; j < usize; j++)
12863       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
12864   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
12865   return force_reg (V16QImode, mask);
12866 }
12867
12868 /* Implement MODES_TIEABLE_P.  */
12869
12870 bool
12871 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
12872 {
12873   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
12874     return true;
12875
12876   /* We specifically want to allow elements of "structure" modes to
12877      be tieable to the structure.  This more general condition allows
12878      other rarer situations too.  */
12879   if (TARGET_SIMD
12880       && aarch64_vector_mode_p (mode1)
12881       && aarch64_vector_mode_p (mode2))
12882     return true;
12883
12884   return false;
12885 }
12886
12887 /* Return a new RTX holding the result of moving POINTER forward by
12888    AMOUNT bytes.  */
12889
12890 static rtx
12891 aarch64_move_pointer (rtx pointer, int amount)
12892 {
12893   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
12894
12895   return adjust_automodify_address (pointer, GET_MODE (pointer),
12896                                     next, amount);
12897 }
12898
12899 /* Return a new RTX holding the result of moving POINTER forward by the
12900    size of the mode it points to.  */
12901
12902 static rtx
12903 aarch64_progress_pointer (rtx pointer)
12904 {
12905   HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
12906
12907   return aarch64_move_pointer (pointer, amount);
12908 }
12909
12910 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
12911    MODE bytes.  */
12912
12913 static void
12914 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
12915                                               machine_mode mode)
12916 {
12917   rtx reg = gen_reg_rtx (mode);
12918
12919   /* "Cast" the pointers to the correct mode.  */
12920   *src = adjust_address (*src, mode, 0);
12921   *dst = adjust_address (*dst, mode, 0);
12922   /* Emit the memcpy.  */
12923   emit_move_insn (reg, *src);
12924   emit_move_insn (*dst, reg);
12925   /* Move the pointers forward.  */
12926   *src = aarch64_progress_pointer (*src);
12927   *dst = aarch64_progress_pointer (*dst);
12928 }
12929
12930 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
12931    we succeed, otherwise return false.  */
12932
12933 bool
12934 aarch64_expand_movmem (rtx *operands)
12935 {
12936   unsigned int n;
12937   rtx dst = operands[0];
12938   rtx src = operands[1];
12939   rtx base;
12940   bool speed_p = !optimize_function_for_size_p (cfun);
12941
12942   /* When optimizing for size, give a better estimate of the length of a
12943      memcpy call, but use the default otherwise.  */
12944   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
12945
12946   /* We can't do anything smart if the amount to copy is not constant.  */
12947   if (!CONST_INT_P (operands[2]))
12948     return false;
12949
12950   n = UINTVAL (operands[2]);
12951
12952   /* Try to keep the number of instructions low.  For cases below 16 bytes we
12953      need to make at most two moves.  For cases above 16 bytes it will be one
12954      move for each 16 byte chunk, then at most two additional moves.  */
12955   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
12956     return false;
12957
12958   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
12959   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
12960
12961   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
12962   src = adjust_automodify_address (src, VOIDmode, base, 0);
12963
12964   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
12965      1-byte chunk.  */
12966   if (n < 4)
12967     {
12968       if (n >= 2)
12969         {
12970           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12971           n -= 2;
12972         }
12973
12974       if (n == 1)
12975         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12976
12977       return true;
12978     }
12979
12980   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
12981      4-byte chunk, partially overlapping with the previously copied chunk.  */
12982   if (n < 8)
12983     {
12984       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12985       n -= 4;
12986       if (n > 0)
12987         {
12988           int move = n - 4;
12989
12990           src = aarch64_move_pointer (src, move);
12991           dst = aarch64_move_pointer (dst, move);
12992           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12993         }
12994       return true;
12995     }
12996
12997   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
12998      them, then (if applicable) an 8-byte chunk.  */
12999   while (n >= 8)
13000     {
13001       if (n / 16)
13002         {
13003           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13004           n -= 16;
13005         }
13006       else
13007         {
13008           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13009           n -= 8;
13010         }
13011     }
13012
13013   /* Finish the final bytes of the copy.  We can always do this in one
13014      instruction.  We either copy the exact amount we need, or partially
13015      overlap with the previous chunk we copied and copy 8-bytes.  */
13016   if (n == 0)
13017     return true;
13018   else if (n == 1)
13019     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13020   else if (n == 2)
13021     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13022   else if (n == 4)
13023     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13024   else
13025     {
13026       if (n == 3)
13027         {
13028           src = aarch64_move_pointer (src, -1);
13029           dst = aarch64_move_pointer (dst, -1);
13030           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13031         }
13032       else
13033         {
13034           int move = n - 8;
13035
13036           src = aarch64_move_pointer (src, move);
13037           dst = aarch64_move_pointer (dst, move);
13038           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13039         }
13040     }
13041
13042   return true;
13043 }
13044
13045 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
13046
13047 static unsigned HOST_WIDE_INT
13048 aarch64_asan_shadow_offset (void)
13049 {
13050   return (HOST_WIDE_INT_1 << 36);
13051 }
13052
13053 static bool
13054 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13055                                         unsigned int align,
13056                                         enum by_pieces_operation op,
13057                                         bool speed_p)
13058 {
13059   /* STORE_BY_PIECES can be used when copying a constant string, but
13060      in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13061      For now we always fail this and let the move_by_pieces code copy
13062      the string from read-only memory.  */
13063   if (op == STORE_BY_PIECES)
13064     return false;
13065
13066   return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13067 }
13068
13069 static enum machine_mode
13070 aarch64_code_to_ccmode (enum rtx_code code)
13071 {
13072   switch (code)
13073     {
13074     case NE:
13075       return CC_DNEmode;
13076
13077     case EQ:
13078       return CC_DEQmode;
13079
13080     case LE:
13081       return CC_DLEmode;
13082
13083     case LT:
13084       return CC_DLTmode;
13085
13086     case GE:
13087       return CC_DGEmode;
13088
13089     case GT:
13090       return CC_DGTmode;
13091
13092     case LEU:
13093       return CC_DLEUmode;
13094
13095     case LTU:
13096       return CC_DLTUmode;
13097
13098     case GEU:
13099       return CC_DGEUmode;
13100
13101     case GTU:
13102       return CC_DGTUmode;
13103
13104     default:
13105       return CCmode;
13106     }
13107 }
13108
13109 static rtx
13110 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
13111                         int code, tree treeop0, tree treeop1)
13112 {
13113   enum machine_mode op_mode, cmp_mode, cc_mode;
13114   rtx op0, op1, cmp, target;
13115   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13116   enum insn_code icode;
13117   struct expand_operand ops[4];
13118
13119   cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
13120   if (cc_mode == CCmode)
13121     return NULL_RTX;
13122
13123   start_sequence ();
13124   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13125
13126   op_mode = GET_MODE (op0);
13127   if (op_mode == VOIDmode)
13128     op_mode = GET_MODE (op1);
13129
13130   switch (op_mode)
13131     {
13132     case QImode:
13133     case HImode:
13134     case SImode:
13135       cmp_mode = SImode;
13136       icode = CODE_FOR_cmpsi;
13137       break;
13138
13139     case DImode:
13140       cmp_mode = DImode;
13141       icode = CODE_FOR_cmpdi;
13142       break;
13143
13144     default:
13145       end_sequence ();
13146       return NULL_RTX;
13147     }
13148
13149   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13150   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13151   if (!op0 || !op1)
13152     {
13153       end_sequence ();
13154       return NULL_RTX;
13155     }
13156   *prep_seq = get_insns ();
13157   end_sequence ();
13158
13159   cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
13160   target = gen_rtx_REG (CCmode, CC_REGNUM);
13161
13162   create_output_operand (&ops[0], target, CCmode);
13163   create_fixed_operand (&ops[1], cmp);
13164   create_fixed_operand (&ops[2], op0);
13165   create_fixed_operand (&ops[3], op1);
13166
13167   start_sequence ();
13168   if (!maybe_expand_insn (icode, 4, ops))
13169     {
13170       end_sequence ();
13171       return NULL_RTX;
13172     }
13173   *gen_seq = get_insns ();
13174   end_sequence ();
13175
13176   return gen_rtx_REG (cc_mode, CC_REGNUM);
13177 }
13178
13179 static rtx
13180 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
13181                        tree treeop0, tree treeop1, int bit_code)
13182 {
13183   rtx op0, op1, cmp0, cmp1, target;
13184   enum machine_mode op_mode, cmp_mode, cc_mode;
13185   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13186   enum insn_code icode = CODE_FOR_ccmp_andsi;
13187   struct expand_operand ops[6];
13188
13189   cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
13190   if (cc_mode == CCmode)
13191     return NULL_RTX;
13192
13193   push_to_sequence ((rtx_insn*) *prep_seq);
13194   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13195
13196   op_mode = GET_MODE (op0);
13197   if (op_mode == VOIDmode)
13198     op_mode = GET_MODE (op1);
13199
13200   switch (op_mode)
13201     {
13202     case QImode:
13203     case HImode:
13204     case SImode:
13205       cmp_mode = SImode;
13206       icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
13207                                                 : CODE_FOR_ccmp_iorsi;
13208       break;
13209
13210     case DImode:
13211       cmp_mode = DImode;
13212       icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
13213                                                 : CODE_FOR_ccmp_iordi;
13214       break;
13215
13216     default:
13217       end_sequence ();
13218       return NULL_RTX;
13219     }
13220
13221   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13222   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13223   if (!op0 || !op1)
13224     {
13225       end_sequence ();
13226       return NULL_RTX;
13227     }
13228   *prep_seq = get_insns ();
13229   end_sequence ();
13230
13231   target = gen_rtx_REG (cc_mode, CC_REGNUM);
13232   cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
13233   cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
13234
13235   create_fixed_operand (&ops[0], prev);
13236   create_fixed_operand (&ops[1], target);
13237   create_fixed_operand (&ops[2], op0);
13238   create_fixed_operand (&ops[3], op1);
13239   create_fixed_operand (&ops[4], cmp0);
13240   create_fixed_operand (&ops[5], cmp1);
13241
13242   push_to_sequence ((rtx_insn*) *gen_seq);
13243   if (!maybe_expand_insn (icode, 6, ops))
13244     {
13245       end_sequence ();
13246       return NULL_RTX;
13247     }
13248
13249   *gen_seq = get_insns ();
13250   end_sequence ();
13251
13252   return target;
13253 }
13254
13255 #undef TARGET_GEN_CCMP_FIRST
13256 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13257
13258 #undef TARGET_GEN_CCMP_NEXT
13259 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13260
13261 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
13262    instruction fusion of some sort.  */
13263
13264 static bool
13265 aarch64_macro_fusion_p (void)
13266 {
13267   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13268 }
13269
13270
13271 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
13272    should be kept together during scheduling.  */
13273
13274 static bool
13275 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13276 {
13277   rtx set_dest;
13278   rtx prev_set = single_set (prev);
13279   rtx curr_set = single_set (curr);
13280   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
13281   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13282
13283   if (!aarch64_macro_fusion_p ())
13284     return false;
13285
13286   if (simple_sets_p
13287       && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
13288     {
13289       /* We are trying to match:
13290          prev (mov)  == (set (reg r0) (const_int imm16))
13291          curr (movk) == (set (zero_extract (reg r0)
13292                                            (const_int 16)
13293                                            (const_int 16))
13294                              (const_int imm16_1))  */
13295
13296       set_dest = SET_DEST (curr_set);
13297
13298       if (GET_CODE (set_dest) == ZERO_EXTRACT
13299           && CONST_INT_P (SET_SRC (curr_set))
13300           && CONST_INT_P (SET_SRC (prev_set))
13301           && CONST_INT_P (XEXP (set_dest, 2))
13302           && INTVAL (XEXP (set_dest, 2)) == 16
13303           && REG_P (XEXP (set_dest, 0))
13304           && REG_P (SET_DEST (prev_set))
13305           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13306         {
13307           return true;
13308         }
13309     }
13310
13311   if (simple_sets_p
13312       && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
13313     {
13314
13315       /*  We're trying to match:
13316           prev (adrp) == (set (reg r1)
13317                               (high (symbol_ref ("SYM"))))
13318           curr (add) == (set (reg r0)
13319                              (lo_sum (reg r1)
13320                                      (symbol_ref ("SYM"))))
13321           Note that r0 need not necessarily be the same as r1, especially
13322           during pre-regalloc scheduling.  */
13323
13324       if (satisfies_constraint_Ush (SET_SRC (prev_set))
13325           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13326         {
13327           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13328               && REG_P (XEXP (SET_SRC (curr_set), 0))
13329               && REGNO (XEXP (SET_SRC (curr_set), 0))
13330                  == REGNO (SET_DEST (prev_set))
13331               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13332                               XEXP (SET_SRC (curr_set), 1)))
13333             return true;
13334         }
13335     }
13336
13337   if (simple_sets_p
13338       && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
13339     {
13340
13341       /* We're trying to match:
13342          prev (movk) == (set (zero_extract (reg r0)
13343                                            (const_int 16)
13344                                            (const_int 32))
13345                              (const_int imm16_1))
13346          curr (movk) == (set (zero_extract (reg r0)
13347                                            (const_int 16)
13348                                            (const_int 48))
13349                              (const_int imm16_2))  */
13350
13351       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13352           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13353           && REG_P (XEXP (SET_DEST (prev_set), 0))
13354           && REG_P (XEXP (SET_DEST (curr_set), 0))
13355           && REGNO (XEXP (SET_DEST (prev_set), 0))
13356              == REGNO (XEXP (SET_DEST (curr_set), 0))
13357           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13358           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13359           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13360           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13361           && CONST_INT_P (SET_SRC (prev_set))
13362           && CONST_INT_P (SET_SRC (curr_set)))
13363         return true;
13364
13365     }
13366   if (simple_sets_p
13367       && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
13368     {
13369       /* We're trying to match:
13370           prev (adrp) == (set (reg r0)
13371                               (high (symbol_ref ("SYM"))))
13372           curr (ldr) == (set (reg r1)
13373                              (mem (lo_sum (reg r0)
13374                                              (symbol_ref ("SYM")))))
13375                  or
13376           curr (ldr) == (set (reg r1)
13377                              (zero_extend (mem
13378                                            (lo_sum (reg r0)
13379                                                    (symbol_ref ("SYM"))))))  */
13380       if (satisfies_constraint_Ush (SET_SRC (prev_set))
13381           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13382         {
13383           rtx curr_src = SET_SRC (curr_set);
13384
13385           if (GET_CODE (curr_src) == ZERO_EXTEND)
13386             curr_src = XEXP (curr_src, 0);
13387
13388           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13389               && REG_P (XEXP (XEXP (curr_src, 0), 0))
13390               && REGNO (XEXP (XEXP (curr_src, 0), 0))
13391                  == REGNO (SET_DEST (prev_set))
13392               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13393                               XEXP (SET_SRC (prev_set), 0)))
13394               return true;
13395         }
13396     }
13397
13398   if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
13399       && any_condjump_p (curr))
13400     {
13401       enum attr_type prev_type = get_attr_type (prev);
13402
13403       /* FIXME: this misses some which is considered simple arthematic
13404          instructions for ThunderX.  Simple shifts are missed here.  */
13405       if (prev_type == TYPE_ALUS_SREG
13406           || prev_type == TYPE_ALUS_IMM
13407           || prev_type == TYPE_LOGICS_REG
13408           || prev_type == TYPE_LOGICS_IMM)
13409         return true;
13410     }
13411
13412   return false;
13413 }
13414
13415 /* If MEM is in the form of [base+offset], extract the two parts
13416    of address and set to BASE and OFFSET, otherwise return false
13417    after clearing BASE and OFFSET.  */
13418
13419 bool
13420 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13421 {
13422   rtx addr;
13423
13424   gcc_assert (MEM_P (mem));
13425
13426   addr = XEXP (mem, 0);
13427
13428   if (REG_P (addr))
13429     {
13430       *base = addr;
13431       *offset = const0_rtx;
13432       return true;
13433     }
13434
13435   if (GET_CODE (addr) == PLUS
13436       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13437     {
13438       *base = XEXP (addr, 0);
13439       *offset = XEXP (addr, 1);
13440       return true;
13441     }
13442
13443   *base = NULL_RTX;
13444   *offset = NULL_RTX;
13445
13446   return false;
13447 }
13448
13449 /* Types for scheduling fusion.  */
13450 enum sched_fusion_type
13451 {
13452   SCHED_FUSION_NONE = 0,
13453   SCHED_FUSION_LD_SIGN_EXTEND,
13454   SCHED_FUSION_LD_ZERO_EXTEND,
13455   SCHED_FUSION_LD,
13456   SCHED_FUSION_ST,
13457   SCHED_FUSION_NUM
13458 };
13459
13460 /* If INSN is a load or store of address in the form of [base+offset],
13461    extract the two parts and set to BASE and OFFSET.  Return scheduling
13462    fusion type this INSN is.  */
13463
13464 static enum sched_fusion_type
13465 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13466 {
13467   rtx x, dest, src;
13468   enum sched_fusion_type fusion = SCHED_FUSION_LD;
13469
13470   gcc_assert (INSN_P (insn));
13471   x = PATTERN (insn);
13472   if (GET_CODE (x) != SET)
13473     return SCHED_FUSION_NONE;
13474
13475   src = SET_SRC (x);
13476   dest = SET_DEST (x);
13477
13478   machine_mode dest_mode = GET_MODE (dest);
13479
13480   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13481     return SCHED_FUSION_NONE;
13482
13483   if (GET_CODE (src) == SIGN_EXTEND)
13484     {
13485       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13486       src = XEXP (src, 0);
13487       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13488         return SCHED_FUSION_NONE;
13489     }
13490   else if (GET_CODE (src) == ZERO_EXTEND)
13491     {
13492       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13493       src = XEXP (src, 0);
13494       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13495         return SCHED_FUSION_NONE;
13496     }
13497
13498   if (GET_CODE (src) == MEM && REG_P (dest))
13499     extract_base_offset_in_addr (src, base, offset);
13500   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13501     {
13502       fusion = SCHED_FUSION_ST;
13503       extract_base_offset_in_addr (dest, base, offset);
13504     }
13505   else
13506     return SCHED_FUSION_NONE;
13507
13508   if (*base == NULL_RTX || *offset == NULL_RTX)
13509     fusion = SCHED_FUSION_NONE;
13510
13511   return fusion;
13512 }
13513
13514 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13515
13516    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
13517    and PRI are only calculated for these instructions.  For other instruction,
13518    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
13519    type instruction fusion can be added by returning different priorities.
13520
13521    It's important that irrelevant instructions get the largest FUSION_PRI.  */
13522
13523 static void
13524 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13525                                int *fusion_pri, int *pri)
13526 {
13527   int tmp, off_val;
13528   rtx base, offset;
13529   enum sched_fusion_type fusion;
13530
13531   gcc_assert (INSN_P (insn));
13532
13533   tmp = max_pri - 1;
13534   fusion = fusion_load_store (insn, &base, &offset);
13535   if (fusion == SCHED_FUSION_NONE)
13536     {
13537       *pri = tmp;
13538       *fusion_pri = tmp;
13539       return;
13540     }
13541
13542   /* Set FUSION_PRI according to fusion type and base register.  */
13543   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13544
13545   /* Calculate PRI.  */
13546   tmp /= 2;
13547
13548   /* INSN with smaller offset goes first.  */
13549   off_val = (int)(INTVAL (offset));
13550   if (off_val >= 0)
13551     tmp -= (off_val & 0xfffff);
13552   else
13553     tmp += ((- off_val) & 0xfffff);
13554
13555   *pri = tmp;
13556   return;
13557 }
13558
13559 /* Given OPERANDS of consecutive load/store, check if we can merge
13560    them into ldp/stp.  LOAD is true if they are load instructions.
13561    MODE is the mode of memory operands.  */
13562
13563 bool
13564 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13565                                 enum machine_mode mode)
13566 {
13567   HOST_WIDE_INT offval_1, offval_2, msize;
13568   enum reg_class rclass_1, rclass_2;
13569   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13570
13571   if (load)
13572     {
13573       mem_1 = operands[1];
13574       mem_2 = operands[3];
13575       reg_1 = operands[0];
13576       reg_2 = operands[2];
13577       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13578       if (REGNO (reg_1) == REGNO (reg_2))
13579         return false;
13580     }
13581   else
13582     {
13583       mem_1 = operands[0];
13584       mem_2 = operands[2];
13585       reg_1 = operands[1];
13586       reg_2 = operands[3];
13587     }
13588
13589   /* The mems cannot be volatile.  */
13590   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13591     return false;
13592
13593   /* Check if the addresses are in the form of [base+offset].  */
13594   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13595   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13596     return false;
13597   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13598   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13599     return false;
13600
13601   /* Check if the bases are same.  */
13602   if (!rtx_equal_p (base_1, base_2))
13603     return false;
13604
13605   offval_1 = INTVAL (offset_1);
13606   offval_2 = INTVAL (offset_2);
13607   msize = GET_MODE_SIZE (mode);
13608   /* Check if the offsets are consecutive.  */
13609   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13610     return false;
13611
13612   /* Check if the addresses are clobbered by load.  */
13613   if (load)
13614     {
13615       if (reg_mentioned_p (reg_1, mem_1))
13616         return false;
13617
13618       /* In increasing order, the last load can clobber the address.  */
13619       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13620       return false;
13621     }
13622
13623   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13624     rclass_1 = FP_REGS;
13625   else
13626     rclass_1 = GENERAL_REGS;
13627
13628   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13629     rclass_2 = FP_REGS;
13630   else
13631     rclass_2 = GENERAL_REGS;
13632
13633   /* Check if the registers are of same class.  */
13634   if (rclass_1 != rclass_2)
13635     return false;
13636
13637   return true;
13638 }
13639
13640 /* Given OPERANDS of consecutive load/store, check if we can merge
13641    them into ldp/stp by adjusting the offset.  LOAD is true if they
13642    are load instructions.  MODE is the mode of memory operands.
13643
13644    Given below consecutive stores:
13645
13646      str  w1, [xb, 0x100]
13647      str  w1, [xb, 0x104]
13648      str  w1, [xb, 0x108]
13649      str  w1, [xb, 0x10c]
13650
13651    Though the offsets are out of the range supported by stp, we can
13652    still pair them after adjusting the offset, like:
13653
13654      add  scratch, xb, 0x100
13655      stp  w1, w1, [scratch]
13656      stp  w1, w1, [scratch, 0x8]
13657
13658    The peephole patterns detecting this opportunity should guarantee
13659    the scratch register is avaliable.  */
13660
13661 bool
13662 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13663                                        enum machine_mode mode)
13664 {
13665   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13666   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13667   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13668   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13669
13670   if (load)
13671     {
13672       reg_1 = operands[0];
13673       mem_1 = operands[1];
13674       reg_2 = operands[2];
13675       mem_2 = operands[3];
13676       reg_3 = operands[4];
13677       mem_3 = operands[5];
13678       reg_4 = operands[6];
13679       mem_4 = operands[7];
13680       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13681                   && REG_P (reg_3) && REG_P (reg_4));
13682       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13683         return false;
13684     }
13685   else
13686     {
13687       mem_1 = operands[0];
13688       reg_1 = operands[1];
13689       mem_2 = operands[2];
13690       reg_2 = operands[3];
13691       mem_3 = operands[4];
13692       reg_3 = operands[5];
13693       mem_4 = operands[6];
13694       reg_4 = operands[7];
13695     }
13696   /* Skip if memory operand is by itslef valid for ldp/stp.  */
13697   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13698     return false;
13699
13700   /* The mems cannot be volatile.  */
13701   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13702       || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
13703     return false;
13704
13705   /* Check if the addresses are in the form of [base+offset].  */
13706   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13707   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13708     return false;
13709   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13710   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13711     return false;
13712   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13713   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13714     return false;
13715   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13716   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13717     return false;
13718
13719   /* Check if the bases are same.  */
13720   if (!rtx_equal_p (base_1, base_2)
13721       || !rtx_equal_p (base_2, base_3)
13722       || !rtx_equal_p (base_3, base_4))
13723     return false;
13724
13725   offval_1 = INTVAL (offset_1);
13726   offval_2 = INTVAL (offset_2);
13727   offval_3 = INTVAL (offset_3);
13728   offval_4 = INTVAL (offset_4);
13729   msize = GET_MODE_SIZE (mode);
13730   /* Check if the offsets are consecutive.  */
13731   if ((offval_1 != (offval_2 + msize)
13732        || offval_1 != (offval_3 + msize * 2)
13733        || offval_1 != (offval_4 + msize * 3))
13734       && (offval_4 != (offval_3 + msize)
13735           || offval_4 != (offval_2 + msize * 2)
13736           || offval_4 != (offval_1 + msize * 3)))
13737     return false;
13738
13739   /* Check if the addresses are clobbered by load.  */
13740   if (load)
13741     {
13742       if (reg_mentioned_p (reg_1, mem_1)
13743           || reg_mentioned_p (reg_2, mem_2)
13744           || reg_mentioned_p (reg_3, mem_3))
13745         return false;
13746
13747       /* In increasing order, the last load can clobber the address.  */
13748       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13749         return false;
13750     }
13751
13752   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13753     rclass_1 = FP_REGS;
13754   else
13755     rclass_1 = GENERAL_REGS;
13756
13757   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13758     rclass_2 = FP_REGS;
13759   else
13760     rclass_2 = GENERAL_REGS;
13761
13762   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13763     rclass_3 = FP_REGS;
13764   else
13765     rclass_3 = GENERAL_REGS;
13766
13767   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13768     rclass_4 = FP_REGS;
13769   else
13770     rclass_4 = GENERAL_REGS;
13771
13772   /* Check if the registers are of same class.  */
13773   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
13774     return false;
13775
13776   return true;
13777 }
13778
13779 /* Given OPERANDS of consecutive load/store, this function pairs them
13780    into ldp/stp after adjusting the offset.  It depends on the fact
13781    that addresses of load/store instructions are in increasing order.
13782    MODE is the mode of memory operands.  CODE is the rtl operator
13783    which should be applied to all memory operands, it's SIGN_EXTEND,
13784    ZERO_EXTEND or UNKNOWN.  */
13785
13786 bool
13787 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
13788                              enum machine_mode mode, RTX_CODE code)
13789 {
13790   rtx base, offset, t1, t2;
13791   rtx mem_1, mem_2, mem_3, mem_4;
13792   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
13793
13794   if (load)
13795     {
13796       mem_1 = operands[1];
13797       mem_2 = operands[3];
13798       mem_3 = operands[5];
13799       mem_4 = operands[7];
13800     }
13801   else
13802     {
13803       mem_1 = operands[0];
13804       mem_2 = operands[2];
13805       mem_3 = operands[4];
13806       mem_4 = operands[6];
13807       gcc_assert (code == UNKNOWN);
13808     }
13809
13810   extract_base_offset_in_addr (mem_1, &base, &offset);
13811   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
13812
13813   /* Adjust offset thus it can fit in ldp/stp instruction.  */
13814   msize = GET_MODE_SIZE (mode);
13815   stp_off_limit = msize * 0x40;
13816   off_val = INTVAL (offset);
13817   abs_off = (off_val < 0) ? -off_val : off_val;
13818   new_off = abs_off % stp_off_limit;
13819   adj_off = abs_off - new_off;
13820
13821   /* Further adjust to make sure all offsets are OK.  */
13822   if ((new_off + msize * 2) >= stp_off_limit)
13823     {
13824       adj_off += stp_off_limit;
13825       new_off -= stp_off_limit;
13826     }
13827
13828   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
13829   if (adj_off >= 0x1000)
13830     return false;
13831
13832   if (off_val < 0)
13833     {
13834       adj_off = -adj_off;
13835       new_off = -new_off;
13836     }
13837
13838   /* Create new memory references.  */
13839   mem_1 = change_address (mem_1, VOIDmode,
13840                           plus_constant (DImode, operands[8], new_off));
13841
13842   /* Check if the adjusted address is OK for ldp/stp.  */
13843   if (!aarch64_mem_pair_operand (mem_1, mode))
13844     return false;
13845
13846   msize = GET_MODE_SIZE (mode);
13847   mem_2 = change_address (mem_2, VOIDmode,
13848                           plus_constant (DImode,
13849                                          operands[8],
13850                                          new_off + msize));
13851   mem_3 = change_address (mem_3, VOIDmode,
13852                           plus_constant (DImode,
13853                                          operands[8],
13854                                          new_off + msize * 2));
13855   mem_4 = change_address (mem_4, VOIDmode,
13856                           plus_constant (DImode,
13857                                          operands[8],
13858                                          new_off + msize * 3));
13859
13860   if (code == ZERO_EXTEND)
13861     {
13862       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
13863       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
13864       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
13865       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
13866     }
13867   else if (code == SIGN_EXTEND)
13868     {
13869       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
13870       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
13871       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
13872       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
13873     }
13874
13875   if (load)
13876     {
13877       operands[1] = mem_1;
13878       operands[3] = mem_2;
13879       operands[5] = mem_3;
13880       operands[7] = mem_4;
13881     }
13882   else
13883     {
13884       operands[0] = mem_1;
13885       operands[2] = mem_2;
13886       operands[4] = mem_3;
13887       operands[6] = mem_4;
13888     }
13889
13890   /* Emit adjusting instruction.  */
13891   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
13892   /* Emit ldp/stp instructions.  */
13893   t1 = gen_rtx_SET (operands[0], operands[1]);
13894   t2 = gen_rtx_SET (operands[2], operands[3]);
13895   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13896   t1 = gen_rtx_SET (operands[4], operands[5]);
13897   t2 = gen_rtx_SET (operands[6], operands[7]);
13898   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13899   return true;
13900 }
13901
13902 /* Return 1 if pseudo register should be created and used to hold
13903    GOT address for PIC code.  */
13904
13905 bool
13906 aarch64_use_pseudo_pic_reg (void)
13907 {
13908   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
13909 }
13910
13911 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
13912
13913 static int
13914 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
13915 {
13916   switch (XINT (x, 1))
13917     {
13918     case UNSPEC_GOTSMALLPIC:
13919     case UNSPEC_GOTSMALLPIC28K:
13920     case UNSPEC_GOTTINYPIC:
13921       return 0;
13922     default:
13923       break;
13924     }
13925
13926   return default_unspec_may_trap_p (x, flags);
13927 }
13928
13929
13930 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
13931    return the log2 of that value.  Otherwise return -1.  */
13932
13933 int
13934 aarch64_fpconst_pow_of_2 (rtx x)
13935 {
13936   const REAL_VALUE_TYPE *r;
13937
13938   if (!CONST_DOUBLE_P (x))
13939     return -1;
13940
13941   r = CONST_DOUBLE_REAL_VALUE (x);
13942
13943   if (REAL_VALUE_NEGATIVE (*r)
13944       || REAL_VALUE_ISNAN (*r)
13945       || REAL_VALUE_ISINF (*r)
13946       || !real_isinteger (r, DFmode))
13947     return -1;
13948
13949   return exact_log2 (real_to_integer (r));
13950 }
13951
13952 /* If X is a vector of equal CONST_DOUBLE values and that value is
13953    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
13954
13955 int
13956 aarch64_vec_fpconst_pow_of_2 (rtx x)
13957 {
13958   if (GET_CODE (x) != CONST_VECTOR)
13959     return -1;
13960
13961   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
13962     return -1;
13963
13964   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
13965   if (firstval <= 0)
13966     return -1;
13967
13968   for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
13969     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
13970       return -1;
13971
13972   return firstval;
13973 }
13974
13975 /* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float.  */
13976 static tree
13977 aarch64_promoted_type (const_tree t)
13978 {
13979   if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
13980     return float_type_node;
13981   return NULL_TREE;
13982 }
13983
13984 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
13985
13986 static bool
13987 aarch64_optab_supported_p (int op, machine_mode, machine_mode,
13988                            optimization_type opt_type)
13989 {
13990   switch (op)
13991     {
13992     case rsqrt_optab:
13993       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
13994
13995     default:
13996       return true;
13997     }
13998 }
13999
14000 #undef TARGET_ADDRESS_COST
14001 #define TARGET_ADDRESS_COST aarch64_address_cost
14002
14003 /* This hook will determines whether unnamed bitfields affect the alignment
14004    of the containing structure.  The hook returns true if the structure
14005    should inherit the alignment requirements of an unnamed bitfield's
14006    type.  */
14007 #undef TARGET_ALIGN_ANON_BITFIELD
14008 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14009
14010 #undef TARGET_ASM_ALIGNED_DI_OP
14011 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14012
14013 #undef TARGET_ASM_ALIGNED_HI_OP
14014 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14015
14016 #undef TARGET_ASM_ALIGNED_SI_OP
14017 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14018
14019 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14020 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14021   hook_bool_const_tree_hwi_hwi_const_tree_true
14022
14023 #undef TARGET_ASM_OUTPUT_MI_THUNK
14024 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14025
14026 #undef TARGET_ASM_SELECT_RTX_SECTION
14027 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14028
14029 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14030 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14031
14032 #undef TARGET_BUILD_BUILTIN_VA_LIST
14033 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14034
14035 #undef TARGET_CALLEE_COPIES
14036 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14037
14038 #undef TARGET_CAN_ELIMINATE
14039 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14040
14041 #undef TARGET_CAN_INLINE_P
14042 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14043
14044 #undef TARGET_CANNOT_FORCE_CONST_MEM
14045 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14046
14047 #undef TARGET_CASE_VALUES_THRESHOLD
14048 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14049
14050 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14051 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14052
14053 /* Only the least significant bit is used for initialization guard
14054    variables.  */
14055 #undef TARGET_CXX_GUARD_MASK_BIT
14056 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14057
14058 #undef TARGET_C_MODE_FOR_SUFFIX
14059 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14060
14061 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14062 #undef  TARGET_DEFAULT_TARGET_FLAGS
14063 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14064 #endif
14065
14066 #undef TARGET_CLASS_MAX_NREGS
14067 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14068
14069 #undef TARGET_BUILTIN_DECL
14070 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14071
14072 #undef TARGET_BUILTIN_RECIPROCAL
14073 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14074
14075 #undef  TARGET_EXPAND_BUILTIN
14076 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14077
14078 #undef TARGET_EXPAND_BUILTIN_VA_START
14079 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14080
14081 #undef TARGET_FOLD_BUILTIN
14082 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14083
14084 #undef TARGET_FUNCTION_ARG
14085 #define TARGET_FUNCTION_ARG aarch64_function_arg
14086
14087 #undef TARGET_FUNCTION_ARG_ADVANCE
14088 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14089
14090 #undef TARGET_FUNCTION_ARG_BOUNDARY
14091 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14092
14093 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14094 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14095
14096 #undef TARGET_FUNCTION_VALUE
14097 #define TARGET_FUNCTION_VALUE aarch64_function_value
14098
14099 #undef TARGET_FUNCTION_VALUE_REGNO_P
14100 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14101
14102 #undef TARGET_FRAME_POINTER_REQUIRED
14103 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14104
14105 #undef TARGET_GIMPLE_FOLD_BUILTIN
14106 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14107
14108 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14109 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14110
14111 #undef  TARGET_INIT_BUILTINS
14112 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
14113
14114 #undef TARGET_LEGITIMATE_ADDRESS_P
14115 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14116
14117 #undef TARGET_LEGITIMATE_CONSTANT_P
14118 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14119
14120 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14121 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14122
14123 #undef TARGET_LRA_P
14124 #define TARGET_LRA_P hook_bool_void_true
14125
14126 #undef TARGET_MANGLE_TYPE
14127 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14128
14129 #undef TARGET_MEMORY_MOVE_COST
14130 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14131
14132 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14133 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14134
14135 #undef TARGET_MUST_PASS_IN_STACK
14136 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14137
14138 /* This target hook should return true if accesses to volatile bitfields
14139    should use the narrowest mode possible.  It should return false if these
14140    accesses should use the bitfield container type.  */
14141 #undef TARGET_NARROW_VOLATILE_BITFIELD
14142 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14143
14144 #undef  TARGET_OPTION_OVERRIDE
14145 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14146
14147 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14148 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14149   aarch64_override_options_after_change
14150
14151 #undef TARGET_OPTION_SAVE
14152 #define TARGET_OPTION_SAVE aarch64_option_save
14153
14154 #undef TARGET_OPTION_RESTORE
14155 #define TARGET_OPTION_RESTORE aarch64_option_restore
14156
14157 #undef TARGET_OPTION_PRINT
14158 #define TARGET_OPTION_PRINT aarch64_option_print
14159
14160 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14161 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14162
14163 #undef TARGET_SET_CURRENT_FUNCTION
14164 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14165
14166 #undef TARGET_PASS_BY_REFERENCE
14167 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14168
14169 #undef TARGET_PREFERRED_RELOAD_CLASS
14170 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14171
14172 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14173 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14174
14175 #undef TARGET_PROMOTED_TYPE
14176 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14177
14178 #undef TARGET_SECONDARY_RELOAD
14179 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14180
14181 #undef TARGET_SHIFT_TRUNCATION_MASK
14182 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14183
14184 #undef TARGET_SETUP_INCOMING_VARARGS
14185 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14186
14187 #undef TARGET_STRUCT_VALUE_RTX
14188 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
14189
14190 #undef TARGET_REGISTER_MOVE_COST
14191 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14192
14193 #undef TARGET_RETURN_IN_MEMORY
14194 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14195
14196 #undef TARGET_RETURN_IN_MSB
14197 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14198
14199 #undef TARGET_RTX_COSTS
14200 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14201
14202 #undef TARGET_SCHED_ISSUE_RATE
14203 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14204
14205 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14206 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14207   aarch64_sched_first_cycle_multipass_dfa_lookahead
14208
14209 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14210 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14211   aarch64_first_cycle_multipass_dfa_lookahead_guard
14212
14213 #undef TARGET_TRAMPOLINE_INIT
14214 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14215
14216 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14217 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14218
14219 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14220 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14221
14222 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14223 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
14224
14225 #undef TARGET_VECTORIZE_ADD_STMT_COST
14226 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
14227
14228 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
14229 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
14230   aarch64_builtin_vectorization_cost
14231
14232 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
14233 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
14234
14235 #undef TARGET_VECTORIZE_BUILTINS
14236 #define TARGET_VECTORIZE_BUILTINS
14237
14238 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
14239 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
14240   aarch64_builtin_vectorized_function
14241
14242 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14243 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14244   aarch64_autovectorize_vector_sizes
14245
14246 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14247 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14248   aarch64_atomic_assign_expand_fenv
14249
14250 /* Section anchor support.  */
14251
14252 #undef TARGET_MIN_ANCHOR_OFFSET
14253 #define TARGET_MIN_ANCHOR_OFFSET -256
14254
14255 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14256    byte offset; we can do much more for larger data types, but have no way
14257    to determine the size of the access.  We assume accesses are aligned.  */
14258 #undef TARGET_MAX_ANCHOR_OFFSET
14259 #define TARGET_MAX_ANCHOR_OFFSET 4095
14260
14261 #undef TARGET_VECTOR_ALIGNMENT
14262 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14263
14264 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14265 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14266   aarch64_simd_vector_alignment_reachable
14267
14268 /* vec_perm support.  */
14269
14270 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14271 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14272   aarch64_vectorize_vec_perm_const_ok
14273
14274 #undef TARGET_INIT_LIBFUNCS
14275 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14276
14277 #undef TARGET_FIXED_CONDITION_CODE_REGS
14278 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14279
14280 #undef TARGET_FLAGS_REGNUM
14281 #define TARGET_FLAGS_REGNUM CC_REGNUM
14282
14283 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
14284 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
14285
14286 #undef TARGET_ASAN_SHADOW_OFFSET
14287 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
14288
14289 #undef TARGET_LEGITIMIZE_ADDRESS
14290 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
14291
14292 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
14293 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
14294   aarch64_use_by_pieces_infrastructure_p
14295
14296 #undef TARGET_CAN_USE_DOLOOP_P
14297 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
14298
14299 #undef TARGET_SCHED_MACRO_FUSION_P
14300 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
14301
14302 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
14303 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
14304
14305 #undef TARGET_SCHED_FUSION_PRIORITY
14306 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
14307
14308 #undef TARGET_UNSPEC_MAY_TRAP_P
14309 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
14310
14311 #undef TARGET_USE_PSEUDO_PIC_REG
14312 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
14313
14314 #undef TARGET_PRINT_OPERAND
14315 #define TARGET_PRINT_OPERAND aarch64_print_operand
14316
14317 #undef TARGET_PRINT_OPERAND_ADDRESS
14318 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
14319
14320 #undef TARGET_OPTAB_SUPPORTED_P
14321 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
14322
14323 struct gcc_target targetm = TARGET_INITIALIZER;
14324
14325 #include "gt-aarch64.h"