gcc/config/i386/x86-tune-costs.h

   1 /* Costs of operations of individual x86 CPUs.
   2    Copyright (C) 1988-2024 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3, or (at your option)
   9 any later version.
  10
  11 GCC is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 Under Section 7 of GPL version 3, you are granted additional
  17 permissions described in the GCC Runtime Library Exception, version
  18 3.1, as published by the Free Software Foundation.
  19
  20 You should have received a copy of the GNU General Public License and
  21 a copy of the GCC Runtime Library Exception along with this program;
  22 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 <http://www.gnu.org/licenses/>.  */
  24 /* Processor costs (relative to an add) */
  25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.
        COSTS_N_BYTES (N) scales a size of N bytes by 2, so a 2-byte
        instruction costs 4 -- the same value as COSTS_N_INSNS (1) under the
        assumption above.  */
  26 #define COSTS_N_BYTES(N) ((N) * 2)
  27
  28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
     /* DUMMY_STRINGOP_ALGS is a filler stringop_algs initializer that names
        only libcall.  The i386, i486 and pentium memcpy/memset tables in this
        file use it as their second array element.  */
  29
  30 static stringop_algs ix86_size_memcpy[2] = {
       /* memcpy strategy table referenced by ix86_size_cost.  Both elements
          are identical and name only rep_prefix_1_byte.
          NOTE(review): -1 is presumably the "no upper size bound" marker and
          the two elements presumably cover two target modes -- confirm
          against the stringop_algs declaration, which is not visible here.  */
  31   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  32   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
  33 static stringop_algs ix86_size_memset[2] = {
       /* memset strategy table referenced by ix86_size_cost.  Both elements
          are identical and name only rep_prefix_1_byte.
          NOTE(review): -1 is presumably the "no upper size bound" marker --
          confirm against the stringop_algs declaration.  */
  34   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  35   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
  36
  37 const
  38 struct processor_costs ix86_size_cost = {/* costs for tuning for size.
       Instruction costs in this table are written with COSTS_N_BYTES,
       whereas the per-CPU tables later in this file use COSTS_N_INSNS.
       The initializer is positional: the order of the entries must match
       the field order of struct processor_costs (declared elsewhere).  */
  39   {
  40   /* Start of register allocator costs.  integer->integer move cost is 2. */
  41   2,                                 /* cost for loading QImode using movzbl */
  42   {2, 2, 2},                            /* cost of loading integer registers
  43                                            in QImode, HImode and SImode.
  44                                            Relative to reg-reg move (2).  */
  45   {2, 2, 2},                            /* cost of storing integer registers */
  46   2,                                    /* cost of reg,reg fld/fst */
  47   {2, 2, 2},                            /* cost of loading fp registers
  48                                            in SFmode, DFmode and XFmode */
  49   {2, 2, 2},                            /* cost of storing fp registers
  50                                            in SFmode, DFmode and XFmode */
  51   3,                                    /* cost of moving MMX register */
  52   {3, 3},                               /* cost of loading MMX registers
  53                                            in SImode and DImode */
  54   {3, 3},                               /* cost of storing MMX registers
  55                                            in SImode and DImode */
  56   3, 3, 3,                              /* cost of moving XMM,YMM,ZMM register */
  57   {3, 3, 3, 3, 3},                      /* cost of loading SSE registers
  58                                            in 32,64,128,256 and 512-bit */
  59   {3, 3, 3, 3, 3},                      /* cost of storing SSE registers
  60                                            in 32,64,128,256 and 512-bit */
  61   3, 3,                         /* SSE->integer and integer->SSE moves */
  62   3, 3,                         /* mask->integer and integer->mask moves */
  63   {2, 2, 2},                            /* cost of loading mask register
  64                                            in QImode, HImode, SImode.  */
  65   {2, 2, 2},                            /* cost of storing mask register
  66                                            in QImode, HImode, SImode.  */
  67   2,                                    /* cost of moving mask register.  */
  68   /* End of register allocator costs.  */
  69   },
  70
  71   COSTS_N_BYTES (2),                    /* cost of an add instruction */
  72   COSTS_N_BYTES (3),                    /* cost of a lea instruction */
  73   COSTS_N_BYTES (2),                    /* variable shift costs */
  74   COSTS_N_BYTES (3),                    /* constant shift costs */
  75   {COSTS_N_BYTES (3),                   /* cost of starting multiply for QI */
  76    COSTS_N_BYTES (3),                   /*                               HI */
  77    COSTS_N_BYTES (3),                   /*                               SI */
  78    COSTS_N_BYTES (3),                   /*                               DI */
  79    COSTS_N_BYTES (5)},                  /*                            other */
  80   0,                                    /* cost of multiply per each bit set */
  81   {COSTS_N_BYTES (3),                   /* cost of a divide/mod for QI */
  82    COSTS_N_BYTES (3),                   /*                          HI */
  83    COSTS_N_BYTES (3),                   /*                          SI */
  84    COSTS_N_BYTES (3),                   /*                          DI */
  85    COSTS_N_BYTES (5)},                  /*                          other */
  86   COSTS_N_BYTES (3),                    /* cost of movsx */
  87   COSTS_N_BYTES (3),                    /* cost of movzx */
  88   0,                                    /* "large" insn */
  89   2,                                    /* MOVE_RATIO */
  90   2,                                    /* CLEAR_RATIO */
  91   {2, 2, 2},                            /* cost of loading integer registers
  92                                            in QImode, HImode and SImode.
  93                                            Relative to reg-reg move (2).  */
  94   {2, 2, 2},                            /* cost of storing integer registers */
  95   {3, 3, 3, 3, 3},                      /* cost of loading SSE register
  96                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
  97   {3, 3, 3, 3, 3},                      /* cost of storing SSE register
  98                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
  99   {3, 3, 3, 3, 3},                      /* cost of unaligned SSE load
 100                                            in 128bit, 256bit and 512bit.
                                                NOTE(review): the array has five
                                                entries like the 32..512-bit tables
                                                above; this width list looks stale
                                                -- confirm.  */
 101   {3, 3, 3, 3, 3},                      /* cost of unaligned SSE store
 102                                            in 128bit, 256bit and 512bit.
                                                NOTE(review): five entries here as
                                                well; same remark as for the
                                                unaligned load -- confirm.  */
 103   3, 3, 3,                              /* cost of moving XMM,YMM,ZMM register */
 104   3,                                    /* cost of moving SSE register to integer.  */
 105   5, 0,                                 /* Gather load static, per_elt.  */
 106   5, 0,                                 /* Gather store static, per_elt.  */
 107   0,                                    /* size of l1 cache  */
 108   0,                                    /* size of l2 cache  */
 109   0,                                    /* size of prefetch block */
 110   0,                                    /* number of parallel prefetches */
 111   2,                                    /* Branch cost */
 112   COSTS_N_BYTES (2),                    /* cost of FADD and FSUB insns.  */
 113   COSTS_N_BYTES (2),                    /* cost of FMUL instruction.  */
 114   COSTS_N_BYTES (2),                    /* cost of FDIV instruction.  */
 115   COSTS_N_BYTES (2),                    /* cost of FABS instruction.  */
 116   COSTS_N_BYTES (2),                    /* cost of FCHS instruction.  */
 117   COSTS_N_BYTES (2),                    /* cost of FSQRT instruction.  */
 118
 119   COSTS_N_BYTES (2),                    /* cost of cheap SSE instruction.  */
 120   COSTS_N_BYTES (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 121   COSTS_N_BYTES (2),                    /* cost of MULSS instruction.  */
 122   COSTS_N_BYTES (2),                    /* cost of MULSD instruction.  */
 123   COSTS_N_BYTES (2),                    /* cost of FMA SS instruction.  */
 124   COSTS_N_BYTES (2),                    /* cost of FMA SD instruction.  */
 125   COSTS_N_BYTES (2),                    /* cost of DIVSS instruction.  */
 126   COSTS_N_BYTES (2),                    /* cost of DIVSD instruction.  */
 127   COSTS_N_BYTES (2),                    /* cost of SQRTSS instruction.  */
 128   COSTS_N_BYTES (2),                    /* cost of SQRTSD instruction.  */
 129   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 130   ix86_size_memcpy,                     /* memcpy strategy table (defined above).  */
 131   ix86_size_memset,                     /* memset strategy table (defined above).  */
 132   COSTS_N_BYTES (1),                    /* cond_taken_branch_cost.  */
 133   COSTS_N_BYTES (1),                    /* cond_not_taken_branch_cost.  */
 134   NULL,                                 /* Loop alignment.  */
 135   NULL,                                 /* Jump alignment.  */
 136   NULL,                                 /* Label alignment.  */
 137   NULL,                                 /* Func alignment.  */
 138   4,                                    /* Small unroll limit.  */
 139   2,                                    /* Small unroll factor.  */
 140 };
 141
 142 /* Processor costs (relative to an add) */
 143 static stringop_algs i386_memcpy[2] = {
       /* memcpy strategy table referenced by i386_cost.  The first element
          names only rep_prefix_1_byte; the second is the DUMMY_STRINGOP_ALGS
          filler (libcall).
          NOTE(review): which target mode each element applies to is not
          visible here -- confirm against the stringop_algs users.  */
 144   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
 145   DUMMY_STRINGOP_ALGS};
 146 static stringop_algs i386_memset[2] = {
       /* memset strategy table referenced by i386_cost.  The first element
          names only rep_prefix_1_byte; the second is the DUMMY_STRINGOP_ALGS
          filler (libcall).  */
 147   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
 148   DUMMY_STRINGOP_ALGS};
 149
 150 static const
 151 struct processor_costs i386_cost = {    /* 386 specific costs.
       Instruction costs in this table are written with COSTS_N_INSNS.
       The initializer is positional: the order of the entries must match
       the field order of struct processor_costs (declared elsewhere).  */
 152   {
 153   /* Start of register allocator costs.  integer->integer move cost is 2. */
 154   4,                                 /* cost for loading QImode using movzbl */
 155   {2, 4, 2},                            /* cost of loading integer registers
 156                                            in QImode, HImode and SImode.
 157                                            Relative to reg-reg move (2).  */
 158   {2, 4, 2},                            /* cost of storing integer registers */
 159   2,                                    /* cost of reg,reg fld/fst */
 160   {8, 8, 8},                            /* cost of loading fp registers
 161                                            in SFmode, DFmode and XFmode */
 162   {8, 8, 8},                            /* cost of storing fp registers
 163                                            in SFmode, DFmode and XFmode */
 164   2,                                    /* cost of moving MMX register */
 165   {4, 8},                               /* cost of loading MMX registers
 166                                            in SImode and DImode */
 167   {4, 8},                               /* cost of storing MMX registers
 168                                            in SImode and DImode */
 169   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 170   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 171                                            in 32,64,128,256 and 512-bit */
 172   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 173                                            in 32,64,128,256 and 512-bit */
 174   3, 3,                         /* SSE->integer and integer->SSE moves */
 175   3, 3,                         /* mask->integer and integer->mask moves */
 176   {2, 4, 2},                            /* cost of loading mask register
 177                                            in QImode, HImode, SImode.  */
 178   {2, 4, 2},                            /* cost of storing mask register
 179                                            in QImode, HImode, SImode.  */
 180   2,                                    /* cost of moving mask register.  */
 181   /* End of register allocator costs.  */
 182   },
 183
 184   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 185   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 186   COSTS_N_INSNS (3),                    /* variable shift costs */
 187   COSTS_N_INSNS (2),                    /* constant shift costs */
 188   {COSTS_N_INSNS (6),                   /* cost of starting multiply for QI */
 189    COSTS_N_INSNS (6),                   /*                               HI */
 190    COSTS_N_INSNS (6),                   /*                               SI */
 191    COSTS_N_INSNS (6),                   /*                               DI */
 192    COSTS_N_INSNS (6)},                  /*                            other */
 193   COSTS_N_INSNS (1),                    /* cost of multiply per each bit set.
                                                NOTE(review): i486_cost writes a
                                                bare 1 for this entry -- confirm
                                                which unit is intended.  */
 194   {COSTS_N_INSNS (23),                  /* cost of a divide/mod for QI */
 195    COSTS_N_INSNS (23),                  /*                          HI */
 196    COSTS_N_INSNS (23),                  /*                          SI */
 197    COSTS_N_INSNS (23),                  /*                          DI */
 198    COSTS_N_INSNS (23)},                 /*                          other */
 199   COSTS_N_INSNS (3),                    /* cost of movsx */
 200   COSTS_N_INSNS (2),                    /* cost of movzx */
 201   15,                                   /* "large" insn */
 202   3,                                    /* MOVE_RATIO */
 203   3,                                    /* CLEAR_RATIO */
 204   {2, 4, 2},                            /* cost of loading integer registers
 205                                            in QImode, HImode and SImode.
 206                                            Relative to reg-reg move (2).  */
 207   {2, 4, 2},                            /* cost of storing integer registers */
 208   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 209                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 210   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 211                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 212   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 213   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 214   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 215   3,                                    /* cost of moving SSE register to integer.  */
 216   4, 4,                                 /* Gather load static, per_elt.  */
 217   4, 4,                                 /* Gather store static, per_elt.  */
 218   0,                                    /* size of l1 cache  */
 219   0,                                    /* size of l2 cache  */
 220   0,                                    /* size of prefetch block */
 221   0,                                    /* number of parallel prefetches */
 222   1,                                    /* Branch cost */
 223   COSTS_N_INSNS (23),                   /* cost of FADD and FSUB insns.  */
 224   COSTS_N_INSNS (27),                   /* cost of FMUL instruction.  */
 225   COSTS_N_INSNS (88),                   /* cost of FDIV instruction.  */
 226   COSTS_N_INSNS (22),                   /* cost of FABS instruction.  */
 227   COSTS_N_INSNS (24),                   /* cost of FCHS instruction.  */
 228   COSTS_N_INSNS (122),                  /* cost of FSQRT instruction.  */
 229
 230   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 231   COSTS_N_INSNS (23),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
 232   COSTS_N_INSNS (27),                   /* cost of MULSS instruction.  */
 233   COSTS_N_INSNS (27),                   /* cost of MULSD instruction.  */
 234   COSTS_N_INSNS (27),                   /* cost of FMA SS instruction.  */
 235   COSTS_N_INSNS (27),                   /* cost of FMA SD instruction.  */
 236   COSTS_N_INSNS (88),                   /* cost of DIVSS instruction.  */
 237   COSTS_N_INSNS (88),                   /* cost of DIVSD instruction.  */
 238   COSTS_N_INSNS (122),                  /* cost of SQRTSS instruction.  */
 239   COSTS_N_INSNS (122),                  /* cost of SQRTSD instruction.  */
 240   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 241   i386_memcpy,                          /* memcpy strategy table (defined above).  */
 242   i386_memset,                          /* memset strategy table (defined above).  */
 243   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 244   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 245   "4",                                  /* Loop alignment.  */
 246   "4",                                  /* Jump alignment.  */
 247   NULL,                                 /* Label alignment.  */
 248   "4",                                  /* Func alignment.  */
 249   4,                                    /* Small unroll limit.  */
 250   2,                                    /* Small unroll factor.  */
 251 };
 252
 253 static stringop_algs i486_memcpy[2] = {
       /* memcpy strategy table referenced by i486_cost.  The first element
          names only rep_prefix_4_byte; the second is the DUMMY_STRINGOP_ALGS
          filler (libcall).  */
 254   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 255   DUMMY_STRINGOP_ALGS};
 256 static stringop_algs i486_memset[2] = {
       /* memset strategy table referenced by i486_cost.  The first element
          names only rep_prefix_4_byte; the second is the DUMMY_STRINGOP_ALGS
          filler (libcall).  */
 257   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 258   DUMMY_STRINGOP_ALGS};
 259
 260 static const
 261 struct processor_costs i486_cost = {    /* 486 specific costs.
       Instruction costs in this table are written with COSTS_N_INSNS.
       The initializer is positional: the order of the entries must match
       the field order of struct processor_costs (declared elsewhere).  */
 262   {
 263   /* Start of register allocator costs.  integer->integer move cost is 2. */
 264   4,                                 /* cost for loading QImode using movzbl */
 265   {2, 4, 2},                            /* cost of loading integer registers
 266                                            in QImode, HImode and SImode.
 267                                            Relative to reg-reg move (2).  */
 268   {2, 4, 2},                            /* cost of storing integer registers */
 269   2,                                    /* cost of reg,reg fld/fst */
 270   {8, 8, 8},                            /* cost of loading fp registers
 271                                            in SFmode, DFmode and XFmode */
 272   {8, 8, 8},                            /* cost of storing fp registers
 273                                            in SFmode, DFmode and XFmode */
 274   2,                                    /* cost of moving MMX register */
 275   {4, 8},                               /* cost of loading MMX registers
 276                                            in SImode and DImode */
 277   {4, 8},                               /* cost of storing MMX registers
 278                                            in SImode and DImode */
 279   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 280   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 281                                            in 32,64,128,256 and 512-bit */
 282   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 283                                            in 32,64,128,256 and 512-bit */
 284   3, 3,                         /* SSE->integer and integer->SSE moves */
 285   3, 3,                         /* mask->integer and integer->mask moves */
 286   {2, 4, 2},                            /* cost of loading mask register
 287                                            in QImode, HImode, SImode.  */
 288   {2, 4, 2},                            /* cost of storing mask register
 289                                            in QImode, HImode, SImode.  */
 290   2,                                    /* cost of moving mask register.  */
 291   /* End of register allocator costs.  */
 292   },
 293
 294   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 295   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 296   COSTS_N_INSNS (3),                    /* variable shift costs */
 297   COSTS_N_INSNS (2),                    /* constant shift costs */
 298   {COSTS_N_INSNS (12),                  /* cost of starting multiply for QI */
 299    COSTS_N_INSNS (12),                  /*                               HI */
 300    COSTS_N_INSNS (12),                  /*                               SI */
 301    COSTS_N_INSNS (12),                  /*                               DI */
 302    COSTS_N_INSNS (12)},                 /*                            other */
 303   1,                                    /* cost of multiply per each bit set.
                                                NOTE(review): a bare 1 here, while
                                                i386_cost writes COSTS_N_INSNS (1)
                                                for this entry -- confirm which
                                                unit is intended.  */
 304   {COSTS_N_INSNS (40),                  /* cost of a divide/mod for QI */
 305    COSTS_N_INSNS (40),                  /*                          HI */
 306    COSTS_N_INSNS (40),                  /*                          SI */
 307    COSTS_N_INSNS (40),                  /*                          DI */
 308    COSTS_N_INSNS (40)},                 /*                          other */
 309   COSTS_N_INSNS (3),                    /* cost of movsx */
 310   COSTS_N_INSNS (2),                    /* cost of movzx */
 311   15,                                   /* "large" insn */
 312   3,                                    /* MOVE_RATIO */
 313   3,                                    /* CLEAR_RATIO */
 314   {2, 4, 2},                            /* cost of loading integer registers
 315                                            in QImode, HImode and SImode.
 316                                            Relative to reg-reg move (2).  */
 317   {2, 4, 2},                            /* cost of storing integer registers */
 318   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 319                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 320   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 321                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 322   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 323   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 324   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 325   3,                                    /* cost of moving SSE register to integer.  */
 326   4, 4,                                 /* Gather load static, per_elt.  */
 327   4, 4,                                 /* Gather store static, per_elt.  */
 328   4,                                    /* size of l1 cache.  486 has 8kB cache
 329                                            shared for code and data, so 4kB is
 330                                            not really precise.  */
 331   4,                                    /* size of l2 cache  */
 332   0,                                    /* size of prefetch block */
 333   0,                                    /* number of parallel prefetches */
 334   1,                                    /* Branch cost */
 335   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
 336   COSTS_N_INSNS (16),                   /* cost of FMUL instruction.  */
 337   COSTS_N_INSNS (73),                   /* cost of FDIV instruction.  */
 338   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
 339   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
 340   COSTS_N_INSNS (83),                   /* cost of FSQRT instruction.  */
 341
 342   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 343   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 344   COSTS_N_INSNS (16),                   /* cost of MULSS instruction.  */
 345   COSTS_N_INSNS (16),                   /* cost of MULSD instruction.  */
 346   COSTS_N_INSNS (16),                   /* cost of FMA SS instruction.  */
 347   COSTS_N_INSNS (16),                   /* cost of FMA SD instruction.  */
 348   COSTS_N_INSNS (73),                   /* cost of DIVSS instruction.  */
 349   COSTS_N_INSNS (74),                   /* cost of DIVSD instruction.
                                                NOTE(review): 74 differs from the
                                                DIVSS and FDIV entries (73) in this
                                                table; the other complete tables in
                                                view use one value for all three
                                                -- confirm.  */
 350   COSTS_N_INSNS (83),                   /* cost of SQRTSS instruction.  */
 351   COSTS_N_INSNS (83),                   /* cost of SQRTSD instruction.  */
 352   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 353   i486_memcpy,                          /* memcpy strategy table (defined above).  */
 354   i486_memset,                          /* memset strategy table (defined above).  */
 355   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 356   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 357   "16",                                 /* Loop alignment.  */
 358   "16",                                 /* Jump alignment.  */
 359   "0:0:8",                              /* Label alignment.  */
 360   "16",                                 /* Func alignment.  */
 361   4,                                    /* Small unroll limit.  */
 362   2,                                    /* Small unroll factor.  */
 363 };
 364
 365 static stringop_algs pentium_memcpy[2] = {
       /* memcpy strategy table referenced by pentium_cost.  The first element
          lists rep_prefix_4_byte with the value 256 followed by libcall with
          -1; the second is the DUMMY_STRINGOP_ALGS filler (libcall).
          NOTE(review): presumably 256 is an upper size bound for the
          rep_prefix_4_byte strategy and -1 means "any larger size" --
          confirm against the stringop_algs declaration.  */
 366   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 367   DUMMY_STRINGOP_ALGS};
 368 static stringop_algs pentium_memset[2] = {
       /* memset strategy table referenced by pentium_cost.  The first element
          starts with libcall and lists rep_prefix_4_byte with -1; the second
          is the DUMMY_STRINGOP_ALGS filler (libcall).
          NOTE(review): the role of the leading libcall field is not visible
          here -- confirm against the stringop_algs declaration.  */
 369   {libcall, {{-1, rep_prefix_4_byte, false}}},
 370   DUMMY_STRINGOP_ALGS};
 371
 372 static const
 373 struct processor_costs pentium_cost = {  /* Pentium cost table.
       Instruction costs in this table are written with COSTS_N_INSNS.
       The initializer is positional: the order of the entries must match
       the field order of struct processor_costs (declared elsewhere).  */
 374   {
 375   /* Start of register allocator costs.  integer->integer move cost is 2. */
 376   6,                                 /* cost for loading QImode using movzbl */
 377   {2, 4, 2},                            /* cost of loading integer registers
 378                                            in QImode, HImode and SImode.
 379                                            Relative to reg-reg move (2).  */
 380   {2, 4, 2},                            /* cost of storing integer registers */
 381   2,                                    /* cost of reg,reg fld/fst */
 382   {2, 2, 6},                            /* cost of loading fp registers
 383                                            in SFmode, DFmode and XFmode */
 384   {4, 4, 6},                            /* cost of storing fp registers
 385                                            in SFmode, DFmode and XFmode */
 386   8,                                    /* cost of moving MMX register */
 387   {8, 8},                               /* cost of loading MMX registers
 388                                            in SImode and DImode */
 389   {8, 8},                               /* cost of storing MMX registers
 390                                            in SImode and DImode */
 391   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 392   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 393                                            in 32,64,128,256 and 512-bit */
 394   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 395                                            in 32,64,128,256 and 512-bit */
 396   3, 3,                         /* SSE->integer and integer->SSE moves */
 397   3, 3,                         /* mask->integer and integer->mask moves */
 398   {2, 4, 2},                            /* cost of loading mask register
 399                                            in QImode, HImode, SImode.  */
 400   {2, 4, 2},                            /* cost of storing mask register
 401                                            in QImode, HImode, SImode.  */
 402   2,                                    /* cost of moving mask register.  */
 403   /* End of register allocator costs.  */
 404   },
 405
 406   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 407   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 408   COSTS_N_INSNS (4),                    /* variable shift costs */
 409   COSTS_N_INSNS (1),                    /* constant shift costs */
 410   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 411    COSTS_N_INSNS (11),                  /*                               HI */
 412    COSTS_N_INSNS (11),                  /*                               SI */
 413    COSTS_N_INSNS (11),                  /*                               DI */
 414    COSTS_N_INSNS (11)},                 /*                            other */
 415   0,                                    /* cost of multiply per each bit set */
 416   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 417    COSTS_N_INSNS (25),                  /*                          HI */
 418    COSTS_N_INSNS (25),                  /*                          SI */
 419    COSTS_N_INSNS (25),                  /*                          DI */
 420    COSTS_N_INSNS (25)},                 /*                          other */
 421   COSTS_N_INSNS (3),                    /* cost of movsx */
 422   COSTS_N_INSNS (2),                    /* cost of movzx */
 423   8,                                    /* "large" insn */
 424   6,                                    /* MOVE_RATIO */
 425   6,                                    /* CLEAR_RATIO */
 426   {2, 4, 2},                            /* cost of loading integer registers
 427                                            in QImode, HImode and SImode.
 428                                            Relative to reg-reg move (2).  */
 429   {2, 4, 2},                            /* cost of storing integer registers */
 430   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 431                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 432   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 433                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 434   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 435   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 436   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 437   3,                                    /* cost of moving SSE register to integer.  */
 438   4, 4,                                 /* Gather load static, per_elt.  */
 439   4, 4,                                 /* Gather store static, per_elt.  */
 440   8,                                    /* size of l1 cache.  */
 441   8,                                    /* size of l2 cache  */
 442   0,                                    /* size of prefetch block */
 443   0,                                    /* number of parallel prefetches */
 444   2,                                    /* Branch cost */
 445   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 446   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 447   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 448   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 449   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 450   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 451
 452   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 453   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 454   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
 455   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
 456   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
 457   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
 458   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
 459   COSTS_N_INSNS (39),                   /* cost of DIVSD instruction.  */
 460   COSTS_N_INSNS (70),                   /* cost of SQRTSS instruction.  */
 461   COSTS_N_INSNS (70),                   /* cost of SQRTSD instruction.  */
 462   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 463   pentium_memcpy,                       /* memcpy strategy table (defined above).  */
 464   pentium_memset,                       /* memset strategy table (defined above).  */
 465   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 466   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 467   "16:8:8",                             /* Loop alignment.  */
 468   "16:8:8",                             /* Jump alignment.  */
 469   "0:0:8",                              /* Label alignment.  */
 470   "16",                                 /* Func alignment.  */
 471   4,                                    /* Small unroll limit.  */
 472   2,                                    /* Small unroll factor.  */
 473 };
 474
 475 static const
 476 struct processor_costs lakemont_cost = {/* costs for tuning for Lakemont */
 477   {
 478   /* Start of register allocator costs.  integer->integer move cost is 2. */
 479   6,                                 /* cost for loading QImode using movzbl */
 480   {2, 4, 2},                            /* cost of loading integer registers
 481                                            in QImode, HImode and SImode.
 482                                            Relative to reg-reg move (2).  */
 483   {2, 4, 2},                            /* cost of storing integer registers */
 484   2,                                    /* cost of reg,reg fld/fst */
 485   {2, 2, 6},                            /* cost of loading fp registers
 486                                            in SFmode, DFmode and XFmode */
 487   {4, 4, 6},                            /* cost of storing fp registers
 488                                            in SFmode, DFmode and XFmode */
 489   8,                                    /* cost of moving MMX register */
 490   {8, 8},                               /* cost of loading MMX registers
 491                                            in SImode and DImode */
 492   {8, 8},                               /* cost of storing MMX registers
 493                                            in SImode and DImode */
 494   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 495   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 496                                            in 32,64,128,256 and 512-bit */
 497   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 498                                            in 32,64,128,256 and 512-bit */
 499   3, 3,                         /* SSE->integer and integer->SSE moves */
 500   3, 3,                         /* mask->integer and integer->mask moves */
 501   {2, 4, 2},                            /* cost of loading mask register
 502                                            in QImode, HImode, SImode.  */
 503   {2, 4, 2},                            /* cost of storing mask register
 504                                            in QImode, HImode, SImode.  */
 505   2,                                    /* cost of moving mask register.  */
 506   /* End of register allocator costs.  */
 507   },
 508
 509   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 510   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction
                                                (one unit above an add) */
 511   COSTS_N_INSNS (1),                    /* variable shift costs */
 512   COSTS_N_INSNS (1),                    /* constant shift costs */
 513   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 514    COSTS_N_INSNS (11),                  /*                               HI */
 515    COSTS_N_INSNS (11),                  /*                               SI */
 516    COSTS_N_INSNS (11),                  /*                               DI */
 517    COSTS_N_INSNS (11)},                 /*                            other */
 518   0,                                    /* cost of multiply per each bit set */
 519   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 520    COSTS_N_INSNS (25),                  /*                          HI */
 521    COSTS_N_INSNS (25),                  /*                          SI */
 522    COSTS_N_INSNS (25),                  /*                          DI */
 523    COSTS_N_INSNS (25)},                 /*                          other */
 524   COSTS_N_INSNS (3),                    /* cost of movsx */
 525   COSTS_N_INSNS (2),                    /* cost of movzx */
 526   8,                                    /* "large" insn */
 527   17,                                   /* MOVE_RATIO */
 528   6,                                    /* CLEAR_RATIO */
 529   {2, 4, 2},                            /* cost of loading integer registers
 530                                            in QImode, HImode and SImode.
 531                                            Relative to reg-reg move (2).  */
 532   {2, 4, 2},                            /* cost of storing integer registers */
 533   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 534                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 535   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 536                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 537   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 538   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 539   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 540   3,                                    /* cost of moving SSE register to integer.  */
 541   4, 4,                                 /* Gather load static, per_elt.  */
 542   4, 4,                                 /* Gather store static, per_elt.  */
 543   8,                                    /* size of l1 cache.  */
 544   8,                                    /* size of l2 cache  */
 545   0,                                    /* size of prefetch block */
 546   0,                                    /* number of parallel prefetches */
 547   2,                                    /* Branch cost */
 548   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 549   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 550   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 551   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 552   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 553   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 554
 555   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 556   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 557   COSTS_N_INSNS (5),                    /* cost of MULSS instruction.  */
 558   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
 559   COSTS_N_INSNS (10),                   /* cost of FMA SS instruction.  */
 560   COSTS_N_INSNS (10),                   /* cost of FMA SD instruction.  */
 561   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
 562   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
 563   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 564   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
 565   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 566   pentium_memcpy,                       /* memcpy strategies; reuses the pentium table.  */
 567   pentium_memset,                       /* memset strategies; reuses the pentium table.  */
 568   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 569   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 570   "16:8:8",                             /* Loop alignment.  */
 571   "16:8:8",                             /* Jump alignment.  */
 572   "0:0:8",                              /* Label alignment.  */
 573   "16",                                 /* Func alignment.  */
 574   4,                                    /* Small unroll limit.  */
 575   2,                                    /* Small unroll factor.  */
 576 };
 577
 578 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 579    (we ensure the alignment).  For small blocks an inline loop is still a
 580    noticeable win; for bigger blocks either rep movsl or rep movsb is the
 581    way to go.  Rep movsb apparently has a more expensive startup time in the
 582    CPU, but after 4K the difference is down in the noise.  */
 583 static stringop_algs pentiumpro_memcpy[2] = {/* memcpy strategies used by pentiumpro_cost */
 584   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
 585                        {8192, rep_prefix_4_byte, false},
 586                        {-1, rep_prefix_1_byte, false}}},
 587   DUMMY_STRINGOP_ALGS};                 /* second slot: libcall-only placeholder.  */
 588 static stringop_algs pentiumpro_memset[2] = {/* memset strategies used by pentiumpro_cost */
 589   {rep_prefix_4_byte, {{1024, unrolled_loop, false},
 590                        {8192, rep_prefix_4_byte, false},
 591                        {-1, libcall, false}}},
 592   DUMMY_STRINGOP_ALGS};                 /* second slot: libcall-only placeholder.  */
 593 static const
 594 struct processor_costs pentiumpro_cost = {/* costs for tuning for PentiumPro */
 595   {
 596   /* Start of register allocator costs.  integer->integer move cost is 2. */
 597   2,                                 /* cost for loading QImode using movzbl */
 598   {4, 4, 4},                            /* cost of loading integer registers
 599                                            in QImode, HImode and SImode.
 600                                            Relative to reg-reg move (2).  */
 601   {2, 2, 2},                            /* cost of storing integer registers */
 602   2,                                    /* cost of reg,reg fld/fst */
 603   {2, 2, 6},                            /* cost of loading fp registers
 604                                            in SFmode, DFmode and XFmode */
 605   {4, 4, 6},                            /* cost of storing fp registers
 606                                            in SFmode, DFmode and XFmode */
 607   2,                                    /* cost of moving MMX register */
 608   {2, 2},                               /* cost of loading MMX registers
 609                                            in SImode and DImode */
 610   {2, 2},                               /* cost of storing MMX registers
 611                                            in SImode and DImode */
 612   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 613   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 614                                            in 32,64,128,256 and 512-bit */
 615   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 616                                            in 32,64,128,256 and 512-bit */
 617   3, 3,                         /* SSE->integer and integer->SSE moves */
 618   3, 3,                         /* mask->integer and integer->mask moves */
 619   {4, 4, 4},                            /* cost of loading mask register
 620                                            in QImode, HImode, SImode.  */
 621   {2, 2, 2},                            /* cost of storing mask register
 622                                            in QImode, HImode, SImode.  */
 623   2,                                    /* cost of moving mask register.  */
 624   /* End of register allocator costs.  */
 625   },
 626
 627   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 628   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 629   COSTS_N_INSNS (1),                    /* variable shift costs */
 630   COSTS_N_INSNS (1),                    /* constant shift costs */
 631   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
 632    COSTS_N_INSNS (4),                   /*                               HI */
 633    COSTS_N_INSNS (4),                   /*                               SI */
 634    COSTS_N_INSNS (4),                   /*                               DI */
 635    COSTS_N_INSNS (4)},                  /*                            other */
 636   0,                                    /* cost of multiply per each bit set */
 637   {COSTS_N_INSNS (17),                  /* cost of a divide/mod for QI */
 638    COSTS_N_INSNS (17),                  /*                          HI */
 639    COSTS_N_INSNS (17),                  /*                          SI */
 640    COSTS_N_INSNS (17),                  /*                          DI */
 641    COSTS_N_INSNS (17)},                 /*                          other */
 642   COSTS_N_INSNS (1),                    /* cost of movsx */
 643   COSTS_N_INSNS (1),                    /* cost of movzx */
 644   8,                                    /* "large" insn */
 645   6,                                    /* MOVE_RATIO */
 646   6,                                    /* CLEAR_RATIO */
 647   {4, 4, 4},                            /* cost of loading integer registers
 648                                            in QImode, HImode and SImode.
 649                                            Relative to reg-reg move (2).  */
 650   {2, 2, 2},                            /* cost of storing integer registers */
 651   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 652                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 653   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 654                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 655   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 656   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 657   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 658   3,                                    /* cost of moving SSE register to integer.  */
 659   4, 4,                                 /* Gather load static, per_elt.  */
 660   4, 4,                                 /* Gather store static, per_elt.  */
 661   8,                                    /* size of l1 cache.  */
 662   256,                                  /* size of l2 cache  */
 663   32,                                   /* size of prefetch block */
 664   6,                                    /* number of parallel prefetches */
 665   2,                                    /* Branch cost */
 666   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 667   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
 668   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 669   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 670   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 671   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 672
 673   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 674   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 675   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 676   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 677   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
 678   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
 679   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
 680   COSTS_N_INSNS (18),                   /* cost of DIVSD instruction.  */
 681   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 682   COSTS_N_INSNS (31),                   /* cost of SQRTSD instruction.  */
 683   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 684   pentiumpro_memcpy,                    /* memcpy strategies (table defined above).  */
 685   pentiumpro_memset,                    /* memset strategies (table defined above).  */
 686   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 687   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 688   "16",                                 /* Loop alignment.  */
 689   "16:11:8",                            /* Jump alignment.  */
 690   "0:0:8",                              /* Label alignment.  */
 691   "16",                                 /* Func alignment.  */
 692   4,                                    /* Small unroll limit.  */
 693   2,                                    /* Small unroll factor.  */
 694 };
 695
 696 static stringop_algs geode_memcpy[2] = {/* memcpy strategies used by geode_cost */
 697   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 698   DUMMY_STRINGOP_ALGS};                 /* second slot: libcall-only placeholder.  */
 699 static stringop_algs geode_memset[2] = {/* memset strategies used by geode_cost */
 700   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 701   DUMMY_STRINGOP_ALGS};                 /* second slot: libcall-only placeholder.  */
 702 static const
 703 struct processor_costs geode_cost = {/* costs for tuning for Geode */
 704   {
 705   /* Start of register allocator costs.  integer->integer move cost is 2. */
 706   2,                                 /* cost for loading QImode using movzbl */
 707   {2, 2, 2},                            /* cost of loading integer registers
 708                                            in QImode, HImode and SImode.
 709                                            Relative to reg-reg move (2).  */
 710   {2, 2, 2},                            /* cost of storing integer registers */
 711   2,                                    /* cost of reg,reg fld/fst */
 712   {2, 2, 2},                            /* cost of loading fp registers
 713                                            in SFmode, DFmode and XFmode */
 714   {4, 6, 6},                            /* cost of storing fp registers
 715                                            in SFmode, DFmode and XFmode */
 716   2,                                    /* cost of moving MMX register */
 717   {2, 2},                               /* cost of loading MMX registers
 718                                            in SImode and DImode */
 719   {2, 2},                               /* cost of storing MMX registers
 720                                            in SImode and DImode */
 721   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 722   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 723                                            in 32,64,128,256 and 512-bit */
 724   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 725                                            in 32,64,128,256 and 512-bit */
 726   6, 6,                         /* SSE->integer and integer->SSE moves */
 727   6, 6,                         /* mask->integer and integer->mask moves */
 728   {2, 2, 2},                            /* cost of loading mask register
 729                                            in QImode, HImode, SImode.  */
 730   {2, 2, 2},                            /* cost of storing mask register
 731                                            in QImode, HImode, SImode.  */
 732   2,                                    /* cost of moving mask register.  */
 733   /* End of register allocator costs.  */
 734   },
 735
 736   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 737   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 738   COSTS_N_INSNS (2),                    /* variable shift costs */
 739   COSTS_N_INSNS (1),                    /* constant shift costs */
 740   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 741    COSTS_N_INSNS (4),                   /*                               HI */
 742    COSTS_N_INSNS (7),                   /*                               SI */
 743    COSTS_N_INSNS (7),                   /*                               DI */
 744    COSTS_N_INSNS (7)},                  /*                            other */
 745   0,                                    /* cost of multiply per each bit set */
 746   {COSTS_N_INSNS (15),                  /* cost of a divide/mod for QI */
 747    COSTS_N_INSNS (23),                  /*                          HI */
 748    COSTS_N_INSNS (39),                  /*                          SI */
 749    COSTS_N_INSNS (39),                  /*                          DI */
 750    COSTS_N_INSNS (39)},                 /*                          other */
 751   COSTS_N_INSNS (1),                    /* cost of movsx */
 752   COSTS_N_INSNS (1),                    /* cost of movzx */
 753   8,                                    /* "large" insn */
 754   4,                                    /* MOVE_RATIO */
 755   4,                                    /* CLEAR_RATIO */
 756   {2, 2, 2},                            /* cost of loading integer registers
 757                                            in QImode, HImode and SImode.
 758                                            Relative to reg-reg move (2).  */
 759   {2, 2, 2},                            /* cost of storing integer registers */
 760   {2, 2, 8, 16, 32},                    /* cost of loading SSE register
 761                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 762   {2, 2, 8, 16, 32},                    /* cost of storing SSE register
 763                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 764   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 765   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 766   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 767   6,                                    /* cost of moving SSE register to integer.  */
 768   2, 2,                                 /* Gather load static, per_elt.  */
 769   2, 2,                                 /* Gather store static, per_elt.  */
 770   64,                                   /* size of l1 cache.  */
 771   128,                                  /* size of l2 cache.  */
 772   32,                                   /* size of prefetch block */
 773   1,                                    /* number of parallel prefetches */
 774   1,                                    /* Branch cost */
 775   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
 776   COSTS_N_INSNS (11),                   /* cost of FMUL instruction.  */
 777   COSTS_N_INSNS (47),                   /* cost of FDIV instruction.  */
 778   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 779   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 780   COSTS_N_INSNS (54),                   /* cost of FSQRT instruction.  */
 781
 782   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 783   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 784   COSTS_N_INSNS (11),                   /* cost of MULSS instruction.  */
 785   COSTS_N_INSNS (11),                   /* cost of MULSD instruction.  */
 786   COSTS_N_INSNS (17),                   /* cost of FMA SS instruction.  */
 787   COSTS_N_INSNS (17),                   /* cost of FMA SD instruction.  */
 788   COSTS_N_INSNS (47),                   /* cost of DIVSS instruction.  */
 789   COSTS_N_INSNS (47),                   /* cost of DIVSD instruction.  */
 790   COSTS_N_INSNS (54),                   /* cost of SQRTSS instruction.  */
 791   COSTS_N_INSNS (54),                   /* cost of SQRTSD instruction.  */
 792   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 793   geode_memcpy,                         /* memcpy strategies (table defined above).  */
 794   geode_memset,                         /* memset strategies (table defined above).  */
 795   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 796   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 797   NULL,                                 /* Loop alignment.  */
 798   NULL,                                 /* Jump alignment.  */
 799   NULL,                                 /* Label alignment.  */
 800   NULL,                                 /* Func alignment.  */
 801   4,                                    /* Small unroll limit.  */
 802   2,                                    /* Small unroll factor.  */
 803 };
 804
 805 static stringop_algs k6_memcpy[2] = {/* memcpy strategies used by k6_cost */
 806   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 807   DUMMY_STRINGOP_ALGS};                 /* second slot: libcall-only placeholder.  */
 808 static stringop_algs k6_memset[2] = {/* memset strategies used by k6_cost */
 809   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 810   DUMMY_STRINGOP_ALGS};                 /* second slot: libcall-only placeholder.  */
 811 static const
 812 struct processor_costs k6_cost = {/* costs for tuning for K6 */
 813   {
 814   /* Start of register allocator costs.  integer->integer move cost is 2. */
 815   3,                                 /* cost for loading QImode using movzbl */
 816   {4, 5, 4},                            /* cost of loading integer registers
 817                                            in QImode, HImode and SImode.
 818                                            Relative to reg-reg move (2).  */
 819   {2, 3, 2},                            /* cost of storing integer registers */
 820   4,                                    /* cost of reg,reg fld/fst */
 821   {6, 6, 6},                            /* cost of loading fp registers
 822                                            in SFmode, DFmode and XFmode */
 823   {4, 4, 4},                            /* cost of storing fp registers
 824                                            in SFmode, DFmode and XFmode */
 825   2,                                    /* cost of moving MMX register */
 826   {2, 2},                               /* cost of loading MMX registers
 827                                            in SImode and DImode */
 828   {2, 2},                               /* cost of storing MMX registers
 829                                            in SImode and DImode */
 830   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 831   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 832                                            in 32,64,128,256 and 512-bit */
 833   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 834                                            in 32,64,128,256 and 512-bit */
 835   6, 6,                         /* SSE->integer and integer->SSE moves */
 836   6, 6,                         /* mask->integer and integer->mask moves */
 837   {4, 5, 4},                            /* cost of loading mask register
 838                                            in QImode, HImode, SImode.  */
 839   {2, 3, 2},                            /* cost of storing mask register
 840                                            in QImode, HImode, SImode.  */
 841   2,                                    /* cost of moving mask register.  */
 842   /* End of register allocator costs.  */
 843   },
 844
 845   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 846   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 847   COSTS_N_INSNS (1),                    /* variable shift costs */
 848   COSTS_N_INSNS (1),                    /* constant shift costs */
 849   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 850    COSTS_N_INSNS (3),                   /*                               HI */
 851    COSTS_N_INSNS (3),                   /*                               SI */
 852    COSTS_N_INSNS (3),                   /*                               DI */
 853    COSTS_N_INSNS (3)},                  /*                            other */
 854   0,                                    /* cost of multiply per each bit set */
 855   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 856    COSTS_N_INSNS (18),                  /*                          HI */
 857    COSTS_N_INSNS (18),                  /*                          SI */
 858    COSTS_N_INSNS (18),                  /*                          DI */
 859    COSTS_N_INSNS (18)},                 /*                          other */
 860   COSTS_N_INSNS (2),                    /* cost of movsx */
 861   COSTS_N_INSNS (2),                    /* cost of movzx */
 862   8,                                    /* "large" insn */
 863   4,                                    /* MOVE_RATIO */
 864   4,                                    /* CLEAR_RATIO */
 865   {4, 5, 4},                            /* cost of loading integer registers
 866                                            in QImode, HImode and SImode.
 867                                            Relative to reg-reg move (2).  */
 868   {2, 3, 2},                            /* cost of storing integer registers */
 869   {2, 2, 8, 16, 32},                    /* cost of loading SSE register
 870                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 871   {2, 2, 8, 16, 32},                    /* cost of storing SSE register
 872                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 873   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 874   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 875   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 876   6,                                    /* cost of moving SSE register to integer.  */
 877   2, 2,                                 /* Gather load static, per_elt.  */
 878   2, 2,                                 /* Gather store static, per_elt.  */
 879   32,                                   /* size of l1 cache.  */
 880   32,                                   /* size of l2 cache.  Some models
 881                                            have integrated l2 cache, but
 882                                            optimizing for k6 is not important
 883                                            enough to worry about that.  */
 884   32,                                   /* size of prefetch block */
 885   1,                                    /* number of parallel prefetches */
 886   1,                                    /* Branch cost */
 887   COSTS_N_INSNS (2),                    /* cost of FADD and FSUB insns.  */
 888   COSTS_N_INSNS (2),                    /* cost of FMUL instruction.  */
 889   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 890   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 891   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 892   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 893
 894   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 895   COSTS_N_INSNS (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 896   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
 897   COSTS_N_INSNS (2),                    /* cost of MULSD instruction.  */
 898   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
 899   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
 900   COSTS_N_INSNS (56),                   /* cost of DIVSS instruction.  */
 901   COSTS_N_INSNS (56),                   /* cost of DIVSD instruction.  */
 902   COSTS_N_INSNS (56),                   /* cost of SQRTSS instruction.  */
 903   COSTS_N_INSNS (56),                   /* cost of SQRTSD instruction.  */
 904   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 905   k6_memcpy,                            /* memcpy strategies (table defined above).  */
 906   k6_memset,                            /* memset strategies (table defined above).  */
 907   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 908   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 909   "32:8:8",                             /* Loop alignment.  */
 910   "32:8:8",                             /* Jump alignment.  */
 911   "0:0:8",                              /* Label alignment.  */
 912   "32",                                 /* Func alignment.  */
 913   4,                                    /* Small unroll limit.  */
 914   2,                                    /* Small unroll factor.  */
 915 };
 916
 917 /* For some reason, Athlon deals better with the REP prefix (relative to
 918    loops) than K8 does.  Alignment becomes important after 8 bytes for
 919    memcpy and 128 bytes for memset.  */
 920 static stringop_algs athlon_memcpy[2] = {
       /* Athlon memcpy strategy table with two slots.
          NOTE(review): presumably each {N, alg, flag} triple selects ALG for
          blocks of up to N bytes, -1 marks the catch-all entry, and the
          leading libcall is the choice when the size is unknown -- confirm
          against the stringop_algs declaration.  */
       /* Slot 0: rep_prefix_4_byte is listed at 2048, libcall at -1.  */
 921   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
       /* Slot 1: the libcall-only placeholder defined at the top of the file.
          NOTE(review): presumably the 64-bit slot -- confirm.  */
 922   DUMMY_STRINGOP_ALGS};
 923 static stringop_algs athlon_memset[2] = {
       /* Athlon memset strategy table with two slots; the entries are the same
          as the ones used for Athlon memcpy.
          NOTE(review): presumably each {N, alg, flag} triple selects ALG for
          blocks of up to N bytes and -1 marks the catch-all entry -- confirm
          against the stringop_algs declaration.  */
       /* Slot 0: rep_prefix_4_byte is listed at 2048, libcall at -1.  */
 924   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
       /* Slot 1: the libcall-only placeholder defined at the top of the file.  */
 925   DUMMY_STRINGOP_ALGS};
 926 static const
 927 struct processor_costs athlon_cost = {
     /* Tuning cost table for Athlon.  The initializer is positional: every
        value is tied to its field only by order, and the trailing comment on
        each line names that field, so keep order and comments in sync when
        editing.  Per the note at the top of this file, processor costs are
        expressed relative to an add.  */
 928   {
 929   /* Start of register allocator costs.  integer->integer move cost is 2. */
 930   4,                                 /* cost for loading QImode using movzbl */
 931   {3, 4, 3},                            /* cost of loading integer registers
 932                                            in QImode, HImode and SImode.
 933                                            Relative to reg-reg move (2).  */
 934   {3, 4, 3},                            /* cost of storing integer registers */
 935   4,                                    /* cost of reg,reg fld/fst */
 936   {4, 4, 12},                           /* cost of loading fp registers
 937                                            in SFmode, DFmode and XFmode */
 938   {6, 6, 8},                            /* cost of storing fp registers
 939                                            in SFmode, DFmode and XFmode */
 940   2,                                    /* cost of moving MMX register */
 941   {4, 4},                               /* cost of loading MMX registers
 942                                            in SImode and DImode */
 943   {4, 4},                               /* cost of storing MMX registers
 944                                            in SImode and DImode */
 945   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 946   {4, 4, 12, 12, 24},                   /* cost of loading SSE registers
 947                                            in 32,64,128,256 and 512-bit */
 948   {4, 4, 10, 10, 20},                   /* cost of storing SSE registers
 949                                            in 32,64,128,256 and 512-bit */
 950   5, 5,                         /* SSE->integer and integer->SSE moves */
 951   5, 5,                         /* mask->integer and integer->mask moves */
 952   {3, 4, 3},                            /* cost of loading mask register
 953                                            in QImode, HImode, SImode.  */
 954   {3, 4, 3},                            /* cost of storing mask register
 955                                            in QImode, HImode, SImode.  */
 956   2,                                    /* cost of moving mask register.  */
 957   /* End of register allocator costs.  */
 958   },
 959
 960   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 961   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 962   COSTS_N_INSNS (1),                    /* variable shift costs */
 963   COSTS_N_INSNS (1),                    /* constant shift costs */
 964   {COSTS_N_INSNS (5),                   /* cost of starting multiply for QI */
 965    COSTS_N_INSNS (5),                   /*                               HI */
 966    COSTS_N_INSNS (5),                   /*                               SI */
 967    COSTS_N_INSNS (5),                   /*                               DI */
 968    COSTS_N_INSNS (5)},                  /*                            other */
 969   0,                                    /* cost of multiply per each bit set */
 970   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 971    COSTS_N_INSNS (26),                  /*                          HI */
 972    COSTS_N_INSNS (42),                  /*                          SI */
 973    COSTS_N_INSNS (74),                  /*                          DI */
 974    COSTS_N_INSNS (74)},                 /*                          other */
 975   COSTS_N_INSNS (1),                    /* cost of movsx */
 976   COSTS_N_INSNS (1),                    /* cost of movzx */
 977   8,                                    /* "large" insn */
 978   9,                                    /* MOVE_RATIO */
 979   6,                                    /* CLEAR_RATIO */
       /* The integer and SSE load/store costs and the XMM/YMM/ZMM move cost
          that follow repeat the values from the register allocator section
          above; this second copy sits outside that sub-aggregate.  Keep the
          two copies consistent unless a difference is intended.  */
 980   {3, 4, 3},                            /* cost of loading integer registers
 981                                            in QImode, HImode and SImode.
 982                                            Relative to reg-reg move (2).  */
 983   {3, 4, 3},                            /* cost of storing integer registers */
 984   {4, 4, 12, 12, 24},                   /* cost of loading SSE register
 985                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 986   {4, 4, 10, 10, 20},                   /* cost of storing SSE register
 987                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 988   {4, 4, 12, 12, 24},                   /* cost of unaligned loads.  */
 989   {4, 4, 10, 10, 20},                   /* cost of unaligned stores.  */
 990   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 991   5,                                    /* cost of moving SSE register to integer.  */
 992   4, 4,                                 /* Gather load static, per_elt.  */
 993   4, 4,                                 /* Gather store static, per_elt.  */
 994   64,                                   /* size of l1 cache.  */
 995   256,                                  /* size of l2 cache.  */
 996   64,                                   /* size of prefetch block */
 997   6,                                    /* number of parallel prefetches */
 998   5,                                    /* Branch cost */
       /* x87 floating-point instruction costs.  */
 999   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1000   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1001   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
1002   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1003   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1004   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1005
       /* Scalar SSE instruction costs.  */
1006   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1007   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1008   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1009   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1010   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
1011   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
1012   /* 11-16.  NOTE(review): presumably the DIVSS latency range in cycles -- confirm.  */
1013   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
1014   COSTS_N_INSNS (24),                   /* cost of DIVSD instruction.  */
1015   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1016   COSTS_N_INSNS (19),                   /* cost of SQRTSD instruction.  */
1017   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
       /* String operation strategy tables defined just before this table.  */
1018   athlon_memcpy,
1019   athlon_memset,
1020   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1021   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1022   "16:8:8",                             /* Loop alignment.  */
1023   "16:8:8",                             /* Jump alignment.  */
1024   "0:0:8",                              /* Label alignment.  */
1025   "16",                                 /* Func alignment.  */
1026   4,                                    /* Small unroll limit.  */
1027   2,                                    /* Small unroll factor.  */
1028 };
1029
1030 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
1031    small blocks it is better to use a loop.  For large blocks, a libcall can
1032    do non-temporal accesses and beat inline code considerably.  */
1033 static stringop_algs k8_memcpy[2] = {
       /* K8 memcpy strategy table with two slots.
          NOTE(review): presumably each {N, alg, flag} triple selects ALG for
          blocks of up to N bytes, -1 marks the catch-all entry, and the
          leading libcall is the choice when the size is unknown -- confirm
          against the stringop_algs declaration.  */
       /* Slot 0: loop at 6, unrolled_loop at 14, rep_prefix_4_byte at -1.  */
1034   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1035              {-1, rep_prefix_4_byte, false}}},
       /* Slot 1: loop at 16, rep_prefix_8_byte at 8192, libcall at -1.
          NOTE(review): the 8-byte REP prefix suggests this is the 64-bit
          slot -- confirm.  */
1036   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1037              {-1, libcall, false}}}};
1038 static stringop_algs k8_memset[2] = {
       /* K8 memset strategy table with two slots.
          NOTE(review): presumably each {N, alg, flag} triple selects ALG for
          blocks of up to N bytes and -1 marks the catch-all entry -- confirm
          against the stringop_algs declaration.  */
       /* Slot 0: loop at 8, unrolled_loop at 24, rep_prefix_4_byte at 2048,
          libcall at -1.  */
1039   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1040              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
       /* Slot 1: unrolled_loop at 48, rep_prefix_8_byte at 8192, libcall at -1.
          NOTE(review): the 8-byte REP prefix suggests this is the 64-bit
          slot -- confirm.  */
1041   {libcall, {{48, unrolled_loop, false},
1042              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1043 static const
1044 struct processor_costs k8_cost = {
     /* Tuning cost table for K8.  The initializer is positional: every value
        is tied to its field only by order, and the trailing comment on each
        line names that field, so keep order and comments in sync when
        editing.  Per the note at the top of this file, processor costs are
        expressed relative to an add.  */
1045   {
1046   /* Start of register allocator costs.  integer->integer move cost is 2. */
1047   4,                                 /* cost for loading QImode using movzbl */
1048   {3, 4, 3},                            /* cost of loading integer registers
1049                                            in QImode, HImode and SImode.
1050                                            Relative to reg-reg move (2).  */
1051   {3, 4, 3},                            /* cost of storing integer registers */
1052   4,                                    /* cost of reg,reg fld/fst */
1053   {4, 4, 12},                           /* cost of loading fp registers
1054                                            in SFmode, DFmode and XFmode */
1055   {6, 6, 8},                            /* cost of storing fp registers
1056                                            in SFmode, DFmode and XFmode */
1057   2,                                    /* cost of moving MMX register */
1058   {3, 3},                               /* cost of loading MMX registers
1059                                            in SImode and DImode */
1060   {4, 4},                               /* cost of storing MMX registers
1061                                            in SImode and DImode */
1062   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1063   {4, 3, 12, 12, 24},                   /* cost of loading SSE registers
1064                                            in 32,64,128,256 and 512-bit */
1065   {4, 4, 10, 10, 20},                   /* cost of storing SSE registers
1066                                            in 32,64,128,256 and 512-bit */
1067   5, 5,                         /* SSE->integer and integer->SSE moves */
1068   5, 5,                         /* mask->integer and integer->mask moves */
1069   {3, 4, 3},                            /* cost of loading mask register
1070                                            in QImode, HImode, SImode.  */
1071   {3, 4, 3},                            /* cost of storing mask register
1072                                            in QImode, HImode, SImode.  */
1073   2,                                    /* cost of moving mask register.  */
1074   /* End of register allocator costs.  */
1075   },
1076
1077   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1078   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1079   COSTS_N_INSNS (1),                    /* variable shift costs */
1080   COSTS_N_INSNS (1),                    /* constant shift costs */
1081   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1082    COSTS_N_INSNS (4),                   /*                               HI */
1083    COSTS_N_INSNS (3),                   /*                               SI */
1084    COSTS_N_INSNS (4),                   /*                               DI */
1085    COSTS_N_INSNS (5)},                  /*                            other */
1086   0,                                    /* cost of multiply per each bit set */
1087   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
1088    COSTS_N_INSNS (26),                  /*                          HI */
1089    COSTS_N_INSNS (42),                  /*                          SI */
1090    COSTS_N_INSNS (74),                  /*                          DI */
1091    COSTS_N_INSNS (74)},                 /*                          other */
1092   COSTS_N_INSNS (1),                    /* cost of movsx */
1093   COSTS_N_INSNS (1),                    /* cost of movzx */
1094   8,                                    /* "large" insn */
1095   9,                                    /* MOVE_RATIO */
1096   6,                                    /* CLEAR_RATIO */
       /* The integer and SSE load/store costs and the XMM/YMM/ZMM move cost
          that follow repeat the values from the register allocator section
          above; this second copy sits outside that sub-aggregate.  Keep the
          two copies consistent unless a difference is intended.  */
1097   {3, 4, 3},                            /* cost of loading integer registers
1098                                            in QImode, HImode and SImode.
1099                                            Relative to reg-reg move (2).  */
1100   {3, 4, 3},                            /* cost of storing integer registers */
1101   {4, 3, 12, 12, 24},                   /* cost of loading SSE register
1102                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1103   {4, 4, 10, 10, 20},                   /* cost of storing SSE register
1104                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1105   {4, 3, 12, 12, 24},                   /* cost of unaligned loads.  */
1106   {4, 4, 10, 10, 20},                   /* cost of unaligned stores.  */
1107   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1108   5,                                    /* cost of moving SSE register to integer.  */
1109   4, 4,                                 /* Gather load static, per_elt.  */
1110   4, 4,                                 /* Gather store static, per_elt.  */
1111   64,                                   /* size of l1 cache.  */
1112   512,                                  /* size of l2 cache.  */
1113   64,                                   /* size of prefetch block */
1114   /* New AMD processors never drop prefetches; if they cannot be performed
1115      immediately, they are queued.  We set number of simultaneous prefetches
1116      to a large constant to reflect this (it probably is not a good idea not
1117      to limit number of prefetches at all, as their execution also takes some
1118      time).  */
1119   100,                                  /* number of parallel prefetches */
1120   3,                                    /* Branch cost */
       /* x87 floating-point instruction costs.  */
1121   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1122   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1123   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1124   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1125   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1126   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1127
       /* Scalar SSE instruction costs.  */
1128   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1129   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1130   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1131   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1132   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
1133   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
1134   /* 11-16.  NOTE(review): presumably the DIVSS latency range in cycles -- confirm.  */
1135   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
1136   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
1137   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1138   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
1139   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
       /* String operation strategy tables defined just before this table.  */
1140   k8_memcpy,
1141   k8_memset,
1142   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1143   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1144   "16:8:8",                             /* Loop alignment.  */
1145   "16:8:8",                             /* Jump alignment.  */
1146   "0:0:8",                              /* Label alignment.  */
1147   "16",                                 /* Func alignment.  */
1148   4,                                    /* Small unroll limit.  */
1149   2,                                    /* Small unroll factor.  */
1150 };
1151
1152 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1153    very small blocks it is better to use a loop.  For large blocks, a libcall
1154    can do non-temporal accesses and beat inline code considerably.  */
1155 static stringop_algs amdfam10_memcpy[2] = {
       /* AMDFAM10 memcpy strategy table with two slots.
          NOTE(review): presumably each {N, alg, flag} triple selects ALG for
          blocks of up to N bytes, -1 marks the catch-all entry, and the
          leading libcall is the choice when the size is unknown -- confirm
          against the stringop_algs declaration.  */
       /* Slot 0: loop at 6, unrolled_loop at 14, rep_prefix_4_byte at -1.  */
1156   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1157              {-1, rep_prefix_4_byte, false}}},
       /* Slot 1: loop at 16, rep_prefix_8_byte at 8192, libcall at -1.
          NOTE(review): the 8-byte REP prefix suggests this is the 64-bit
          slot -- confirm.  */
1158   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1159              {-1, libcall, false}}}};
1160 static stringop_algs amdfam10_memset[2] = {
       /* AMDFAM10 memset strategy table with two slots.
          NOTE(review): presumably each {N, alg, flag} triple selects ALG for
          blocks of up to N bytes and -1 marks the catch-all entry -- confirm
          against the stringop_algs declaration.  */
       /* Slot 0: loop at 8, unrolled_loop at 24, rep_prefix_4_byte at 2048,
          libcall at -1.  */
1161   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1162              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
       /* Slot 1: unrolled_loop at 48, rep_prefix_8_byte at 8192, libcall at -1.
          NOTE(review): the 8-byte REP prefix suggests this is the 64-bit
          slot -- confirm.  */
1163   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1164              {-1, libcall, false}}}};
1165 struct processor_costs amdfam10_cost = {
     /* Tuning cost table for AMDFAM10.  The initializer is positional: every
        value is tied to its field only by order, and the trailing comment on
        each line names that field, so keep order and comments in sync when
        editing.  Per the note at the top of this file, processor costs are
        expressed relative to an add.
        NOTE(review): unlike the Athlon and K8 tables, which are declared
        "static const", this table has neither qualifier -- confirm whether
        that is intentional before changing it, since users of the symbol are
        not visible here.  */
1166   {
1167   /* Start of register allocator costs.  integer->integer move cost is 2. */
1168   4,                                 /* cost for loading QImode using movzbl */
1169   {3, 4, 3},                            /* cost of loading integer registers
1170                                            in QImode, HImode and SImode.
1171                                            Relative to reg-reg move (2).  */
1172   {3, 4, 3},                            /* cost of storing integer registers */
1173   4,                                    /* cost of reg,reg fld/fst */
1174   {4, 4, 12},                           /* cost of loading fp registers
1175                                            in SFmode, DFmode and XFmode */
1176   {6, 6, 8},                            /* cost of storing fp registers
1177                                            in SFmode, DFmode and XFmode */
1178   2,                                    /* cost of moving MMX register */
1179   {3, 3},                               /* cost of loading MMX registers
1180                                            in SImode and DImode */
1181   {4, 4},                               /* cost of storing MMX registers
1182                                            in SImode and DImode */
1183   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1184   {4, 4, 3, 6, 12},                     /* cost of loading SSE registers
1185                                            in 32,64,128,256 and 512-bit */
1186   {4, 4, 5, 10, 20},                    /* cost of storing SSE registers
1187                                            in 32,64,128,256 and 512-bit */
1188   3, 3,                         /* SSE->integer and integer->SSE moves */
1189   3, 3,                         /* mask->integer and integer->mask moves */
1190   {3, 4, 3},                            /* cost of loading mask register
1191                                            in QImode, HImode, SImode.  */
1192   {3, 4, 3},                            /* cost of storing mask register
1193                                            in QImode, HImode, SImode.  */
1194   2,                                    /* cost of moving mask register.  */
1195
       /* NOTE(review): the instruction notes below presumably justify the
          SSE<->integer move cost of 3 used in this table (they list 3 for
          AMDFAM10 against 4 for K8) -- confirm.  */
1196                                         /* On K8:
1197                                             MOVD reg64, xmmreg Double FSTORE 4
1198                                             MOVD reg32, xmmreg Double FSTORE 4
1199                                            On AMDFAM10:
1200                                             MOVD reg64, xmmreg Double FADD 3
1201                                                                1/1  1/1
1202                                             MOVD reg32, xmmreg Double FADD 3
1203                                                                1/1  1/1 */
1204   /* End of register allocator costs.  */
1205   },
1206
1207   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1208   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1209   COSTS_N_INSNS (1),                    /* variable shift costs */
1210   COSTS_N_INSNS (1),                    /* constant shift costs */
1211   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1212    COSTS_N_INSNS (4),                   /*                               HI */
1213    COSTS_N_INSNS (3),                   /*                               SI */
1214    COSTS_N_INSNS (4),                   /*                               DI */
1215    COSTS_N_INSNS (5)},                  /*                            other */
1216   0,                                    /* cost of multiply per each bit set */
1217   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1218    COSTS_N_INSNS (35),                  /*                          HI */
1219    COSTS_N_INSNS (51),                  /*                          SI */
1220    COSTS_N_INSNS (83),                  /*                          DI */
1221    COSTS_N_INSNS (83)},                 /*                          other */
1222   COSTS_N_INSNS (1),                    /* cost of movsx */
1223   COSTS_N_INSNS (1),                    /* cost of movzx */
1224   8,                                    /* "large" insn */
1225   9,                                    /* MOVE_RATIO */
1226   6,                                    /* CLEAR_RATIO */
       /* The integer and SSE load/store costs and the XMM/YMM/ZMM move cost
          that follow repeat the values from the register allocator section
          above; this second copy sits outside that sub-aggregate.  */
1227   {3, 4, 3},                            /* cost of loading integer registers
1228                                            in QImode, HImode and SImode.
1229                                            Relative to reg-reg move (2).  */
1230   {3, 4, 3},                            /* cost of storing integer registers */
1231   {4, 4, 3, 6, 12},                     /* cost of loading SSE register
1232                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1233   {4, 4, 5, 10, 20},                    /* cost of storing SSE register
1234                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
       /* NOTE(review): the 256-bit unaligned load cost (7) is the only entry
          that differs from the aligned load costs above (6) -- confirm it is
          intended.  */
1235   {4, 4, 3, 7, 12},                     /* cost of unaligned loads.  */
1236   {4, 4, 5, 10, 20},                    /* cost of unaligned stores.  */
1237   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1238   3,                                    /* cost of moving SSE register to integer.  */
1239   4, 4,                                 /* Gather load static, per_elt.  */
1240   4, 4,                                 /* Gather store static, per_elt.  */
1241   64,                                   /* size of l1 cache.  */
1242   512,                                  /* size of l2 cache.  */
1243   64,                                   /* size of prefetch block */
1244   /* New AMD processors never drop prefetches; if they cannot be performed
1245      immediately, they are queued.  We set number of simultaneous prefetches
1246      to a large constant to reflect this (it probably is not a good idea not
1247      to limit number of prefetches at all, as their execution also takes some
1248      time).  */
1249   100,                                  /* number of parallel prefetches */
1250   2,                                    /* Branch cost */
       /* x87 floating-point instruction costs.  */
1251   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1252   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1253   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1254   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1255   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1256   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1257
       /* Scalar SSE instruction costs.  */
1258   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1259   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1260   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1261   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1262   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
1263   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
1264   /* 11-16.  NOTE(review): presumably the DIVSS latency range in cycles -- confirm.  */
1265   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
1266   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
1267   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1268   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
1269   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
       /* String operation strategy tables defined just before this table.  */
1270   amdfam10_memcpy,
1271   amdfam10_memset,
1272   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1273   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1274   "32:25:8",                            /* Loop alignment.  */
1275   "32:8:8",                             /* Jump alignment.  */
1276   "0:0:8",                              /* Label alignment.  */
1277   "32",                                 /* Func alignment.  */
1278   4,                                    /* Small unroll limit.  */
1279   2,                                    /* Small unroll factor.  */
1280 };
1281
1282 /* BDVER has an optimized REP instruction for medium-sized blocks, but for
1283    very small blocks it is better to use a loop.  For large blocks, a libcall
1284    can do non-temporal accesses and beat inline code considerably.  */
1285 static stringop_algs bdver_memcpy[2] = {
       /* BDVER memcpy strategy table with two slots.
          NOTE(review): presumably each {N, alg, flag} triple selects ALG for
          blocks of up to N bytes, -1 marks the catch-all entry, and the
          leading libcall is the choice when the size is unknown -- confirm
          against the stringop_algs declaration.  */
       /* Slot 0: loop at 6, unrolled_loop at 14, rep_prefix_4_byte at -1.  */
1286   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1287              {-1, rep_prefix_4_byte, false}}},
       /* Slot 1: loop at 16, rep_prefix_8_byte at 8192, libcall at -1.
          NOTE(review): the 8-byte REP prefix suggests this is the 64-bit
          slot -- confirm.  */
1288   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1289              {-1, libcall, false}}}};
1290 static stringop_algs bdver_memset[2] = {
       /* BDVER memset strategy table with two slots.
          NOTE(review): presumably each {N, alg, flag} triple selects ALG for
          blocks of up to N bytes and -1 marks the catch-all entry -- confirm
          against the stringop_algs declaration.  */
       /* Slot 0: loop at 8, unrolled_loop at 24, rep_prefix_4_byte at 2048,
          libcall at -1.  */
1291   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1292              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
       /* Slot 1: unrolled_loop at 48, rep_prefix_8_byte at 8192, libcall at -1.
          NOTE(review): the 8-byte REP prefix suggests this is the 64-bit
          slot -- confirm.  */
1293   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1294              {-1, libcall, false}}}};
1295
1296 const struct processor_costs bdver_cost = {
1297   {
1298   /* Start of register allocator costs.  integer->integer move cost is 2. */
1299   8,                                 /* cost for loading QImode using movzbl */
1300   {8, 8, 8},                            /* cost of loading integer registers
1301                                            in QImode, HImode and SImode.
1302                                            Relative to reg-reg move (2).  */
1303   {8, 8, 8},                            /* cost of storing integer registers */
1304   4,                                    /* cost of reg,reg fld/fst */
1305   {12, 12, 28},                         /* cost of loading fp registers
1306                                            in SFmode, DFmode and XFmode */
1307   {10, 10, 18},                         /* cost of storing fp registers
1308                                            in SFmode, DFmode and XFmode */
1309   4,                                    /* cost of moving MMX register */
1310   {12, 12},                             /* cost of loading MMX registers
1311                                            in SImode and DImode */
1312   {10, 10},                             /* cost of storing MMX registers
1313                                            in SImode and DImode */
1314   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1315   {12, 12, 10, 40, 60},                 /* cost of loading SSE registers
1316                                            in 32,64,128,256 and 512-bit */
1317   {10, 10, 10, 40, 60},                 /* cost of storing SSE registers
1318                                            in 32,64,128,256 and 512-bit */
1319   16, 20,                               /* SSE->integer and integer->SSE moves */
1320   16, 20,                               /* mask->integer and integer->mask moves */
1321   {8, 8, 8},                            /* cost of loading mask register
1322                                            in QImode, HImode, SImode.  */
1323   {8, 8, 8},                            /* cost if storing mask register
1324                                            in QImode, HImode, SImode.  */
1325   2,                                    /* cost of moving mask register.  */
1326   /* End of register allocator costs.  */
1327   },
1328
1329   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1330   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1331   COSTS_N_INSNS (1),                    /* variable shift costs */
1332   COSTS_N_INSNS (1),                    /* constant shift costs */
1333   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1334    COSTS_N_INSNS (4),                   /*                               HI */
1335    COSTS_N_INSNS (4),                   /*                               SI */
1336    COSTS_N_INSNS (6),                   /*                               DI */
1337    COSTS_N_INSNS (6)},                  /*                            other */
1338   0,                                    /* cost of multiply per each bit set */
1339   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1340    COSTS_N_INSNS (35),                  /*                          HI */
1341    COSTS_N_INSNS (51),                  /*                          SI */
1342    COSTS_N_INSNS (83),                  /*                          DI */
1343    COSTS_N_INSNS (83)},                 /*                          other */
1344   COSTS_N_INSNS (1),                    /* cost of movsx */
1345   COSTS_N_INSNS (1),                    /* cost of movzx */
1346   8,                                    /* "large" insn */
1347   9,                                    /* MOVE_RATIO */
1348   6,                                    /* CLEAR_RATIO */
1349   {8, 8, 8},                            /* cost of loading integer registers
1350                                            in QImode, HImode and SImode.
1351                                            Relative to reg-reg move (2).  */
1352   {8, 8, 8},                            /* cost of storing integer registers */
1353   {12, 12, 10, 40, 60},                 /* cost of loading SSE register
1354                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1355   {10, 10, 10, 40, 60},                 /* cost of storing SSE register
1356                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1357   {12, 12, 10, 40, 60},                 /* cost of unaligned loads.  */
1358   {10, 10, 10, 40, 60},                 /* cost of unaligned stores.  */
1359   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1360   16,                                   /* cost of moving SSE register to integer.  */
1361   12, 12,                               /* Gather load static, per_elt.  */
1362   10, 10,                               /* Gather store static, per_elt.  */
1363   16,                                   /* size of l1 cache.  */
1364   2048,                                 /* size of l2 cache.  */
1365   64,                                   /* size of prefetch block */
1366   /* New AMD processors never drop prefetches; if they cannot be performed
1367      immediately, they are queued.  We set number of simultaneous prefetches
1368      to a large constant to reflect this (it probably is not a good idea not
1369      to limit number of prefetches at all, as their execution also takes some
1370      time).  */
1371   100,                                  /* number of parallel prefetches */
1372   2,                                    /* Branch cost */
1373   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1374   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1375   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1376   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1377   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1378   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1379
1380   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1381   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1382   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1383   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1384   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1385   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1386   /* 9-24  */
1387   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1388   /* 9-27  */
1389   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1390   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1391   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1392   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1393   bdver_memcpy,
1394   bdver_memset,
1395   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1396   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1397   "16:11:8",                            /* Loop alignment.  */
1398   "16:8:8",                             /* Jump alignment.  */
1399   "0:0:8",                              /* Label alignment.  */
1400   "11",                                 /* Func alignment.  */
1401   4,                                    /* Small unroll limit.  */
1402   2,                                    /* Small unroll factor.  */
1403 };
1404
1405
1406 /*  ZNVER1 has optimized REP instruction for medium sized blocks, but for
1407     very small blocks it is better to use loop.  For large blocks, libcall
1408     can do non-temporal accesses and beat inline considerably.  */
1409 static stringop_algs znver1_memcpy[2] = {  /* memcpy algorithm table used by znver1_cost: [0] 32-bit, [1] 64-bit.  */
1410   /* 32-bit tuning.  Only loop, unrolled_loop and libcall are listed; no rep_prefix_* entry.  */
1411   {libcall, {{6, loop, false},  /* presumably {max size, algorithm, noalign}, -1 being the catch-all -- TODO confirm in stringop_algs.  */
1412              {14, unrolled_loop, false},
1413              {-1, libcall, false}}},
1414   /* 64-bit tuning.  rep_prefix_8_byte covers the middle range (threshold 128).  */
1415   {libcall, {{16, loop, false},
1416              {128, rep_prefix_8_byte, false},
1417              {-1, libcall, false}}}};
1418 static stringop_algs znver1_memset[2] = {  /* memset algorithm table used by znver1_cost: [0] 32-bit, [1] 64-bit.  */
1419   /* 32-bit tuning.  loop, unrolled_loop, rep_prefix_4_byte, then libcall.  */
1420   {libcall, {{8, loop, false},  /* presumably {max size, algorithm, noalign}, -1 being the catch-all -- TODO confirm in stringop_algs.  */
1421              {24, unrolled_loop, false},
1422              {128, rep_prefix_4_byte, false},
1423              {-1, libcall, false}}},
1424   /* 64-bit tuning.  Starts with unrolled_loop (threshold 48); no plain loop entry.  */
1425   {libcall, {{48, unrolled_loop, false},
1426              {128, rep_prefix_8_byte, false},
1427              {-1, libcall, false}}}};
1428 struct processor_costs znver1_cost = {  /* costs for tuning for ZNVER1 */
1429   {
1430   /* Start of register allocator costs.  integer->integer move cost is 2. */
1431
1432   /* reg-reg moves are done by renaming and thus they are even cheaper than
1433      1 cycle.  Because reg-reg move cost is 2 and the following tables correspond
1434      to doubles of latencies, we do not model this correctly.  It does not
1435      seem to make practical difference to bump prices up even more.  */
1436   6,                                    /* cost for loading QImode using
1437                                            movzbl.  */
1438   {6, 6, 6},                            /* cost of loading integer registers
1439                                            in QImode, HImode and SImode.
1440                                            Relative to reg-reg move (2).  */
1441   {8, 8, 8},                            /* cost of storing integer
1442                                            registers.  */
1443   2,                                    /* cost of reg,reg fld/fst.  */
1444   {6, 6, 16},                           /* cost of loading fp registers
1445                                            in SFmode, DFmode and XFmode.  */
1446   {8, 8, 16},                           /* cost of storing fp registers
1447                                            in SFmode, DFmode and XFmode.  */
1448   2,                                    /* cost of moving MMX register.  */
1449   {6, 6},                               /* cost of loading MMX registers
1450                                            in SImode and DImode.  */
1451   {8, 8},                               /* cost of storing MMX registers
1452                                            in SImode and DImode.  */
1453   2, 3, 6,                              /* cost of moving XMM,YMM,ZMM register.  */
1454   {6, 6, 6, 12, 24},                    /* cost of loading SSE registers
1455                                            in 32,64,128,256 and 512-bit.  */
1456   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
1457                                            in 32,64,128,256 and 512-bit.  */
1458   6, 6,                         /* SSE->integer and integer->SSE moves.  */
1459   8, 8,                         /* mask->integer and integer->mask moves */
1460   {6, 6, 6},                            /* cost of loading mask register
1461                                            in QImode, HImode, SImode.  */
1462   {8, 8, 8},                            /* cost of storing mask register
1463                                            in QImode, HImode, SImode.  */
1464   2,                                    /* cost of moving mask register.  */
1465   /* End of register allocator costs.  */
1466   },
1467
1468   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1469   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1470   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1471   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1472   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1473    COSTS_N_INSNS (3),                   /*                               HI.  */
1474    COSTS_N_INSNS (3),                   /*                               SI.  */
1475    COSTS_N_INSNS (3),                   /*                               DI.  */
1476    COSTS_N_INSNS (3)},                  /*                            other.  */
1477   0,                                    /* cost of multiply per each bit
1478                                            set.  */
1479   /* Depending on parameters, idiv can get faster on ryzen.  This is an upper
1480      bound.  */
1481   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI.  */
1482    COSTS_N_INSNS (22),                  /*                          HI.  */
1483    COSTS_N_INSNS (30),                  /*                          SI.  */
1484    COSTS_N_INSNS (45),                  /*                          DI.  */
1485    COSTS_N_INSNS (45)},                 /*                          other.  */
1486   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1487   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1488   8,                                    /* "large" insn.  */
1489   9,                                    /* MOVE_RATIO.  */
1490   6,                                    /* CLEAR_RATIO.  */
1491   {6, 6, 6},                            /* cost of loading integer registers
1492                                            in QImode, HImode and SImode.
1493                                            Relative to reg-reg move (2).  */
1494   {8, 8, 8},                            /* cost of storing integer
1495                                            registers.  */
1496   {6, 6, 6, 12, 24},                    /* cost of loading SSE register
1497                                            in 32bit, 64bit, 128bit, 256bit and 512bit; same values as in the register allocator section.  */
1498   {8, 8, 8, 16, 32},                    /* cost of storing SSE register
1499                                            in 32bit, 64bit, 128bit, 256bit and 512bit; same values as in the register allocator section.  */
1500   {6, 6, 6, 12, 24},                    /* cost of unaligned loads.  */
1501   {8, 8, 8, 16, 32},                    /* cost of unaligned stores.  */
1502   2, 3, 6,                              /* cost of moving XMM,YMM,ZMM register.  */
1503   6,                                    /* cost of moving SSE register to integer.  */
1504   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
1505      throughput 12.  Approx 9 uops do not depend on vector size and every load
1506      is 7 uops.  NOTE(review): the second mnemonic used to repeat VGATHERDPD; VGATHERDPS is presumed by analogy with the znver3_cost comment -- confirm.  */
1507   18, 8,                                /* Gather load static, per_elt.  */
1508   18, 10,                               /* Gather store static, per_elt.  */
1509   32,                                   /* size of l1 cache.  */
1510   512,                                  /* size of l2 cache.  */
1511   64,                                   /* size of prefetch block.  */
1512   /* New AMD processors never drop prefetches; if they cannot be performed
1513      immediately, they are queued.  We set number of simultaneous prefetches
1514      to a large constant to reflect this (it probably is not a good idea not
1515      to limit number of prefetches at all, as their execution also takes some
1516      time).  */
1517   100,                                  /* number of parallel prefetches.  */
1518   3,                                    /* Branch cost.  */
1519   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1520   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1521   /* Latency of fdiv is 8-15.  */
1522   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1523   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1524   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1525   /* Latency of fsqrt is 4-10.  */
1526   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1527
1528   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1529   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1530   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1531   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1532   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1533   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1534   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1535   /* 9-13.  */
1536   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1537   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1538   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1539   /* Zen can execute 4 integer operations per cycle.  FP operations take 3 cycles
1540      and it can execute 2 integer additions and 2 multiplications thus
1541      reassociation may make sense up to width of 6.  SPEC2k6 benchmarks suggest
1542      that 4 works better than 6 probably due to register pressure.
1543
1544      Integer vector operations are taken by FP unit and execute 3 vector
1545      plus/minus operations per cycle but only one multiply.  This is adjusted
1546      in ix86_reassociation_width.  */
1547   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1548   znver1_memcpy,
1549   znver1_memset,
1550   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1551   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1552   "16",                                 /* Loop alignment.  */
1553   "16",                                 /* Jump alignment.  */
1554   "0:0:8",                              /* Label alignment.  */
1555   "16",                                 /* Func alignment.  */
1556   4,                                    /* Small unroll limit.  */
1557   2,                                    /* Small unroll factor.  */
1558 };
1559
1560 /*  ZNVER2 has optimized REP instruction for medium sized blocks, but for
1561     very small blocks it is better to use loop.  For large blocks, libcall
1562     can do non-temporal accesses and beat inline considerably.  */
1563 static stringop_algs znver2_memcpy[2] = {  /* memcpy algorithm table used by znver2_cost and znver3_cost: [0] 32-bit, [1] 64-bit.  */
1564   /* 32-bit tuning.  Only loop, unrolled_loop and libcall are listed; no rep_prefix_* entry.  */
1565   {libcall, {{6, loop, false},  /* presumably {max size, algorithm, noalign}, -1 being the catch-all -- TODO confirm in stringop_algs.  */
1566              {14, unrolled_loop, false},
1567              {-1, libcall, false}}},
1568   /* 64-bit tuning.  rep_prefix_4_byte covers the middle range (threshold 64).  */
1569   {libcall, {{16, loop, false},
1570              {64, rep_prefix_4_byte, false},
1571              {-1, libcall, false}}}};
1572 static stringop_algs znver2_memset[2] = {  /* memset algorithm table used by znver2_cost and znver3_cost: [0] 32-bit, [1] 64-bit.  */
1573   /* 32-bit tuning.  loop, unrolled_loop, rep_prefix_4_byte, then libcall.  */
1574   {libcall, {{8, loop, false},  /* presumably {max size, algorithm, noalign}, -1 being the catch-all -- TODO confirm in stringop_algs.  */
1575              {24, unrolled_loop, false},
1576              {128, rep_prefix_4_byte, false},
1577              {-1, libcall, false}}},
1578   /* 64-bit tuning.  Only rep_prefix_4_byte, rep_prefix_8_byte and libcall are listed; no loop entry.  */
1579   {libcall, {{24, rep_prefix_4_byte, false},
1580              {128, rep_prefix_8_byte, false},
1581              {-1, libcall, false}}}};
1582
1583 struct processor_costs znver2_cost = {  /* costs for tuning for ZNVER2 */
1584   {
1585   /* Start of register allocator costs.  integer->integer move cost is 2. */
1586
1587   /* reg-reg moves are done by renaming and thus they are even cheaper than
1588      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1589      to doubles of latencies, we do not model this correctly.  It does not
1590      seem to make practical difference to bump prices up even more.  */
1591   6,                                    /* cost for loading QImode using
1592                                            movzbl.  */
1593   {6, 6, 6},                            /* cost of loading integer registers
1594                                            in QImode, HImode and SImode.
1595                                            Relative to reg-reg move (2).  */
1596   {8, 8, 8},                            /* cost of storing integer
1597                                            registers.  */
1598   2,                                    /* cost of reg,reg fld/fst.  */
1599   {6, 6, 16},                           /* cost of loading fp registers
1600                                            in SFmode, DFmode and XFmode.  */
1601   {8, 8, 16},                           /* cost of storing fp registers
1602                                            in SFmode, DFmode and XFmode.  */
1603   2,                                    /* cost of moving MMX register.  */
1604   {6, 6},                               /* cost of loading MMX registers
1605                                            in SImode and DImode.  */
1606   {8, 8},                               /* cost of storing MMX registers
1607                                            in SImode and DImode.  */
1608   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1609                                            register.  */
1610   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1611                                            in 32,64,128,256 and 512-bit.  */
1612   {8, 8, 8, 8, 16},                     /* cost of storing SSE registers
1613                                            in 32,64,128,256 and 512-bit.  */
1614   6, 6,                                 /* SSE->integer and integer->SSE
1615                                            moves.  */
1616   8, 8,                         /* mask->integer and integer->mask moves */
1617   {6, 6, 6},                            /* cost of loading mask register
1618                                            in QImode, HImode, SImode.  */
1619   {8, 8, 8},                            /* cost of storing mask register
1620                                            in QImode, HImode, SImode.  */
1621   2,                                    /* cost of moving mask register.  */
1622   /* End of register allocator costs.  */
1623   },
1624
1625   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1626   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1627   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1628   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1629   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1630    COSTS_N_INSNS (3),                   /*                               HI.  */
1631    COSTS_N_INSNS (3),                   /*                               SI.  */
1632    COSTS_N_INSNS (3),                   /*                               DI.  */
1633    COSTS_N_INSNS (3)},                  /*                            other.  */
1634   0,                                    /* cost of multiply per each bit
1635                                            set.  */
1636   /* Depending on parameters, idiv can get faster on ryzen.  This is an upper
1637      bound.  */
1638   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI.  */
1639    COSTS_N_INSNS (22),                  /*                          HI.  */
1640    COSTS_N_INSNS (30),                  /*                          SI.  */
1641    COSTS_N_INSNS (45),                  /*                          DI.  */
1642    COSTS_N_INSNS (45)},                 /*                          other.  */
1643   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1644   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1645   8,                                    /* "large" insn.  */
1646   9,                                    /* MOVE_RATIO.  */
1647   6,                                    /* CLEAR_RATIO.  */
1648   {6, 6, 6},                            /* cost of loading integer registers
1649                                            in QImode, HImode and SImode.
1650                                            Relative to reg-reg move (2).  */
1651   {8, 8, 8},                            /* cost of storing integer
1652                                            registers.  */
1653   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1654                                            in 32bit, 64bit, 128bit, 256bit and 512bit; same values as in the register allocator section.  */
1655   {8, 8, 8, 8, 16},                     /* cost of storing SSE register
1656                                            in 32bit, 64bit, 128bit, 256bit and 512bit; same values as in the register allocator section.  */
1657   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
1658   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
1659   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1660                                            register.  */
1661   6,                                    /* cost of moving SSE register to integer.  */
1662   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
1663      throughput 12.  Approx 9 uops do not depend on vector size and every load
1664      is 7 uops.  NOTE(review): the second mnemonic used to repeat VGATHERDPD; VGATHERDPS is presumed by analogy with the znver3_cost comment -- confirm.  */
1665   18, 8,                                /* Gather load static, per_elt.  */
1666   18, 10,                               /* Gather store static, per_elt.  */
1667   32,                                   /* size of l1 cache.  */
1668   512,                                  /* size of l2 cache.  */
1669   64,                                   /* size of prefetch block.  */
1670   /* New AMD processors never drop prefetches; if they cannot be performed
1671      immediately, they are queued.  We set number of simultaneous prefetches
1672      to a large constant to reflect this (it probably is not a good idea not
1673      to limit number of prefetches at all, as their execution also takes some
1674      time).  */
1675   100,                                  /* number of parallel prefetches.  */
1676   3,                                    /* Branch cost.  */
1677   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1678   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1679   /* Latency of fdiv is 8-15.  */
1680   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1681   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1682   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1683   /* Latency of fsqrt is 4-10.  */
1684   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1685
1686   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1687   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1688   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1689   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
1690   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1691   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1692   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1693   /* 9-13.  */
1694   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1695   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1696   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1697   /* Zen can execute 4 integer operations per cycle.  FP operations
1698      take 3 cycles and it can execute 2 integer additions and 2
1699      multiplications thus reassociation may make sense up to width of 6.
1700      SPEC2k6 benchmarks suggest
1701      that 4 works better than 6 probably due to register pressure.
1702
1703      Integer vector operations are taken by FP unit and execute 3 vector
1704      plus/minus operations per cycle but only one multiply.  This is adjusted
1705      in ix86_reassociation_width.  */
1706   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1707   znver2_memcpy,
1708   znver2_memset,
1709   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1710   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1711   "16",                                 /* Loop alignment.  */
1712   "16",                                 /* Jump alignment.  */
1713   "0:0:8",                              /* Label alignment.  */
1714   "16",                                 /* Func alignment.  */
1715   4,                                    /* Small unroll limit.  */
1716   2,                                    /* Small unroll factor.  */
1717 };
1718
1719 struct processor_costs znver3_cost = {  /* costs for tuning for ZNVER3 */
1720   {
1721   /* Start of register allocator costs.  integer->integer move cost is 2. */
1722
1723   /* reg-reg moves are done by renaming and thus they are even cheaper than
1724      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1725      to doubles of latencies, we do not model this correctly.  It does not
1726      seem to make practical difference to bump prices up even more.  */
1727   6,                                    /* cost for loading QImode using
1728                                            movzbl.  */
1729   {6, 6, 6},                            /* cost of loading integer registers
1730                                            in QImode, HImode and SImode.
1731                                            Relative to reg-reg move (2).  */
1732   {8, 8, 8},                            /* cost of storing integer
1733                                            registers.  */
1734   2,                                    /* cost of reg,reg fld/fst.  */
1735   {6, 6, 16},                           /* cost of loading fp registers
1736                                            in SFmode, DFmode and XFmode.  */
1737   {8, 8, 16},                           /* cost of storing fp registers
1738                                            in SFmode, DFmode and XFmode.  */
1739   2,                                    /* cost of moving MMX register.  */
1740   {6, 6},                               /* cost of loading MMX registers
1741                                            in SImode and DImode.  */
1742   {8, 8},                               /* cost of storing MMX registers
1743                                            in SImode and DImode.  */
1744   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1745                                            register.  */
1746   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1747                                            in 32,64,128,256 and 512-bit.  */
1748   {8, 8, 8, 8, 16},                     /* cost of storing SSE registers
1749                                            in 32,64,128,256 and 512-bit.  */
1750   6, 6,                                 /* SSE->integer and integer->SSE
1751                                            moves.  */
1752   8, 8,                         /* mask->integer and integer->mask moves */
1753   {6, 6, 6},                            /* cost of loading mask register
1754                                            in QImode, HImode, SImode.  */
1755   {8, 8, 8},                            /* cost of storing mask register
1756                                            in QImode, HImode, SImode.  */
1757   2,                                    /* cost of moving mask register.  */
1758   /* End of register allocator costs.  */
1759   },
1760
1761   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1762   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1763   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1764   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1765   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1766    COSTS_N_INSNS (3),                   /*                               HI.  */
1767    COSTS_N_INSNS (3),                   /*                               SI.  */
1768    COSTS_N_INSNS (3),                   /*                               DI.  */
1769    COSTS_N_INSNS (3)},                  /*                            other.  */
1770   0,                                    /* cost of multiply per each bit
1771                                            set.  */
1772   {COSTS_N_INSNS (9),                   /* cost of a divide/mod for QI.  */
1773    COSTS_N_INSNS (10),                  /*                          HI.  */
1774    COSTS_N_INSNS (12),                  /*                          SI.  */
1775    COSTS_N_INSNS (17),                  /*                          DI.  */
1776    COSTS_N_INSNS (17)},                 /*                          other.  */
1777   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1778   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1779   8,                                    /* "large" insn.  */
1780   9,                                    /* MOVE_RATIO.  */
1781   6,                                    /* CLEAR_RATIO.  */
1782   {6, 6, 6},                            /* cost of loading integer registers
1783                                            in QImode, HImode and SImode.
1784                                            Relative to reg-reg move (2).  */
1785   {8, 8, 8},                            /* cost of storing integer
1786                                            registers.  */
1787   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1788                                            in 32bit, 64bit, 128bit, 256bit and 512bit; same values as in the register allocator section.  */
1789   {8, 8, 8, 8, 16},                     /* cost of storing SSE register
1790                                            in 32bit, 64bit, 128bit, 256bit and 512bit; same values as in the register allocator section.  */
1791   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
1792   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
1793   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1794                                            register.  */
1795   6,                                    /* cost of moving SSE register to integer.  */
1796   /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
1797      throughput 9.  Approx 7 uops do not depend on vector size and every load
1798      is 4 uops.  */
1799   14, 8,                                /* Gather load static, per_elt.  */
1800   14, 10,                               /* Gather store static, per_elt.  */
1801   32,                                   /* size of l1 cache.  */
1802   512,                                  /* size of l2 cache.  */
1803   64,                                   /* size of prefetch block.  */
1804   /* New AMD processors never drop prefetches; if they cannot be performed
1805      immediately, they are queued.  We set number of simultaneous prefetches
1806      to a large constant to reflect this (it probably is not a good idea not
1807      to limit number of prefetches at all, as their execution also takes some
1808      time).  */
1809   100,                                  /* number of parallel prefetches.  */
1810   3,                                    /* Branch cost.  */
1811   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1812   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1813   /* Latency of fdiv is 8-15.  */
1814   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1815   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1816   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1817   /* Latency of fsqrt is 4-10.  */
1818   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1819
1820   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1821   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1822   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1823   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
1824   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1825   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1826   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1827   /* 9-13.  */
1828   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1829   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1830   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1831   /* Zen can execute 4 integer operations per cycle.  FP operations
1832      take 3 cycles and it can execute 2 integer additions and 2
1833      multiplications thus reassociation may make sense up to width of 6.
1834      SPEC2k6 benchmarks suggest
1835      that 4 works better than 6 probably due to register pressure.
1836
1837      Integer vector operations are taken by FP unit and execute 3 vector
1838      plus/minus operations per cycle but only one multiply.  This is adjusted
1839      in ix86_reassociation_width.  */
1840   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1841   znver2_memcpy,                        /* shares the znver2 memcpy table.  */
1842   znver2_memset,                        /* shares the znver2 memset table.  */
1843   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1844   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1845   "16",                                 /* Loop alignment.  */
1846   "16",                                 /* Jump alignment.  */
1847   "0:0:8",                              /* Label alignment.  */
1848   "16",                                 /* Func alignment.  */
1849   4,                                    /* Small unroll limit.  */
1850   2,                                    /* Small unroll factor.  */
1851 };
1852
1853 /* This table is derived from the znver3_cost table; it is not necessarily an exact copy.  */
1854 struct processor_costs znver4_cost = {
1855   {
1856   /* Start of register allocator costs.  integer->integer move cost is 2.  */
1857
1858   /* reg-reg moves are done by renaming and thus they are even cheaper than
1859      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1860      to doubles of latencies, we do not model this correctly.  It does not
1861      seem to make practical difference to bump prices up even more.  */
1862   6,                                    /* cost for loading QImode using
1863                                            movzbl.  */
1864   {6, 6, 6},                            /* cost of loading integer registers
1865                                            in QImode, HImode and SImode.
1866                                            Relative to reg-reg move (2).  */
1867   {8, 8, 8},                            /* cost of storing integer
1868                                            registers.  */
1869   2,                                    /* cost of reg,reg fld/fst.  */
1870   {14, 14, 17},                         /* cost of loading fp registers
1871                                            in SFmode, DFmode and XFmode.  */
1872   {12, 12, 16},                         /* cost of storing fp registers
1873                                            in SFmode, DFmode and XFmode.  */
1874   2,                                    /* cost of moving MMX register.  */
1875   {6, 6},                               /* cost of loading MMX registers
1876                                            in SImode and DImode.  */
1877   {8, 8},                               /* cost of storing MMX registers
1878                                            in SImode and DImode.  */
1879   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1880                                            register.  */
1881   {6, 6, 10, 10, 12},                   /* cost of loading SSE registers
1882                                            in 32,64,128,256 and 512-bit.  */
1883   {8, 8, 8, 12, 12},                    /* cost of storing SSE registers
1884                                            in 32,64,128,256 and 512-bit.  */
1885   6, 8,                                 /* SSE->integer and integer->SSE
1886                                            moves.  */
1887   8, 8,                                 /* mask->integer and integer->mask moves.  */
1888   {6, 6, 6},                            /* cost of loading mask register
1889                                            in QImode, HImode, SImode.  */
1890   {8, 8, 8},                            /* cost of storing mask register
1891                                            in QImode, HImode, SImode.  */
1892   2,                                    /* cost of moving mask register.  */
1893   /* End of register allocator costs.  */
1894   },
1895
1896   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1897   /* TODO: Lea with 3 components has cost 2.  */
1898   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1899   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1900   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1901   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1902    COSTS_N_INSNS (3),                   /*                               HI.  */
1903    COSTS_N_INSNS (3),                   /*                               SI.  */
1904    COSTS_N_INSNS (3),                   /*                               DI.  */
1905    COSTS_N_INSNS (3)},                  /*                      other.  */
1906   0,                                    /* cost of multiply per each bit
1907                                            set.  */
1908   {COSTS_N_INSNS (12),                  /* cost of a divide/mod for QI.  */
1909    COSTS_N_INSNS (13),                  /*                          HI.  */
1910    COSTS_N_INSNS (13),                  /*                          SI.  */
1911    COSTS_N_INSNS (18),                  /*                          DI.  */
1912    COSTS_N_INSNS (18)},                 /*                          other.  */
1913   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1914   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1915   8,                                    /* "large" insn.  */
1916   9,                                    /* MOVE_RATIO.  */
1917   6,                                    /* CLEAR_RATIO.  */
1918   {6, 6, 6},                            /* cost of loading integer registers
1919                                            in QImode, HImode and SImode.
1920                                            Relative to reg-reg move (2).  */
1921   {8, 8, 8},                            /* cost of storing integer
1922                                            registers.  */
1923   {6, 6, 10, 10, 12},                   /* cost of loading SSE registers
1924                                            in 32bit, 64bit, 128bit, 256bit and 512bit.  */
1925   {8, 8, 8, 12, 12},                    /* cost of storing SSE register
1926                                            in 32bit, 64bit, 128bit, 256bit and 512bit.  */
1927   {6, 6, 10, 10, 12},                   /* cost of unaligned loads.  */
1928   {8, 8, 8, 12, 12},                    /* cost of unaligned stores.  */
1929   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM
1930                                            register.  */
1931   6,                                    /* cost of moving SSE register to integer.  */
1932   /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
1933      throughput 5.  Approx 7 uops do not depend on vector size and every load
1934      is 5 uops.  */
1935   14, 10,                               /* Gather load static, per_elt.  */
1936   14, 20,                               /* Gather store static, per_elt.  */
1937   32,                                   /* size of l1 cache.  */
1938   1024,                                 /* size of l2 cache.  */
1939   64,                                   /* size of prefetch block.  */
1940   /* New AMD processors never drop prefetches; if they cannot be performed
1941      immediately, they are queued.  We set number of simultaneous prefetches
1942      to a large constant to reflect this (it probably is not a good idea not
1943      to limit number of prefetches at all, as their execution also takes some
1944      time).  */
1945   100,                                  /* number of parallel prefetches.  */
1946   3,                                    /* Branch cost.  */
1947   COSTS_N_INSNS (7),                    /* cost of FADD and FSUB insns.  */
1948   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
1949   /* Latency of fdiv is 8-15.  */
1950   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1951   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1952   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1953   /* Latency of fsqrt is 4-10.  */
1954   COSTS_N_INSNS (25),                   /* cost of FSQRT instruction.  */
1955
1956   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1957   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1958   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1959   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
1960   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
1961   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
1962   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
1963   /* 9-13.  */
1964   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1965   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1966   COSTS_N_INSNS (21),                   /* cost of SQRTSD instruction.  */
1967   /* Zen can execute 4 integer operations per cycle.  FP operations
1968      take 3 cycles and it can execute 2 integer additions and 2
1969      multiplications thus reassociation may make sense up to width of 6.
1970      SPEC2k6 benchmarks suggest
1971      that 4 works better than 6 probably due to register pressure.
1972
1973      Integer vector operations are taken by FP unit and execute 3 vector
1974      plus/minus operations per cycle but only one multiply.  This is adjusted
1975      in ix86_reassociation_width.  */
1976   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1977   znver2_memcpy,
1978   znver2_memset,
1979   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1980   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1981   "16",                                 /* Loop alignment.  */
1982   "16",                                 /* Jump alignment.  */
1983   "0:0:8",                              /* Label alignment.  */
1984   "16",                                 /* Func alignment.  */
1985   4,                                    /* Small unroll limit.  */
1986   2,                                    /* Small unroll factor.  */
1987 };
1988
1989 /* Based on the znver4_cost table; the integer divide/mod, DIVSS, SQRTSS and SQRTSD costs differ.  */
1990 struct processor_costs znver5_cost = {
1991   {
1992   /* Start of register allocator costs.  integer->integer move cost is 2.  */
1993
1994   /* reg-reg moves are done by renaming and thus they are even cheaper than
1995      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1996      to doubles of latencies, we do not model this correctly.  It does not
1997      seem to make practical difference to bump prices up even more.  */
1998   6,                                    /* cost for loading QImode using
1999                                            movzbl.  */
2000   {6, 6, 6},                            /* cost of loading integer registers
2001                                            in QImode, HImode and SImode.
2002                                            Relative to reg-reg move (2).  */
2003   {8, 8, 8},                            /* cost of storing integer
2004                                            registers.  */
2005   2,                                    /* cost of reg,reg fld/fst.  */
2006   {14, 14, 17},                         /* cost of loading fp registers
2007                                            in SFmode, DFmode and XFmode.  */
2008   {12, 12, 16},                         /* cost of storing fp registers
2009                                            in SFmode, DFmode and XFmode.  */
2010   2,                                    /* cost of moving MMX register.  */
2011   {6, 6},                               /* cost of loading MMX registers
2012                                            in SImode and DImode.  */
2013   {8, 8},                               /* cost of storing MMX registers
2014                                            in SImode and DImode.  */
2015   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
2016                                            register.  */
2017   {6, 6, 10, 10, 12},                   /* cost of loading SSE registers
2018                                            in 32,64,128,256 and 512-bit.  */
2019   {8, 8, 8, 12, 12},                    /* cost of storing SSE registers
2020                                            in 32,64,128,256 and 512-bit.  */
2021   6, 8,                                 /* SSE->integer and integer->SSE
2022                                            moves.  */
2023   8, 8,                                 /* mask->integer and integer->mask moves.  */
2024   {6, 6, 6},                            /* cost of loading mask register
2025                                            in QImode, HImode, SImode.  */
2026   {8, 8, 8},                            /* cost of storing mask register
2027                                            in QImode, HImode, SImode.  */
2028   2,                                    /* cost of moving mask register.  */
2029   /* End of register allocator costs.  */
2030   },
2031
2032   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
2033   /* TODO: Lea with 3 components has cost 2.  */
2034   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
2035   COSTS_N_INSNS (1),                    /* variable shift costs.  */
2036   COSTS_N_INSNS (1),                    /* constant shift costs.  */
2037   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
2038    COSTS_N_INSNS (3),                   /*                               HI.  */
2039    COSTS_N_INSNS (3),                   /*                               SI.  */
2040    COSTS_N_INSNS (3),                   /*                               DI.  */
2041    COSTS_N_INSNS (3)},                  /*                      other.  */
2042   0,                                    /* cost of multiply per each bit
2043                                            set.  */
2044   {COSTS_N_INSNS (10),                  /* cost of a divide/mod for QI.  */
2045    COSTS_N_INSNS (11),                  /*                          HI.  */
2046    COSTS_N_INSNS (13),                  /*                          SI.  */
2047    COSTS_N_INSNS (16),                  /*                          DI.  */
2048    COSTS_N_INSNS (16)},                 /*                          other.  */
2049   COSTS_N_INSNS (1),                    /* cost of movsx.  */
2050   COSTS_N_INSNS (1),                    /* cost of movzx.  */
2051   8,                                    /* "large" insn.  */
2052   9,                                    /* MOVE_RATIO.  */
2053   6,                                    /* CLEAR_RATIO.  */
2054   {6, 6, 6},                            /* cost of loading integer registers
2055                                            in QImode, HImode and SImode.
2056                                            Relative to reg-reg move (2).  */
2057   {8, 8, 8},                            /* cost of storing integer
2058                                            registers.  */
2059   {6, 6, 10, 10, 12},                   /* cost of loading SSE registers
2060                                            in 32bit, 64bit, 128bit, 256bit and 512bit.  */
2061   {8, 8, 8, 12, 12},                    /* cost of storing SSE register
2062                                            in 32bit, 64bit, 128bit, 256bit and 512bit.  */
2063   {6, 6, 10, 10, 12},                   /* cost of unaligned loads.  */
2064   {8, 8, 8, 12, 12},                    /* cost of unaligned stores.  */
2065   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM
2066                                            register.  */
2067   6,                                    /* cost of moving SSE register to integer.  */
2068   /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
2069      throughput 5.  Approx 7 uops do not depend on vector size and every load
2070      is 5 uops.  */
2071   14, 10,                               /* Gather load static, per_elt.  */
2072   14, 20,                               /* Gather store static, per_elt.  */
2073   32,                                   /* size of l1 cache.  */
2074   1024,                                 /* size of l2 cache.  */
2075   64,                                   /* size of prefetch block.  */
2076   /* New AMD processors never drop prefetches; if they cannot be performed
2077      immediately, they are queued.  We set number of simultaneous prefetches
2078      to a large constant to reflect this (it probably is not a good idea not
2079      to limit number of prefetches at all, as their execution also takes some
2080      time).  */
2081   100,                                  /* number of parallel prefetches.  */
2082   3,                                    /* Branch cost.  */
2083   COSTS_N_INSNS (7),                    /* cost of FADD and FSUB insns.  */
2084   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
2085   /* Latency of fdiv is 8-15.  */
2086   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
2087   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2088   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2089   /* Latency of fsqrt is 4-10.  */
2090   COSTS_N_INSNS (25),                   /* cost of FSQRT instruction.  */
2091
2092   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2093   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2094   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
2095   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
2096   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
2097   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
2098   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
2099   /* 9-13.  */
2100   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
2101   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
2102   COSTS_N_INSNS (20),                   /* cost of SQRTSD instruction.  */
2103   /* Zen can execute 4 integer operations per cycle.  FP operations
2104      take 3 cycles and it can execute 2 integer additions and 2
2105      multiplications thus reassociation may make sense up to width of 6.
2106      SPEC2k6 benchmarks suggest
2107      that 4 works better than 6 probably due to register pressure.
2108
2109      Integer vector operations are taken by FP unit and execute 3 vector
2110      plus/minus operations per cycle but only one multiply.  This is adjusted
2111      in ix86_reassociation_width.  */
2112   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
2113   znver2_memcpy,
2114   znver2_memset,
2115   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
2116   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
2117   "16",                                 /* Loop alignment.  */
2118   "16",                                 /* Jump alignment.  */
2119   "0:0:8",                              /* Label alignment.  */
2120   "16",                                 /* Func alignment.  */
2121   4,                                    /* Small unroll limit.  */
2122   2,                                    /* Small unroll factor.  */
2123 };
2124
2125 /* skylake_cost should produce code tuned for Skylake family of CPUs.  */
2126 static stringop_algs skylake_memcpy[2] =   {  /* memcpy strategy table referenced by skylake_cost.  */
2127   {libcall,                             /* Both array entries are identical.  */
2128    {{256, rep_prefix_1_byte, true},     /* NOTE(review): presumably {size limit, algorithm, flag} -- confirm against stringop_algs.  */
2129     {256, loop, false},
2130     {-1, libcall, false}}},
2131   {libcall,
2132    {{256, rep_prefix_1_byte, true},
2133     {256, loop, false},
2134     {-1, libcall, false}}}};
2135
2136 static stringop_algs skylake_memset[2] = {  /* memset strategy table referenced by skylake_cost.  */
2137   {libcall,                             /* Both array entries are identical.  */
2138    {{256, rep_prefix_1_byte, true},     /* NOTE(review): presumably {size limit, algorithm, flag} -- confirm against stringop_algs.  */
2139     {256, loop, false},
2140     {-1, libcall, false}}},
2141   {libcall,
2142    {{256, rep_prefix_1_byte, true},
2143     {256, loop, false},
2144     {-1, libcall, false}}}};
2145
2146 static const
2147 struct processor_costs skylake_cost = {  /* Cost table for Skylake tuning; uses skylake_memcpy and skylake_memset.  */
2148   {
2149   /* Start of register allocator costs.  integer->integer move cost is 2. */
2150   6,                                 /* cost for loading QImode using movzbl */
2151   {4, 4, 4},                            /* cost of loading integer registers
2152                                            in QImode, HImode and SImode.
2153                                            Relative to reg-reg move (2).  */
2154   {6, 6, 6},                            /* cost of storing integer registers */
2155   2,                                    /* cost of reg,reg fld/fst */
2156   {6, 6, 8},                            /* cost of loading fp registers
2157                                            in SFmode, DFmode and XFmode */
2158   {6, 6, 10},                           /* cost of storing fp registers
2159                                            in SFmode, DFmode and XFmode */
2160   2,                                    /* cost of moving MMX register */
2161   {6, 6},                               /* cost of loading MMX registers
2162                                            in SImode and DImode */
2163   {6, 6},                               /* cost of storing MMX registers
2164                                            in SImode and DImode */
2165   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2166   {6, 6, 6, 10, 20},                    /* cost of loading SSE registers
2167                                            in 32,64,128,256 and 512-bit */
2168   {8, 8, 8, 12, 24},                    /* cost of storing SSE registers
2169                                            in 32,64,128,256 and 512-bit */
2170   6, 6,                         /* SSE->integer and integer->SSE moves */
2171   6, 6,                         /* mask->integer and integer->mask moves */
2172   {8, 8, 8},                            /* cost of loading mask register
2173                                            in QImode, HImode, SImode.  */
2174   {6, 6, 6},                            /* cost of storing mask register
2175                                            in QImode, HImode, SImode.  */
2176   3,                                    /* cost of moving mask register.  */
2177   /* End of register allocator costs.  */
2178   },
2179
2180   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2181   COSTS_N_INSNS (1)+1,          /* cost of a lea instruction */
2182   COSTS_N_INSNS (1),                    /* variable shift costs */
2183   COSTS_N_INSNS (1),                    /* constant shift costs */
2184   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2185    COSTS_N_INSNS (3),                   /*                               HI */
2186    COSTS_N_INSNS (3),                   /*                               SI */
2187    COSTS_N_INSNS (3),                   /*                               DI */
2188    COSTS_N_INSNS (3)},                  /*                            other */
2189   0,                                    /* cost of multiply per each bit set */
2190   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2191      model is not realistic. We compensate by increasing the latencies a bit.  */
2192   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
2193    COSTS_N_INSNS (11),                  /*                          HI */
2194    COSTS_N_INSNS (14),                  /*                          SI */
2195    COSTS_N_INSNS (76),                  /*                          DI */
2196    COSTS_N_INSNS (76)},                 /*                          other */
2197   COSTS_N_INSNS (1),                    /* cost of movsx */
2198   COSTS_N_INSNS (0),                    /* cost of movzx */
2199   8,                                    /* "large" insn */
2200   17,                                   /* MOVE_RATIO */
2201   17,                                   /* CLEAR_RATIO */
2202   {6, 6, 6},                            /* cost of loading integer registers
2203                                            in QImode, HImode and SImode.
2204                                            Relative to reg-reg move (2).  */
2205   {8, 8, 8},                            /* cost of storing integer registers */
2206   {8, 8, 8, 8, 16},                     /* cost of loading SSE register
2207                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2208   {8, 8, 8, 8, 16},                     /* cost of storing SSE register
2209                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2210   {8, 8, 8, 8, 16},                     /* cost of unaligned loads.  */
2211   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
2212   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2213   6,                                    /* cost of moving SSE register to integer.  */
2214   20, 8,                                /* Gather load static, per_elt.  */
2215   22, 10,                               /* Gather store static, per_elt.  */
2216   64,                                   /* size of l1 cache.  */
2217   512,                                  /* size of l2 cache.  */
2218   64,                                   /* size of prefetch block */
2219   6,                                    /* number of parallel prefetches */
2220   3,                                    /* Branch cost */
2221   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2222   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
2223   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2224   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2225   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2226   COSTS_N_INSNS (20),                   /* cost of FSQRT instruction.  */
2227
2228   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2229   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2230   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2231   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
2232   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
2233   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
2234   COSTS_N_INSNS (11),                   /* cost of DIVSS instruction.  */
2235   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
2236   COSTS_N_INSNS (12),                   /* cost of SQRTSS instruction.  */
2237   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
2238   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2239   skylake_memcpy,
2240   skylake_memset,
2241   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2242   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2243   "16:11:8",                            /* Loop alignment.  */
2244   "16:11:8",                            /* Jump alignment.  */
2245   "0:0:8",                              /* Label alignment.  */
2246   "16",                                 /* Func alignment.  */
2247   4,                                    /* Small unroll limit.  */
2248   2,                                    /* Small unroll factor.  */
2249 };
2250
2251 /* icelake_cost should produce code tuned for Icelake family of CPUs.
2252    NB: rep_prefix_1_byte is used only for known size. */
2253
2254 static stringop_algs icelake_memcpy[2] =   {  /* memcpy strategy table; contents match skylake_memcpy.  */
2255   {libcall,                             /* Both array entries are identical.  */
2256    {{256, rep_prefix_1_byte, true},     /* NOTE(review): presumably {size limit, algorithm, flag} -- confirm against stringop_algs.  */
2257     {256, loop, false},
2258     {-1, libcall, false}}},
2259   {libcall,
2260    {{256, rep_prefix_1_byte, true},
2261     {256, loop, false},
2262     {-1, libcall, false}}}};
2263
2264 static stringop_algs icelake_memset[2] = {  /* memset strategy table; contents match skylake_memset.  */
2265   {libcall,                             /* Both array entries are identical.  */
2266    {{256, rep_prefix_1_byte, true},     /* NOTE(review): presumably {size limit, algorithm, flag} -- confirm against stringop_algs.  */
2267     {256, loop, false},
2268     {-1, libcall, false}}},
2269   {libcall,
2270    {{256, rep_prefix_1_byte, true},
2271     {256, loop, false},
2272     {-1, libcall, false}}}};
2273
2274 static const
2275 struct processor_costs icelake_cost = {
2276   {
2277   /* Start of register allocator costs.  integer->integer move cost is 2. */
2278   6,                                 /* cost for loading QImode using movzbl */
2279   {4, 4, 4},                            /* cost of loading integer registers
2280                                            in QImode, HImode and SImode.
2281                                            Relative to reg-reg move (2).  */
2282   {6, 6, 6},                            /* cost of storing integer registers */
2283   2,                                    /* cost of reg,reg fld/fst */
2284   {6, 6, 8},                            /* cost of loading fp registers
2285                                            in SFmode, DFmode and XFmode */
2286   {6, 6, 10},                           /* cost of storing fp registers
2287                                            in SFmode, DFmode and XFmode */
2288   2,                                    /* cost of moving MMX register */
2289   {6, 6},                               /* cost of loading MMX registers
2290                                            in SImode and DImode */
2291   {6, 6},                               /* cost of storing MMX registers
2292                                            in SImode and DImode */
2293   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2294   {6, 6, 6, 10, 20},                    /* cost of loading SSE registers
2295                                            in 32,64,128,256 and 512-bit */
2296   {8, 8, 8, 12, 24},                    /* cost of storing SSE registers
2297                                            in 32,64,128,256 and 512-bit */
2298   6, 6,                         /* SSE->integer and integer->SSE moves */
2299   6, 6,                         /* mask->integer and integer->mask moves */
2300   {8, 8, 8},                            /* cost of loading mask register
2301                                            in QImode, HImode, SImode.  */
2302   {6, 6, 6},                            /* cost if storing mask register
2303                                            in QImode, HImode, SImode.  */
2304   3,                                    /* cost of moving mask register.  */
2305   /* End of register allocator costs.  */
2306   },
2307
2308   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2309   COSTS_N_INSNS (1)+1,          /* cost of a lea instruction */
2310   COSTS_N_INSNS (1),                    /* variable shift costs */
2311   COSTS_N_INSNS (1),                    /* constant shift costs */
2312   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2313    COSTS_N_INSNS (3),                   /*                               HI */
2314    COSTS_N_INSNS (3),                   /*                               SI */
2315    COSTS_N_INSNS (3),                   /*                               DI */
2316    COSTS_N_INSNS (3)},                  /*                            other */
2317   0,                                    /* cost of multiply per each bit set */
2318   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2319      model is not realistic. We compensate by increasing the latencies a bit.  */
2320   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
2321    COSTS_N_INSNS (11),                  /*                          HI */
2322    COSTS_N_INSNS (14),                  /*                          SI */
2323    COSTS_N_INSNS (76),                  /*                          DI */
2324    COSTS_N_INSNS (76)},                 /*                          other */
2325   COSTS_N_INSNS (1),                    /* cost of movsx */
2326   COSTS_N_INSNS (0),                    /* cost of movzx */
2327   8,                                    /* "large" insn */
2328   17,                                   /* MOVE_RATIO */
2329   17,                                   /* CLEAR_RATIO */
2330   {6, 6, 6},                            /* cost of loading integer registers
2331                                            in QImode, HImode and SImode.
2332                                            Relative to reg-reg move (2).  */
2333   {8, 8, 8},                            /* cost of storing integer registers */
2334   {8, 8, 8, 8, 16},                     /* cost of loading SSE register
2335                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2336   {8, 8, 8, 8, 16},                     /* cost of storing SSE register
2337                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2338   {8, 8, 8, 8, 16},                     /* cost of unaligned loads.  */
2339   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
2340   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2341   6,                                    /* cost of moving SSE register to integer.  */
2342   20, 8,                                /* Gather load static, per_elt.  */
2343   22, 10,                               /* Gather store static, per_elt.  */
2344   64,                                   /* size of l1 cache.  */
2345   512,                                  /* size of l2 cache.  */
2346   64,                                   /* size of prefetch block */
2347   6,                                    /* number of parallel prefetches */
2348   3,                                    /* Branch cost */
2349   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2350   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
2351   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2352   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2353   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2354   COSTS_N_INSNS (20),                   /* cost of FSQRT instruction.  */
2355
2356   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2357   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2358   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2359   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
2360   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
2361   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
2362   COSTS_N_INSNS (11),                   /* cost of DIVSS instruction.  */
2363   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
2364   COSTS_N_INSNS (12),                   /* cost of SQRTSS instruction.  */
2365   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
2366   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2367   icelake_memcpy,
2368   icelake_memset,
2369   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2370   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2371   "16:11:8",                            /* Loop alignment.  */
2372   "16:11:8",                            /* Jump alignment.  */
2373   "0:0:8",                              /* Label alignment.  */
2374   "16",                                 /* Func alignment.  */
2375   4,                                    /* Small unroll limit.  */
2376   2,                                    /* Small unroll factor.  */
2377 };
2378
2379 /* alderlake_cost should produce code tuned for alderlake family of CPUs.  */
2380 static stringop_algs alderlake_memcpy[2] = { /* memcpy strategy table; referenced by alderlake_cost.  */
2381   {libcall,                        /* Element [0].  NOTE(review): presumably the 32-bit variant -- confirm.  */
2382    {{256, rep_prefix_1_byte, true}, /* NOTE(review): presumably {max block size, algorithm, noalign} -- confirm.  */
2383     {256, loop, false},
2384     {-1, libcall, false}}},         /* NOTE(review): -1 presumably means "no upper size bound" -- confirm.  */
2385   {libcall,                        /* Element [1]; same contents as element [0].  */
2386    {{256, rep_prefix_1_byte, true},
2387     {256, loop, false},
2388     {-1, libcall, false}}}};
2389 static stringop_algs alderlake_memset[2] = { /* memset strategy table; referenced by alderlake_cost.  */
2390   {libcall,                        /* Element [0]; entries match alderlake_memcpy.  */
2391    {{256, rep_prefix_1_byte, true}, /* NOTE(review): presumably {max block size, algorithm, noalign} -- confirm.  */
2392     {256, loop, false},
2393     {-1, libcall, false}}},
2394   {libcall,                        /* Element [1]; same contents as element [0].  */
2395    {{256, rep_prefix_1_byte, true},
2396     {256, loop, false},
2397     {-1, libcall, false}}}};
2398 static const
2399 struct processor_costs alderlake_cost = { /* Cost table for alderlake tuning; initializers are positional, so order matters.  */
2400   {
2401   /* Start of register allocator costs.  integer->integer move cost is 2.  */
2402   6,                                 /* cost for loading QImode using movzbl */
2403   {6, 6, 6},                            /* cost of loading integer registers
2404                                            in QImode, HImode and SImode.
2405                                            Relative to reg-reg move (2).  */
2406   {6, 6, 6},                            /* cost of storing integer registers */
2407   4,                                    /* cost of reg,reg fld/fst */
2408   {6, 6, 12},                           /* cost of loading fp registers
2409                                            in SFmode, DFmode and XFmode */
2410   {6, 6, 12},                           /* cost of storing fp registers
2411                                            in SFmode, DFmode and XFmode */
2412   2,                                    /* cost of moving MMX register */
2413   {6, 6},                               /* cost of loading MMX registers
2414                                            in SImode and DImode */
2415   {6, 6},                               /* cost of storing MMX registers
2416                                            in SImode and DImode */
2417   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
2418   {6, 6, 6, 10, 15},                    /* cost of loading SSE registers
2419                                            in 32,64,128,256 and 512-bit */
2420   {6, 6, 6, 10, 15},                    /* cost of storing SSE registers
2421                                            in 32,64,128,256 and 512-bit */
2422   6, 6,                         /* SSE->integer and integer->SSE moves */
2423   6, 6,                         /* mask->integer and integer->mask moves */
2424   {6, 6, 6},                            /* cost of loading mask register
2425                                            in QImode, HImode, SImode.  */
2426   {6, 6, 6},                    /* cost of storing mask register
2427                                            in QImode, HImode, SImode.  */
2428   2,                                    /* cost of moving mask register.  */
2429   /* End of register allocator costs.  */
2430   },
2431
2432   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2433   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2434   COSTS_N_INSNS (1),                    /* variable shift costs */
2435   COSTS_N_INSNS (1),                    /* constant shift costs */
2436   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2437    COSTS_N_INSNS (3),                   /*                               HI */
2438    COSTS_N_INSNS (3),                   /*                               SI */
2439    COSTS_N_INSNS (3),                   /*                               DI */
2440    COSTS_N_INSNS (4)},                  /*                            other */
2441   0,                                    /* cost of multiply per each bit set */
2442   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI */
2443    COSTS_N_INSNS (22),                  /*                          HI */
2444    COSTS_N_INSNS (30),                  /*                          SI */
2445    COSTS_N_INSNS (74),                  /*                          DI */
2446    COSTS_N_INSNS (74)},                 /*                          other */
2447   COSTS_N_INSNS (1),                    /* cost of movsx */
2448   COSTS_N_INSNS (1),                    /* cost of movzx */
2449   8,                                    /* "large" insn */
2450   17,                                   /* MOVE_RATIO */
2451   17,                                   /* CLEAR_RATIO */
2452   {6, 6, 6},                            /* cost of loading integer registers
2453                                            in QImode, HImode and SImode.
2454                                            Relative to reg-reg move (2).  */
2455   {8, 8, 8},                            /* cost of storing integer registers */
2456   {8, 8, 8, 10, 15},                    /* cost of loading SSE register
2457                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2458   {8, 8, 8, 10, 15},                    /* cost of storing SSE register
2459                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2460   {8, 8, 8, 10, 15},                    /* cost of unaligned loads.  */
2461   {8, 8, 8, 10, 15},                    /* cost of unaligned stores.  */
2462   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
2463   6,                                    /* cost of moving SSE register to integer.  */
2464   18, 6,                                /* Gather load static, per_elt.  */
2465   18, 6,                                /* Gather store static, per_elt.  */
2466   32,                                   /* size of l1 cache.  */
2467   512,                                  /* size of l2 cache.  */
2468   64,                                   /* size of prefetch block */
2469   6,                                    /* number of parallel prefetches */
2470   3,                                    /* Branch cost */
2471   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2472   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
2473   COSTS_N_INSNS (17),                   /* cost of FDIV instruction.  */
2474   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2475   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2476   COSTS_N_INSNS (14),                   /* cost of FSQRT instruction.  */
2477
2478   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2479   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2480   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2481   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2482   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2483   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2484   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
2485   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
2486   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
2487   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
2488   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
2489   alderlake_memcpy,                     /* memcpy strategy table defined above.  */
2490   alderlake_memset,                     /* memset strategy table defined above.  */
2491   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
2492   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
2493   "16:11:8",                            /* Loop alignment.  */
2494   "16:11:8",                            /* Jump alignment.  */
2495   "0:0:8",                              /* Label alignment.  */
2496   "16",                                 /* Func alignment.  */
2497   4,                                    /* Small unroll limit.  */
2498   2,                                    /* Small unroll factor.  */
2499 };
2500
2501   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
2502      very small blocks it is better to use loop. For large blocks, libcall can
2503      do nontemporal accesses and beat inline considerably.  */
2504 static stringop_algs btver1_memcpy[2] = { /* memcpy strategy table; referenced by btver1_cost.  */
2505   {libcall, {{6, loop, false}, {14, unrolled_loop, false}, /* Element [0].  NOTE(review): presumably the 32-bit variant -- confirm.  */
2506              {-1, rep_prefix_4_byte, false}}},
2507   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, /* Element [1].  NOTE(review): presumably the 64-bit variant -- confirm.  */
2508              {-1, libcall, false}}}};
2509 static stringop_algs btver1_memset[2] = { /* memset strategy table; referenced by btver1_cost.  */
2510   {libcall, {{8, loop, false}, {24, unrolled_loop, false}, /* Element [0].  NOTE(review): presumably the 32-bit variant -- confirm.  */
2511              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2512   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, /* Element [1].  NOTE(review): presumably the 64-bit variant -- confirm.  */
2513              {-1, libcall, false}}}};
2514 const struct processor_costs btver1_cost = { /* Cost table for btver1 tuning; initializers are positional, so order matters.  */
2515   {
2516   /* Start of register allocator costs.  integer->integer move cost is 2. */
2517   8,                                 /* cost for loading QImode using movzbl */
2518   {6, 8, 6},                            /* cost of loading integer registers
2519                                            in QImode, HImode and SImode.
2520                                            Relative to reg-reg move (2).  */
2521   {6, 8, 6},                            /* cost of storing integer registers */
2522   4,                                    /* cost of reg,reg fld/fst */
2523   {12, 12, 28},                         /* cost of loading fp registers
2524                                            in SFmode, DFmode and XFmode */
2525   {12, 12, 38},                         /* cost of storing fp registers
2526                                            in SFmode, DFmode and XFmode */
2527   4,                                    /* cost of moving MMX register */
2528   {10, 10},                             /* cost of loading MMX registers
2529                                            in SImode and DImode */
2530   {12, 12},                             /* cost of storing MMX registers
2531                                            in SImode and DImode */
2532   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2533   {10, 10, 12, 48, 96},                 /* cost of loading SSE registers
2534                                            in 32,64,128,256 and 512-bit */
2535   {10, 10, 12, 48, 96},                 /* cost of storing SSE registers
2536                                            in 32,64,128,256 and 512-bit */
2537   14, 14,                               /* SSE->integer and integer->SSE moves */
2538   14, 14,                               /* mask->integer and integer->mask moves */
2539   {6, 8, 6},                            /* cost of loading mask register
2540                                            in QImode, HImode, SImode.  */
2541   {6, 8, 6},                            /* cost of storing mask register
2542                                            in QImode, HImode, SImode.  */
2543   2,                                    /* cost of moving mask register.  */
2544   /* End of register allocator costs.  */
2545   },
2546
2547   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2548   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
2549   COSTS_N_INSNS (1),                    /* variable shift costs */
2550   COSTS_N_INSNS (1),                    /* constant shift costs */
2551   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2552    COSTS_N_INSNS (4),                   /*                               HI */
2553    COSTS_N_INSNS (3),                   /*                               SI */
2554    COSTS_N_INSNS (4),                   /*                               DI */
2555    COSTS_N_INSNS (5)},                  /*                            other */
2556   0,                                    /* cost of multiply per each bit set */
2557   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
2558    COSTS_N_INSNS (35),                  /*                          HI */
2559    COSTS_N_INSNS (51),                  /*                          SI */
2560    COSTS_N_INSNS (83),                  /*                          DI */
2561    COSTS_N_INSNS (83)},                 /*                          other */
2562   COSTS_N_INSNS (1),                    /* cost of movsx */
2563   COSTS_N_INSNS (1),                    /* cost of movzx */
2564   8,                                    /* "large" insn */
2565   9,                                    /* MOVE_RATIO */
2566   6,                                    /* CLEAR_RATIO */
2567   {6, 8, 6},                            /* cost of loading integer registers
2568                                            in QImode, HImode and SImode.
2569                                            Relative to reg-reg move (2).  */
2570   {6, 8, 6},                            /* cost of storing integer registers */
2571   {10, 10, 12, 48, 96},                 /* cost of loading SSE register
2572                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2573   {10, 10, 12, 48, 96},                 /* cost of storing SSE register
2574                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2575   {10, 10, 12, 48, 96},                 /* cost of unaligned loads.  */
2576   {10, 10, 12, 48, 96},                 /* cost of unaligned stores.  */
2577   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2578   14,                                   /* cost of moving SSE register to integer.  */
2579   10, 10,                               /* Gather load static, per_elt.  */
2580   10, 10,                               /* Gather store static, per_elt.  */
2581   32,                                   /* size of l1 cache.  */
2582   512,                                  /* size of l2 cache.  */
2583   64,                                   /* size of prefetch block */
2584   100,                                  /* number of parallel prefetches */
2585   2,                                    /* Branch cost */
2586   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
2587   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
2588   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
2589   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
2590   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
2591   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
2592
2593   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2594   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2595   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
2596   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
2597   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2598   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2599   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
2600   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
2601   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
2602   COSTS_N_INSNS (48),                   /* cost of SQRTSD instruction.  */
2603   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2604   btver1_memcpy,                        /* memcpy strategy table defined above.  */
2605   btver1_memset,                        /* memset strategy table defined above.  */
2606   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
2607   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2608   "16:11:8",                            /* Loop alignment.  */
2609   "16:8:8",                             /* Jump alignment.  */
2610   "0:0:8",                              /* Label alignment.  */
2611   "11",                                 /* Func alignment.  */
2612   4,                                    /* Small unroll limit.  */
2613   2,                                    /* Small unroll factor.  */
2614 };
2615
2616 static stringop_algs btver2_memcpy[2] = { /* memcpy strategy table; referenced by btver2_cost.  Entries match btver1_memcpy.  */
2617   {libcall, {{6, loop, false}, {14, unrolled_loop, false}, /* Element [0].  NOTE(review): presumably the 32-bit variant -- confirm.  */
2618              {-1, rep_prefix_4_byte, false}}},
2619   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, /* Element [1].  NOTE(review): presumably the 64-bit variant -- confirm.  */
2620              {-1, libcall, false}}}};
2621 static stringop_algs btver2_memset[2] = { /* memset strategy table; referenced by btver2_cost.  Entries match btver1_memset.  */
2622   {libcall, {{8, loop, false}, {24, unrolled_loop, false}, /* Element [0].  NOTE(review): presumably the 32-bit variant -- confirm.  */
2623              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2624   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, /* Element [1].  NOTE(review): presumably the 64-bit variant -- confirm.  */
2625              {-1, libcall, false}}}};
2626 const struct processor_costs btver2_cost = { /* Cost table for btver2 tuning; initializers are positional, so order matters.  */
2627   {
2628   /* Start of register allocator costs.  integer->integer move cost is 2. */
2629   8,                                 /* cost for loading QImode using movzbl */
2630   {8, 8, 6},                            /* cost of loading integer registers
2631                                            in QImode, HImode and SImode.
2632                                            Relative to reg-reg move (2).  */
2633   {8, 8, 6},                            /* cost of storing integer registers */
2634   4,                                    /* cost of reg,reg fld/fst */
2635   {12, 12, 28},                         /* cost of loading fp registers
2636                                            in SFmode, DFmode and XFmode */
2637   {12, 12, 38},                         /* cost of storing fp registers
2638                                            in SFmode, DFmode and XFmode */
2639   4,                                    /* cost of moving MMX register */
2640   {10, 10},                             /* cost of loading MMX registers
2641                                            in SImode and DImode */
2642   {12, 12},                             /* cost of storing MMX registers
2643                                            in SImode and DImode */
2644   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2645   {10, 10, 12, 48, 96},                 /* cost of loading SSE registers
2646                                            in 32,64,128,256 and 512-bit */
2647   {10, 10, 12, 48, 96},                 /* cost of storing SSE registers
2648                                            in 32,64,128,256 and 512-bit */
2649   14, 14,                               /* SSE->integer and integer->SSE moves */
2650   14, 14,                               /* mask->integer and integer->mask moves */
2651   {8, 8, 6},                            /* cost of loading mask register
2652                                            in QImode, HImode, SImode.  */
2653   {8, 8, 6},                            /* cost of storing mask register
2654                                            in QImode, HImode, SImode.  */
2655   2,                                    /* cost of moving mask register.  */
2656   /* End of register allocator costs.  */
2657   },
2658
2659   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2660   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
2661   COSTS_N_INSNS (1),                    /* variable shift costs */
2662   COSTS_N_INSNS (1),                    /* constant shift costs */
2663   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2664    COSTS_N_INSNS (4),                   /*                               HI */
2665    COSTS_N_INSNS (3),                   /*                               SI */
2666    COSTS_N_INSNS (4),                   /*                               DI */
2667    COSTS_N_INSNS (5)},                  /*                            other */
2668   0,                                    /* cost of multiply per each bit set */
2669   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
2670    COSTS_N_INSNS (35),                  /*                          HI */
2671    COSTS_N_INSNS (51),                  /*                          SI */
2672    COSTS_N_INSNS (83),                  /*                          DI */
2673    COSTS_N_INSNS (83)},                 /*                          other */
2674   COSTS_N_INSNS (1),                    /* cost of movsx */
2675   COSTS_N_INSNS (1),                    /* cost of movzx */
2676   8,                                    /* "large" insn */
2677   9,                                    /* MOVE_RATIO */
2678   6,                                    /* CLEAR_RATIO */
2679   {8, 8, 6},                            /* cost of loading integer registers
2680                                            in QImode, HImode and SImode.
2681                                            Relative to reg-reg move (2).  */
2682   {8, 8, 6},                            /* cost of storing integer registers */
2683   {10, 10, 12, 48, 96},                 /* cost of loading SSE register
2684                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2685   {10, 10, 12, 48, 96},                 /* cost of storing SSE register
2686                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2687   {10, 10, 12, 48, 96},                 /* cost of unaligned loads.  */
2688   {10, 10, 12, 48, 96},                 /* cost of unaligned stores.  */
2689   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2690   14,                                   /* cost of moving SSE register to integer.  */
2691   10, 10,                               /* Gather load static, per_elt.  */
2692   10, 10,                               /* Gather store static, per_elt.  */
2693   32,                                   /* size of l1 cache.  */
2694   2048,                                 /* size of l2 cache.  */
2695   64,                                   /* size of prefetch block */
2696   100,                                  /* number of parallel prefetches */
2697   2,                                    /* Branch cost */
2698   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
2699   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
2700   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
2701   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
2702   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
2703   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
2704
2705   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2706   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2707   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
2708   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
2709   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2710   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2711   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
2712   COSTS_N_INSNS (19),                   /* cost of DIVSD instruction.  */
2713   COSTS_N_INSNS (16),                   /* cost of SQRTSS instruction.  */
2714   COSTS_N_INSNS (21),                   /* cost of SQRTSD instruction.  */
2715   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2716   btver2_memcpy,                        /* memcpy strategy table defined above.  */
2717   btver2_memset,                        /* memset strategy table defined above.  */
2718   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
2719   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2720   "16:11:8",                            /* Loop alignment.  */
2721   "16:8:8",                             /* Jump alignment.  */
2722   "0:0:8",                              /* Label alignment.  */
2723   "11",                                 /* Func alignment.  */
2724   4,                                    /* Small unroll limit.  */
2725   2,                                    /* Small unroll factor.  */
2726 };
2727
2728 static stringop_algs pentium4_memcpy[2] = { /* memcpy strategy table; NOTE(review): presumably consumed by pentium4_cost -- confirm.  */
2729   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, /* Element [0]: the only tuned entry.  */
2730   DUMMY_STRINGOP_ALGS}; /* Element [1]: libcall-only placeholder macro defined at the top of the file.  */
2731 static stringop_algs pentium4_memset[2] = { /* memset strategy table; NOTE(review): presumably consumed by pentium4_cost -- confirm.  */
2732   {libcall, {{6, loop_1_byte, false}, {48, loop, false}, /* Element [0]: the only tuned entry.  */
2733              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2734   DUMMY_STRINGOP_ALGS}; /* Element [1]: libcall-only placeholder macro defined at the top of the file.  */
2735
2736 static const
2737 struct processor_costs pentium4_cost = {
2738   {
2739   /* Start of register allocator costs.  integer->integer move cost is 2. */
2740   5,                                 /* cost for loading QImode using movzbl */
2741   {4, 5, 4},                            /* cost of loading integer registers
2742                                            in QImode, HImode and SImode.
2743                                            Relative to reg-reg move (2).  */
2744   {2, 3, 2},                            /* cost of storing integer registers */
2745   12,                                   /* cost of reg,reg fld/fst */
2746   {14, 14, 14},                         /* cost of loading fp registers
2747                                            in SFmode, DFmode and XFmode */
2748   {14, 14, 14},                         /* cost of storing fp registers
2749                                            in SFmode, DFmode and XFmode */
2750   12,                                   /* cost of moving MMX register */
2751   {16, 16},                             /* cost of loading MMX registers
2752                                            in SImode and DImode */
2753   {16, 16},                             /* cost of storing MMX registers
2754                                            in SImode and DImode */
2755   12, 24, 48,                           /* cost of moving XMM,YMM,ZMM register */
2756   {16, 16, 16, 32, 64},                 /* cost of loading SSE registers
2757                                            in 32,64,128,256 and 512-bit */
2758   {16, 16, 16, 32, 64},                 /* cost of storing SSE registers
2759                                            in 32,64,128,256 and 512-bit */
2760   20, 12,                               /* SSE->integer and integer->SSE moves */
2761   20, 12,                               /* mask->integer and integer->mask moves */
2762   {4, 5, 4},                            /* cost of loading mask register
2763                                            in QImode, HImode, SImode.  */
2764   {2, 3, 2},                            /* cost if storing mask register
2765                                            in QImode, HImode, SImode.  */
2766   2,                                    /* cost of moving mask register.  */
2767   /* End of register allocator costs.  */
2768   },
2769
2770   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2771   COSTS_N_INSNS (3),                    /* cost of a lea instruction */
2772   COSTS_N_INSNS (4),                    /* variable shift costs */
2773   COSTS_N_INSNS (4),                    /* constant shift costs */
2774   {COSTS_N_INSNS (15),                  /* cost of starting multiply for QI */
2775    COSTS_N_INSNS (15),                  /*                               HI */
2776    COSTS_N_INSNS (15),                  /*                               SI */
2777    COSTS_N_INSNS (15),                  /*                               DI */
2778    COSTS_N_INSNS (15)},                 /*                            other */
2779   0,                                    /* cost of multiply per each bit set */
2780   {COSTS_N_INSNS (56),                  /* cost of a divide/mod for QI */
2781    COSTS_N_INSNS (56),                  /*                          HI */
2782    COSTS_N_INSNS (56),                  /*                          SI */
2783    COSTS_N_INSNS (56),                  /*                          DI */
2784    COSTS_N_INSNS (56)},                 /*                          other */
2785   COSTS_N_INSNS (1),                    /* cost of movsx */
2786   COSTS_N_INSNS (1),                    /* cost of movzx */
2787   16,                                   /* "large" insn */
2788   6,                                    /* MOVE_RATIO */
2789   6,                                    /* CLEAR_RATIO */
2790   {4, 5, 4},                            /* cost of loading integer registers
2791                                            in QImode, HImode and SImode.
2792                                            Relative to reg-reg move (2).  */
2793   {2, 3, 2},                            /* cost of storing integer registers */
2794   {16, 16, 16, 32, 64},                 /* cost of loading SSE register
2795                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2796   {16, 16, 16, 32, 64},                 /* cost of storing SSE register
2797                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2798   {32, 32, 32, 64, 128},                /* cost of unaligned loads.  */
2799   {32, 32, 32, 64, 128},                /* cost of unaligned stores.  */
2800   12, 24, 48,                           /* cost of moving XMM,YMM,ZMM register */
2801   20,                                   /* cost of moving SSE register to integer.  */
2802   16, 16,                               /* Gather load static, per_elt.  */
2803   16, 16,                               /* Gather store static, per_elt.  */
2804   8,                                    /* size of l1 cache.  */
2805   256,                                  /* size of l2 cache.  */
2806   64,                                   /* size of prefetch block */
2807   6,                                    /* number of parallel prefetches */
2808   2,                                    /* Branch cost */
2809   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
2810   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
2811   COSTS_N_INSNS (43),                   /* cost of FDIV instruction.  */
2812   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
2813   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
2814   COSTS_N_INSNS (43),                   /* cost of FSQRT instruction.  */
2815
2816   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
2817   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2818   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
2819   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
2820   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2821   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2822   COSTS_N_INSNS (23),                   /* cost of DIVSS instruction.  */
2823   COSTS_N_INSNS (38),                   /* cost of DIVSD instruction.  */
2824   COSTS_N_INSNS (23),                   /* cost of SQRTSS instruction.  */
2825   COSTS_N_INSNS (38),                   /* cost of SQRTSD instruction.  */
2826   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2827   pentium4_memcpy,
2828   pentium4_memset,
2829   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2830   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2831   NULL,                                 /* Loop alignment.  */
2832   NULL,                                 /* Jump alignment.  */
2833   NULL,                                 /* Label alignment.  */
2834   NULL,                                 /* Func alignment.  */
2835   4,                                    /* Small unroll limit.  */
2836   2,                                    /* Small unroll factor.  */
2837 };
2838
/* memcpy expansion strategy table referenced by nocona_cost.
   The array has two entries.  Each entry is a leading algorithm followed by
   a list of {size, algorithm, bool} triples in increasing size order; the
   last triple of each list uses -1 as its size.
   NOTE(review): presumably entry [0] is used for 32-bit and entry [1] for
   64-bit code, the leading algorithm applies when the block size is unknown,
   and the bool is a "no alignment needed" flag -- confirm against the
   stringop_algs definition.  */
2839 static stringop_algs nocona_memcpy[2] = {
2840   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2841   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
2842              {100000, unrolled_loop, false}, {-1, libcall, false}}}};
2843
/* memset expansion strategy table referenced by nocona_cost.
   Same layout as the memcpy tables in this file: two entries, each a leading
   algorithm plus {size, algorithm, bool} triples ending with a size of -1.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for 64-bit
   code -- confirm against the stringop_algs definition.  */
2844 static stringop_algs nocona_memset[2] = {
2845   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2846              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2847   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
2848              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2849
/* processor_costs table nocona_cost.  The initializer is positional, so the
   order of the values is significant; the comment beside each value names
   what it sets.  Values wrapped in COSTS_N_INSNS are instruction costs
   relative to an add, while the register allocator block at the top is
   scaled so that an integer reg-reg move costs 2.  The string operation
   strategies come from nocona_memcpy and nocona_memset.  */
2850 static const
2851 struct processor_costs nocona_cost = {
2852   {
2853   /* Start of register allocator costs.  integer->integer move cost is 2. */
2854   4,                                 /* cost for loading QImode using movzbl */
2855   {4, 4, 4},                            /* cost of loading integer registers
2856                                            in QImode, HImode and SImode.
2857                                            Relative to reg-reg move (2).  */
2858   {4, 4, 4},                            /* cost of storing integer registers */
2859   12,                                   /* cost of reg,reg fld/fst */
2860   {14, 14, 14},                         /* cost of loading fp registers
2861                                            in SFmode, DFmode and XFmode */
2862   {14, 14, 14},                         /* cost of storing fp registers
2863                                            in SFmode, DFmode and XFmode */
2864   14,                                   /* cost of moving MMX register */
2865   {12, 12},                             /* cost of loading MMX registers
2866                                            in SImode and DImode */
2867   {12, 12},                             /* cost of storing MMX registers
2868                                            in SImode and DImode */
2869   6, 12, 24,                            /* cost of moving XMM,YMM,ZMM register */
2870   {12, 12, 12, 24, 48},                 /* cost of loading SSE registers
2871                                            in 32,64,128,256 and 512-bit */
2872   {12, 12, 12, 24, 48},                 /* cost of storing SSE registers
2873                                            in 32,64,128,256 and 512-bit */
2874   20, 12,                               /* SSE->integer and integer->SSE moves */
2875   20, 12,                               /* mask->integer and integer->mask moves */
2876   {4, 4, 4},                            /* cost of loading mask register
2877                                            in QImode, HImode, SImode.  */
2878   {4, 4, 4},                            /* cost of storing mask register
2879                                            in QImode, HImode, SImode.  */
2880   2,                                    /* cost of moving mask register.  */
2881   /* End of register allocator costs.  */
2882   },
2883
2884   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2885   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
2886   COSTS_N_INSNS (1),                    /* variable shift costs */
2887   COSTS_N_INSNS (1),                    /* constant shift costs */
2888   {COSTS_N_INSNS (10),                  /* cost of starting multiply for QI */
2889    COSTS_N_INSNS (10),                  /*                               HI */
2890    COSTS_N_INSNS (10),                  /*                               SI */
2891    COSTS_N_INSNS (10),                  /*                               DI */
2892    COSTS_N_INSNS (10)},                 /*                            other */
2893   0,                                    /* cost of multiply per each bit set */
2894   {COSTS_N_INSNS (66),                  /* cost of a divide/mod for QI */
2895    COSTS_N_INSNS (66),                  /*                          HI */
2896    COSTS_N_INSNS (66),                  /*                          SI */
2897    COSTS_N_INSNS (66),                  /*                          DI */
2898    COSTS_N_INSNS (66)},                 /*                          other */
2899   COSTS_N_INSNS (1),                    /* cost of movsx */
2900   COSTS_N_INSNS (1),                    /* cost of movzx */
2901   16,                                   /* "large" insn */
2902   17,                                   /* MOVE_RATIO */
2903   6,                                    /* CLEAR_RATIO */
2904   {4, 4, 4},                            /* cost of loading integer registers
2905                                            in QImode, HImode and SImode.
2906                                            Relative to reg-reg move (2).  */
2907   {4, 4, 4},                            /* cost of storing integer registers */
2908   {12, 12, 12, 24, 48},                 /* cost of loading SSE register
2909                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2910   {12, 12, 12, 24, 48},                 /* cost of storing SSE register
2911                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2912   {24, 24, 24, 48, 96},                 /* cost of unaligned loads.  */
2913   {24, 24, 24, 48, 96},                 /* cost of unaligned stores.  */
2914   6, 12, 24,                            /* cost of moving XMM,YMM,ZMM register */
2915   20,                                   /* cost of moving SSE register to integer.  */
2916   12, 12,                               /* Gather load static, per_elt.  */
2917   12, 12,                               /* Gather store static, per_elt.  */
2918   8,                                    /* size of l1 cache.  */
2919   1024,                                 /* size of l2 cache.  */
2920   64,                                   /* size of prefetch block */
2921   8,                                    /* number of parallel prefetches */
2922   1,                                    /* Branch cost */
2923   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
2924   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2925   COSTS_N_INSNS (40),                   /* cost of FDIV instruction.  */
2926   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
2927   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
2928   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
2929
2930   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
2931   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2932   COSTS_N_INSNS (7),                    /* cost of MULSS instruction.  */
2933   COSTS_N_INSNS (7),                    /* cost of MULSD instruction.  */
2934   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
2935   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
2936   COSTS_N_INSNS (32),                   /* cost of DIVSS instruction.  */
2937   COSTS_N_INSNS (40),                   /* cost of DIVSD instruction.  */
2938   COSTS_N_INSNS (32),                   /* cost of SQRTSS instruction.  */
2939   COSTS_N_INSNS (41),                   /* cost of SQRTSD instruction.  */
2940   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2941   nocona_memcpy,
2942   nocona_memset,
2943   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2944   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2945   NULL,                                 /* Loop alignment.  */
2946   NULL,                                 /* Jump alignment.  */
2947   NULL,                                 /* Label alignment.  */
2948   NULL,                                 /* Func alignment.  */
2949   4,                                    /* Small unroll limit.  */
2950   2,                                    /* Small unroll factor.  */
2951 };
2952
/* memcpy expansion strategy table referenced by atom_cost.
   Two entries, each a leading algorithm followed by {size, algorithm, bool}
   triples in increasing size order, the last one using -1 as its size.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for 64-bit
   code -- confirm against the stringop_algs definition.  */
2953 static stringop_algs atom_memcpy[2] = {
2954   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2955   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2956              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategy table referenced by atom_cost.
   Two entries, each a leading algorithm followed by {size, algorithm, bool}
   triples in increasing size order, the last one using -1 as its size.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for 64-bit
   code -- confirm against the stringop_algs definition.  */
2957 static stringop_algs atom_memset[2] = {
2958   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2959              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2960   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2961              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* processor_costs table atom_cost.  The initializer is positional, so the
   order of the values is significant; the comment beside each value names
   what it sets.  Values wrapped in COSTS_N_INSNS are instruction costs
   relative to an add, while the register allocator block at the top is
   scaled so that an integer reg-reg move costs 2.  The string operation
   strategies come from atom_memcpy and atom_memset.  */
2962 static const
2963 struct processor_costs atom_cost = {
2964   {
2965   /* Start of register allocator costs.  integer->integer move cost is 2. */
2966   6,                                    /* cost for loading QImode using movzbl */
2967   {6, 6, 6},                            /* cost of loading integer registers
2968                                            in QImode, HImode and SImode.
2969                                            Relative to reg-reg move (2).  */
2970   {6, 6, 6},                            /* cost of storing integer registers */
2971   4,                                    /* cost of reg,reg fld/fst */
2972   {6, 6, 18},                           /* cost of loading fp registers
2973                                            in SFmode, DFmode and XFmode */
2974   {14, 14, 24},                         /* cost of storing fp registers
2975                                            in SFmode, DFmode and XFmode */
2976   2,                                    /* cost of moving MMX register */
2977   {8, 8},                               /* cost of loading MMX registers
2978                                            in SImode and DImode */
2979   {10, 10},                             /* cost of storing MMX registers
2980                                            in SImode and DImode */
2981   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2982   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
2983                                            in 32,64,128,256 and 512-bit */
2984   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
2985                                            in 32,64,128,256 and 512-bit */
2986   8, 6,                         /* SSE->integer and integer->SSE moves */
2987   8, 6,                         /* mask->integer and integer->mask moves */
2988   {6, 6, 6},                            /* cost of loading mask register
2989                                            in QImode, HImode, SImode.  */
2990   {6, 6, 6},                    /* cost of storing mask register
2991                                            in QImode, HImode, SImode.  */
2992   2,                                    /* cost of moving mask register.  */
2993   /* End of register allocator costs.  */
2994   },
2995
2996   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2997   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction
                                           (one unit above an add) */
2998   COSTS_N_INSNS (1),                    /* variable shift costs */
2999   COSTS_N_INSNS (1),                    /* constant shift costs */
3000   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3001    COSTS_N_INSNS (4),                   /*                               HI */
3002    COSTS_N_INSNS (3),                   /*                               SI */
3003    COSTS_N_INSNS (4),                   /*                               DI */
3004    COSTS_N_INSNS (2)},                  /*                            other */
3005   0,                                    /* cost of multiply per each bit set */
3006   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
3007    COSTS_N_INSNS (26),                  /*                          HI */
3008    COSTS_N_INSNS (42),                  /*                          SI */
3009    COSTS_N_INSNS (74),                  /*                          DI */
3010    COSTS_N_INSNS (74)},                 /*                          other */
3011   COSTS_N_INSNS (1),                    /* cost of movsx */
3012   COSTS_N_INSNS (1),                    /* cost of movzx */
3013   8,                                    /* "large" insn */
3014   17,                                   /* MOVE_RATIO */
3015   6,                                    /* CLEAR_RATIO */
3016   {6, 6, 6},                            /* cost of loading integer registers
3017                                            in QImode, HImode and SImode.
3018                                            Relative to reg-reg move (2).  */
3019   {6, 6, 6},                            /* cost of storing integer registers */
3020   {8, 8, 8, 16, 32},                    /* cost of loading SSE register
3021                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3022   {8, 8, 8, 16, 32},                    /* cost of storing SSE register
3023                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3024   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
3025   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
3026   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
3027   8,                                    /* cost of moving SSE register to integer.  */
3028   8, 8,                                 /* Gather load static, per_elt.  */
3029   8, 8,                                 /* Gather store static, per_elt.  */
3030   32,                                   /* size of l1 cache.  */
3031   256,                                  /* size of l2 cache.  */
3032   64,                                   /* size of prefetch block */
3033   6,                                    /* number of parallel prefetches */
3034   3,                                    /* Branch cost */
3035   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
3036   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
3037   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
3038   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
3039   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
3040   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
3041
3042   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3043   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3044   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
3045   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
3046   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
3047   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
3048   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
3049   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
3050   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
3051   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
3052   2, 2, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
3053   atom_memcpy,
3054   atom_memset,
3055   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
3056   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
3057   "16",                                 /* Loop alignment.  */
3058   "16:8:8",                             /* Jump alignment.  */
3059   "0:0:8",                              /* Label alignment.  */
3060   "16",                                 /* Func alignment.  */
3061   4,                                    /* Small unroll limit.  */
3062   2,                                    /* Small unroll factor.  */
3063 };
3064
/* memcpy expansion strategy table referenced by slm_cost.
   Two entries, each a leading algorithm followed by {size, algorithm, bool}
   triples in increasing size order, the last one using -1 as its size.
   The values are the same as in atom_memcpy.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for 64-bit
   code -- confirm against the stringop_algs definition.  */
3065 static stringop_algs slm_memcpy[2] = {
3066   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
3067   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
3068              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategy table referenced by slm_cost.
   Two entries, each a leading algorithm followed by {size, algorithm, bool}
   triples in increasing size order, the last one using -1 as its size.
   The values are the same as in atom_memset.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for 64-bit
   code -- confirm against the stringop_algs definition.  */
3069 static stringop_algs slm_memset[2] = {
3070   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
3071              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3072   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
3073              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* processor_costs table slm_cost.  The initializer is positional, so the
   order of the values is significant; the comment beside each value names
   what it sets.  Values wrapped in COSTS_N_INSNS are instruction costs
   relative to an add, while the register allocator block at the top is
   scaled so that an integer reg-reg move costs 2.  The string operation
   strategies come from slm_memcpy and slm_memset.  */
3074 static const
3075 struct processor_costs slm_cost = {
3076   {
3077   /* Start of register allocator costs.  integer->integer move cost is 2. */
3078   8,                                    /* cost for loading QImode using movzbl */
3079   {8, 8, 8},                            /* cost of loading integer registers
3080                                            in QImode, HImode and SImode.
3081                                            Relative to reg-reg move (2).  */
3082   {6, 6, 6},                            /* cost of storing integer registers */
3083   2,                                    /* cost of reg,reg fld/fst */
3084   {8, 8, 18},                           /* cost of loading fp registers
3085                                            in SFmode, DFmode and XFmode */
3086   {6, 6, 18},                           /* cost of storing fp registers
3087                                            in SFmode, DFmode and XFmode */
3088   2,                                    /* cost of moving MMX register */
3089   {8, 8},                               /* cost of loading MMX registers
3090                                            in SImode and DImode */
3091   {6, 6},                               /* cost of storing MMX registers
3092                                            in SImode and DImode */
3093   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
3094   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
3095                                            in 32,64,128,256 and 512-bit */
3096   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
3097                                            in 32,64,128,256 and 512-bit */
3098   8, 6,                         /* SSE->integer and integer->SSE moves */
3099   8, 6,                         /* mask->integer and integer->mask moves */
3100   {8, 8, 8},                    /* cost of loading mask register
3101                                            in QImode, HImode, SImode.  */
3102   {6, 6, 6},                    /* cost of storing mask register
3103                                            in QImode, HImode, SImode.  */
3104   2,                                    /* cost of moving mask register.  */
3105   /* End of register allocator costs.  */
3106   },
3107
3108   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3109   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
3110   COSTS_N_INSNS (1),                    /* variable shift costs */
3111   COSTS_N_INSNS (1),                    /* constant shift costs */
3112   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3113    COSTS_N_INSNS (3),                   /*                               HI */
3114    COSTS_N_INSNS (3),                   /*                               SI */
3115    COSTS_N_INSNS (4),                   /*                               DI */
3116    COSTS_N_INSNS (2)},                  /*                            other */
3117   0,                                    /* cost of multiply per each bit set */
3118   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
3119    COSTS_N_INSNS (26),                  /*                          HI */
3120    COSTS_N_INSNS (42),                  /*                          SI */
3121    COSTS_N_INSNS (74),                  /*                          DI */
3122    COSTS_N_INSNS (74)},                 /*                          other */
3123   COSTS_N_INSNS (1),                    /* cost of movsx */
3124   COSTS_N_INSNS (1),                    /* cost of movzx */
3125   8,                                    /* "large" insn */
3126   17,                                   /* MOVE_RATIO */
3127   6,                                    /* CLEAR_RATIO */
3128   {8, 8, 8},                            /* cost of loading integer registers
3129                                            in QImode, HImode and SImode.
3130                                            Relative to reg-reg move (2).  */
3131   {6, 6, 6},                            /* cost of storing integer registers */
3132   {8, 8, 8, 16, 32},                    /* cost of loading SSE register
3133                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3134   {8, 8, 8, 16, 32},                    /* cost of storing SSE register
3135                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3136   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
3137   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
3138   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
3139   8,                                    /* cost of moving SSE register to integer.  */
3140   8, 8,                                 /* Gather load static, per_elt.  */
3141   8, 8,                                 /* Gather store static, per_elt.  */
3142   32,                                   /* size of l1 cache.  */
3143   256,                                  /* size of l2 cache.  */
3144   64,                                   /* size of prefetch block */
3145   6,                                    /* number of parallel prefetches */
3146   3,                                    /* Branch cost */
3147   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
3148   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
3149   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
3150   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
3151   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
3152   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
3153
3154   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3155   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3156   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
3157   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
3158   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
3159   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
3160   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
3161   COSTS_N_INSNS (69),                   /* cost of DIVSD instruction.  */
3162   COSTS_N_INSNS (20),                   /* cost of SQRTSS instruction.  */
3163   COSTS_N_INSNS (35),                   /* cost of SQRTSD instruction.  */
3164   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
3165   slm_memcpy,
3166   slm_memset,
3167   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
3168   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
3169   "16",                                 /* Loop alignment.  */
3170   "16:8:8",                             /* Jump alignment.  */
3171   "0:0:8",                              /* Label alignment.  */
3172   "16",                                 /* Func alignment.  */
3173   4,                                    /* Small unroll limit.  */
3174   2,                                    /* Small unroll factor.  */
3175 };
3176
/* memcpy expansion strategy table referenced by tremont_cost.
   Two entries with identical contents, each a leading algorithm followed by
   {size, algorithm, bool} triples, the last one using -1 as its size.  The
   first two triples share the same size of 256 and differ in algorithm and
   in the bool.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for 64-bit
   code, and the second 256 triple is the alternative used when the first
   algorithm cannot be selected -- confirm against the code that consumes
   stringop_algs.  */
3177 static stringop_algs tremont_memcpy[2] = {
3178   {libcall,
3179    {{256, rep_prefix_1_byte, true},
3180     {256, loop, false},
3181     {-1, libcall, false}}},
3182   {libcall,
3183    {{256, rep_prefix_1_byte, true},
3184     {256, loop, false},
3185     {-1, libcall, false}}}};
/* memset expansion strategy table referenced by tremont_cost.
   Two entries with identical contents, each a leading algorithm followed by
   {size, algorithm, bool} triples, the last one using -1 as its size.  The
   values are the same as in tremont_memcpy.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for 64-bit
   code -- confirm against the stringop_algs definition.  */
3186 static stringop_algs tremont_memset[2] = {
3187   {libcall,
3188    {{256, rep_prefix_1_byte, true},
3189     {256, loop, false},
3190     {-1, libcall, false}}},
3191   {libcall,
3192    {{256, rep_prefix_1_byte, true},
3193     {256, loop, false},
3194     {-1, libcall, false}}}};
3195 static const
3196 struct processor_costs tremont_cost = {
3197   {
3198   /* Start of register allocator costs.  integer->integer move cost is 2. */
3199   6,                                 /* cost for loading QImode using movzbl */
3200   {6, 6, 6},                            /* cost of loading integer registers
3201                                            in QImode, HImode and SImode.
3202                                            Relative to reg-reg move (2).  */
3203   {6, 6, 6},                            /* cost of storing integer registers */
3204   4,                                    /* cost of reg,reg fld/fst */
3205   {6, 6, 12},                           /* cost of loading fp registers
3206                                            in SFmode, DFmode and XFmode */
3207   {6, 6, 12},                           /* cost of storing fp registers
3208                                            in SFmode, DFmode and XFmode */
3209   2,                                    /* cost of moving MMX register */
3210   {6, 6},                               /* cost of loading MMX registers
3211                                            in SImode and DImode */
3212   {6, 6},                               /* cost of storing MMX registers
3213                                            in SImode and DImode */
3214   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
3215   {6, 6, 6, 10, 15},                    /* cost of loading SSE registers
3216                                            in 32,64,128,256 and 512-bit */
3217   {6, 6, 6, 10, 15},                    /* cost of storing SSE registers
3218                                            in 32,64,128,256 and 512-bit */
3219   6, 6,                         /* SSE->integer and integer->SSE moves */
3220   6, 6,                         /* mask->integer and integer->mask moves */
3221   {6, 6, 6},                            /* cost of loading mask register
3222                                            in QImode, HImode, SImode.  */
3223   {6, 6, 6},                    /* cost if storing mask register
3224                                            in QImode, HImode, SImode.  */
3225   2,                                    /* cost of moving mask register.  */
3226   /* End of register allocator costs.  */
3227   },
3228
3229   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3230   /* Setting cost to 2 makes our current implementation of synth_mult result in
3231      use of unnecessary temporary registers causing regression on several
3232      SPECfp benchmarks.  */
3233   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
3234   COSTS_N_INSNS (1),                    /* variable shift costs */
3235   COSTS_N_INSNS (1),                    /* constant shift costs */
3236   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3237    COSTS_N_INSNS (3),                   /*                               HI */
3238    COSTS_N_INSNS (3),                   /*                               SI */
3239    COSTS_N_INSNS (3),                   /*                               DI */
3240    COSTS_N_INSNS (4)},                  /*                            other */
3241   0,                                    /* cost of multiply per each bit set */
3242   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI */
3243    COSTS_N_INSNS (22),                  /*                          HI */
3244    COSTS_N_INSNS (30),                  /*                          SI */
3245    COSTS_N_INSNS (74),                  /*                          DI */
3246    COSTS_N_INSNS (74)},                 /*                          other */
3247   COSTS_N_INSNS (1),                    /* cost of movsx */
3248   COSTS_N_INSNS (1),                    /* cost of movzx */
3249   8,                                    /* "large" insn */
3250   17,                                   /* MOVE_RATIO */
3251   17,                                   /* CLEAR_RATIO */
3252   {6, 6, 6},                            /* cost of loading integer registers
3253                                            in QImode, HImode and SImode.
3254                                            Relative to reg-reg move (2).  */
3255   {6, 6, 6},                            /* cost of storing integer registers */
3256   {6, 6, 6, 10, 15},                    /* cost of loading SSE register
3257                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3258   {6, 6, 6, 10, 15},                    /* cost of storing SSE register
3259                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3260   {6, 6, 6, 10, 15},                    /* cost of unaligned loads.  */
3261   {6, 6, 6, 10, 15},                    /* cost of unaligned storess.  */
3262   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
3263   6,                                    /* cost of moving SSE register to integer.  */
3264   18, 6,                                /* Gather load static, per_elt.  */
3265   18, 6,                                /* Gather store static, per_elt.  */
3266   32,                                   /* size of l1 cache.  */
3267   512,                                  /* size of l2 cache.  */
3268   64,                                   /* size of prefetch block */
3269   6,                                    /* number of parallel prefetches */
3270   /* Benchmarks shows large regressions on K8 sixtrack benchmark when this
3271      value is increased to perhaps more appropriate value of 5.  */
3272   3,                                    /* Branch cost */
3273   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3274   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
3275   COSTS_N_INSNS (17),                   /* cost of FDIV instruction.  */
3276   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
3277   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
3278   COSTS_N_INSNS (14),                   /* cost of FSQRT instruction.  */
3279
3280   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3281   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3282   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
3283   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
3284   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
3285   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
3286   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
3287   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
3288   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
3289   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
3290   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
3291   tremont_memcpy,
3292   tremont_memset,
3293   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
3294   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
3295   "16:11:8",                            /* Loop alignment.  */
3296   "16:11:8",                            /* Jump alignment.  */
3297   "0:0:8",                              /* Label alignment.  */
3298   "16",                                 /* Func alignment.  */
3299   4,                                    /* Small unroll limit.  */
3300   2,                                    /* Small unroll factor.  */
3301 };
3302
/* memcpy expansion strategies referenced by intel_cost below.  The array
   has two entries; each pairs a default algorithm (libcall here) with a
   list of {threshold, algorithm, flag} triples terminated by a -1
   threshold.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and each triple is {max size, alg, noalign} -- confirm
   against the stringop_algs definition, which is not visible here.  */
3303 static stringop_algs intel_memcpy[2] = {
3304   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
3305   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
3306              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategies referenced by intel_cost below.  Same
   layout as the memcpy table: two entries, each a default algorithm
   followed by {threshold, algorithm, flag} triples ending in a -1
   threshold.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs definition.  */
3307 static stringop_algs intel_memset[2] = {
3308   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
3309              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3310   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
3311              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table intel_cost.  The leading nested brace group holds the
   register allocator costs; the entries after it give per-instruction
   costs, load/store and cache parameters, the intel_memcpy/intel_memset
   string-operation strategies, branch costs, alignment strings and
   unroll limits.  All initializers are positional, so their order must
   match the field order of struct processor_costs.
   NOTE(review): presumably this is the table selected by -mtune=intel;
   confirm where intel_cost is referenced.  */
3312 static const
3313 struct processor_costs intel_cost = {
3314   {
3315   /* Start of register allocator costs.  integer->integer move cost is 2. */
3316   6,                                 /* cost for loading QImode using movzbl */
3317   {4, 4, 4},                            /* cost of loading integer registers
3318                                            in QImode, HImode and SImode.
3319                                            Relative to reg-reg move (2).  */
3320   {6, 6, 6},                            /* cost of storing integer registers */
3321   2,                                    /* cost of reg,reg fld/fst */
3322   {6, 6, 8},                            /* cost of loading fp registers
3323                                            in SFmode, DFmode and XFmode */
3324   {6, 6, 10},                           /* cost of storing fp registers
3325                                            in SFmode, DFmode and XFmode */
3326   2,                                    /* cost of moving MMX register */
3327   {6, 6},                               /* cost of loading MMX registers
3328                                            in SImode and DImode */
3329   {6, 6},                               /* cost of storing MMX registers
3330                                            in SImode and DImode */
3331   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM register */
3332   {6, 6, 6, 6, 6},                      /* cost of loading SSE registers
3333                                            in 32,64,128,256 and 512-bit */
3334   {6, 6, 6, 6, 6},                      /* cost of storing SSE registers
3335                                            in 32,64,128,256 and 512-bit */
3336   4, 4,                         /* SSE->integer and integer->SSE moves */
3337   4, 4,                         /* mask->integer and integer->mask moves */
3338   {4, 4, 4},                            /* cost of loading mask register
3339                                            in QImode, HImode, SImode.  */
3340   {6, 6, 6},                            /* cost of storing mask register
3341                                            in QImode, HImode, SImode.  */
3342   2,                                    /* cost of moving mask register.  */
3343   /* End of register allocator costs.  */
3344   },
3345
3346   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3347   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
3348   COSTS_N_INSNS (1),                    /* variable shift costs */
3349   COSTS_N_INSNS (1),                    /* constant shift costs */
3350   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3351    COSTS_N_INSNS (3),                   /*                               HI */
3352    COSTS_N_INSNS (3),                   /*                               SI */
3353    COSTS_N_INSNS (4),                   /*                               DI */
3354    COSTS_N_INSNS (2)},                  /*                            other */
3355   0,                                    /* cost of multiply per each bit set */
3356   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
3357    COSTS_N_INSNS (26),                  /*                          HI */
3358    COSTS_N_INSNS (42),                  /*                          SI */
3359    COSTS_N_INSNS (74),                  /*                          DI */
3360    COSTS_N_INSNS (74)},                 /*                          other */
3361   COSTS_N_INSNS (1),                    /* cost of movsx */
3362   COSTS_N_INSNS (1),                    /* cost of movzx */
3363   8,                                    /* "large" insn */
3364   17,                                   /* MOVE_RATIO */
3365   6,                                    /* CLEAR_RATIO */
3366   {4, 4, 4},                            /* cost of loading integer registers
3367                                            in QImode, HImode and SImode.
3368                                            Relative to reg-reg move (2).  */
3369   {6, 6, 6},                            /* cost of storing integer registers */
3370   {6, 6, 6, 6, 6},                      /* cost of loading SSE register
3371                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3372   {6, 6, 6, 6, 6},                      /* cost of storing SSE register
3373                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3374   {10, 10, 10, 10, 10},                 /* cost of unaligned loads.  */
3375   {10, 10, 10, 10, 10},                 /* cost of unaligned stores.  */
3376   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM register */
3377   4,                                    /* cost of moving SSE register to integer.  */
3378   6, 6,                                 /* Gather load static, per_elt.  */
3379   6, 6,                                 /* Gather store static, per_elt.  */
3380   32,                                   /* size of l1 cache.  */
3381   256,                                  /* size of l2 cache.  */
3382   64,                                   /* size of prefetch block */
3383   6,                                    /* number of parallel prefetches */
3384   3,                                    /* Branch cost */
3385   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
3386   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
3387   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
3388   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
3389   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
3390   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
3391
3392   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3393   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3394   COSTS_N_INSNS (8),                    /* cost of MULSS instruction.  */
3395   COSTS_N_INSNS (8),                    /* cost of MULSD instruction.  */
3396   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
3397   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
3398   COSTS_N_INSNS (20),                   /* cost of DIVSS instruction.  */
3399   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
3400   COSTS_N_INSNS (40),                   /* cost of SQRTSS instruction.  */
3401   COSTS_N_INSNS (40),                   /* cost of SQRTSD instruction.  */
3402   1, 4, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
3403   intel_memcpy,
3404   intel_memset,
3405   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
3406   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
3407   "16",                                 /* Loop alignment.  */
3408   "16:8:8",                             /* Jump alignment.  */
3409   "0:0:8",                              /* Label alignment.  */
3410   "16",                                 /* Func alignment.  */
3411   4,                                    /* Small unroll limit.  */
3412   2,                                    /* Small unroll factor.  */
3413 };
3414
3415 /* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU.  */
/* memcpy expansion strategies referenced by lujiazui_cost below.  Two
   entries, each a default algorithm (libcall) followed by
   {threshold, algorithm, flag} triples ending in a -1 threshold.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and the third triple member is a "no alignment" flag -- confirm
   against the stringop_algs definition.  */
3416 static stringop_algs lujiazui_memcpy[2] = {
3417   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3418                          {-1, libcall, false}}},
3419   {libcall, {{12, unrolled_loop, true}, {32, loop, false},
3420                          {6144, rep_prefix_8_byte, false},
3421                          {-1, libcall, false}}}};
/* memset expansion strategies referenced by lujiazui_cost below.  Two
   entries, each a default algorithm (libcall) followed by
   {threshold, algorithm, flag} triples ending in a -1 threshold.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs definition.  */
3422 static stringop_algs lujiazui_memset[2] = {
3423   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3424                          {-1, libcall, false}}},
3425   {libcall, {{12, loop, true}, {32, loop, false},
3426                          {640, rep_prefix_8_byte, false},
3427                          {-1, libcall, false}}}};
/* Cost table lujiazui_cost.  The leading nested brace group holds the
   register allocator costs; the entries after it give per-instruction
   costs, load/store and cache parameters, the
   lujiazui_memcpy/lujiazui_memset string-operation strategies, branch
   costs, alignment strings and unroll limits.  All initializers are
   positional, so their order must match the field order of
   struct processor_costs.  */
3428 static const
3429 struct processor_costs lujiazui_cost = {
3430   {
3431   /* Start of register allocator costs.  integer->integer move cost is 2.  */
3432   6,                            /* cost for loading QImode using movzbl.  */
3433   {6, 6, 6},                    /* cost of loading integer registers
3434                                            in QImode, HImode and SImode.
3435                                            Relative to reg-reg move (2).  */
3436   {6, 6, 6},                    /* cost of storing integer registers.  */
3437   2,                                    /* cost of reg,reg fld/fst.  */
3438   {6, 6, 8},                    /* cost of loading fp registers
3439                                 in SFmode, DFmode and XFmode.  */
3440   {6, 6, 8},                    /* cost of storing fp registers
3441                                 in SFmode, DFmode and XFmode.  */
3442   2,                            /* cost of moving MMX register.  */
3443   {6, 6},                       /* cost of loading MMX registers
3444                                 in SImode and DImode.  */
3445   {6, 6},                       /* cost of storing MMX registers
3446                                 in SImode and DImode.  */
3447   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3448   {6, 6, 6, 10, 15},    /* cost of loading SSE registers
3449                                 in 32,64,128,256 and 512-bit.  */
3450   {6, 6, 6, 10, 15},    /* cost of storing SSE registers
3451                                 in 32,64,128,256 and 512-bit.  */
3452   6, 6,                         /* SSE->integer and integer->SSE moves.  */
3453   6, 6,                         /* mask->integer and integer->mask moves.  */
3454   {6, 6, 6},            /* cost of loading mask register
3455                                 in QImode, HImode, SImode.  */
3456   {6, 6, 6},            /* cost of storing mask register
3457                                 in QImode, HImode, SImode.  */
3458   2,                            /* cost of moving mask register.  */
3459   /* End of register allocator costs.  */
3460   },
3461
3462   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
3463   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction.  */
3464   COSTS_N_INSNS (1),                    /* variable shift costs.  */
3465   COSTS_N_INSNS (1),                    /* constant shift costs.  */
3466   {COSTS_N_INSNS (2),                   /* cost of starting multiply for QI.  */
3467    COSTS_N_INSNS (3),                   /*                               HI.  */
3468    COSTS_N_INSNS (3),                   /*                               SI.  */
3469    COSTS_N_INSNS (12),                  /*                               DI.  */
3470    COSTS_N_INSNS (14)},         /*                               other.  */
3471   0,                            /* cost of multiply per each bit set.  */
3472   {COSTS_N_INSNS (22),                  /* cost of a divide/mod for QI.  */
3473    COSTS_N_INSNS (24),                  /*                          HI.  */
3474    COSTS_N_INSNS (24),                  /*                          SI.  */
3475    COSTS_N_INSNS (150),                 /*                          DI.  */
3476    COSTS_N_INSNS (152)},                /*                          other.  */
3477   COSTS_N_INSNS (1),                    /* cost of movsx.  */
3478   COSTS_N_INSNS (1),                    /* cost of movzx.  */
3479   8,                                    /* "large" insn.  */
3480   17,                                   /* MOVE_RATIO.  */
3481   6,                                    /* CLEAR_RATIO.  */
3482   {6, 6, 6},                            /* cost of loading integer registers
3483                                            in QImode, HImode and SImode.
3484                                            Relative to reg-reg move (2).  */
3485   {6, 6, 6},                    /* cost of storing integer registers.  */
3486   {6, 6, 6, 10, 15},                    /* cost of loading SSE register
3487                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3488   {6, 6, 6, 10, 15},                    /* cost of storing SSE register
3489                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3490   {6, 6, 6, 10, 15},                    /* cost of unaligned loads.  */
3491   {6, 6, 6, 10, 15},                    /* cost of unaligned stores.  */
3492   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3493   6,                            /* cost of moving SSE register to integer.  */
3494   18, 6,                                /* Gather load static, per_elt.  */
3495   18, 6,                                /* Gather store static, per_elt.  */
3496   32,                                   /* size of l1 cache.  */
3497   4096,                                 /* size of l2 cache.  */
3498   64,                                   /* size of prefetch block.  */
3499   /* Lujiazui processors never drop prefetches, like AMD processors.  */
3500   100,                                  /* number of parallel prefetches.  */
3501   3,                                    /* Branch cost.  */
3502   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3503   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
3504   COSTS_N_INSNS (22),                   /* cost of FDIV instruction.  */
3505   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
3506   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
3507   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
3508
3509   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3510   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3511   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
3512   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
3513   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
3514   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
3515   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
3516   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
3517   COSTS_N_INSNS (32),                   /* cost of SQRTSS instruction.  */
3518   COSTS_N_INSNS (60),                   /* cost of SQRTSD instruction.  */
3519   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
3520   lujiazui_memcpy,
3521   lujiazui_memset,
3522   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
3523   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
3524   "16:11:8",                            /* Loop alignment.  */
3525   "16:11:8",                            /* Jump alignment.  */
3526   "0:0:8",                              /* Label alignment.  */
3527   "16",                                 /* Func alignment.  */
3528   4,                                    /* Small unroll limit.  */
3529   2,                                    /* Small unroll factor.  */
3530 };
3531
3532 /* yongfeng_cost should produce code tuned for ZHAOXIN yongfeng CPU.  */
/* memcpy expansion strategies referenced by yongfeng_cost below.  Two
   entries, each a default algorithm (libcall) followed by
   {threshold, algorithm, flag} triples ending in a -1 threshold.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs definition.  */
3533 static stringop_algs yongfeng_memcpy[2] = {
3534   {libcall, {{6, unrolled_loop, true}, {256, unrolled_loop, false},
3535                          {-1, libcall, false}}},
3536   {libcall, {{8, loop, false}, {512, unrolled_loop, false},
3537                          {-1, libcall, false}}}};
/* memset expansion strategies referenced by yongfeng_cost below.  Two
   entries, each a default algorithm (libcall) followed by
   {threshold, algorithm, flag} triples ending in a -1 threshold.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs definition.  */
3538 static stringop_algs yongfeng_memset[2] = {
3539   {libcall, {{6, loop_1_byte, false}, {128, loop, false},
3540                          {-1, libcall, false}}},
3541   {libcall, {{2, rep_prefix_4_byte, false}, {64, loop, false},
3542                          {1024, vector_loop, false},
3543                          {-1, libcall, false}}}};
/* Cost table yongfeng_cost.  The leading nested brace group holds the
   register allocator costs; the entries after it give per-instruction
   costs, load/store and cache parameters, the
   yongfeng_memcpy/yongfeng_memset string-operation strategies, branch
   costs, alignment strings and unroll limits.  All initializers are
   positional, so their order must match the field order of
   struct processor_costs.  */
3544 static const
3545 struct processor_costs yongfeng_cost = {
3546   {
3547   /* Start of register allocator costs.  integer->integer move cost is 2.  */
3548   8,                            /* cost for loading QImode using movzbl.  */
3549   {8, 8, 8},                    /* cost of loading integer registers
3550                                            in QImode, HImode and SImode.
3551                                            Relative to reg-reg move (2).  */
3552   {8, 8, 8},                    /* cost of storing integer registers.  */
3553   2,                                    /* cost of reg,reg fld/fst.  */
3554   {8, 8, 8},                    /* cost of loading fp registers
3555                                 in SFmode, DFmode and XFmode.  */
3556   {8, 8, 8},                    /* cost of storing fp registers
3557                                 in SFmode, DFmode and XFmode.  */
3558   2,                            /* cost of moving MMX register.  */
3559   {8, 8},                       /* cost of loading MMX registers
3560                                 in SImode and DImode.  */
3561   {8, 8},                       /* cost of storing MMX registers
3562                                 in SImode and DImode.  */
3563   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3564   {8, 8, 8, 10, 15},    /* cost of loading SSE registers
3565                                 in 32,64,128,256 and 512-bit.  */
3566   {8, 8, 8, 10, 15},    /* cost of storing SSE registers
3567                                 in 32,64,128,256 and 512-bit.  */
3568   8, 8,                         /* SSE->integer and integer->SSE moves.  */
3569   8, 8,                         /* mask->integer and integer->mask moves.  */
3570   {8, 8, 8},            /* cost of loading mask register
3571                                 in QImode, HImode, SImode.  */
3572   {8, 8, 8},            /* cost of storing mask register
3573                                 in QImode, HImode, SImode.  */
3574   2,                            /* cost of moving mask register.  */
3575   /* End of register allocator costs.  */
3576   },
3577
3578   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
3579   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
3580   COSTS_N_INSNS (1),                    /* variable shift costs.  */
3581   COSTS_N_INSNS (1),                    /* constant shift costs.  */
3582   {COSTS_N_INSNS (2),                   /* cost of starting multiply for QI.  */
3583    COSTS_N_INSNS (3),                   /*                               HI.  */
3584    COSTS_N_INSNS (2),                   /*                               SI.  */
3585    COSTS_N_INSNS (2),                   /*                               DI.  */
3586    COSTS_N_INSNS (3)},          /*                               other.  */
3587   0,                            /* cost of multiply per each bit set.  */
3588   {COSTS_N_INSNS (8),                   /* cost of a divide/mod for QI.  */
3589    COSTS_N_INSNS (9),                   /*                          HI.  */
3590    COSTS_N_INSNS (8),                   /*                          SI.  */
3591    COSTS_N_INSNS (41),                  /*                          DI.  */
3592    COSTS_N_INSNS (41)},         /*                          other.  */
3593   COSTS_N_INSNS (1),                    /* cost of movsx.  */
3594   COSTS_N_INSNS (1),                    /* cost of movzx.  */
3595   8,                                    /* "large" insn.  */
3596   17,                                   /* MOVE_RATIO.  */
3597   6,                                    /* CLEAR_RATIO.  */
3598   {8, 8, 8},                            /* cost of loading integer registers
3599                                            in QImode, HImode and SImode.
3600                                            Relative to reg-reg move (2).  */
3601   {8, 8, 8},                    /* cost of storing integer registers.  */
3602   {8, 8, 8, 12, 15},                    /* cost of loading SSE register
3603                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3604   {8, 8, 8, 12, 15},                    /* cost of storing SSE register
3605                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3606   {8, 8, 8, 12, 15},                    /* cost of unaligned loads.  */
3607   {8, 8, 8, 12, 15},                    /* cost of unaligned stores.  */
3608   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3609   8,                            /* cost of moving SSE register to integer.  */
3610   18, 6,                                /* Gather load static, per_elt.  */
3611   18, 6,                                /* Gather store static, per_elt.  */
3612   32,                                   /* size of l1 cache.  */
3613   256,                                  /* size of l2 cache.  */
3614   64,                                   /* size of prefetch block.  */
3615   12,                                   /* number of parallel prefetches.  */
3616   3,                                    /* Branch cost.  */
3617   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3618   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
3619   COSTS_N_INSNS (14),                   /* cost of FDIV instruction.  */
3620   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
3621   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
3622   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
3623
3624   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3625   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3626   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
3627   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
3628   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
3629   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
3630   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
3631   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
3632   COSTS_N_INSNS (20),                   /* cost of SQRTSS instruction.  */
3633   COSTS_N_INSNS (35),                   /* cost of SQRTSD instruction.  */
3634   4, 4, 4, 4,                           /* reassoc int, fp, vec_int, vec_fp.  */
3635   yongfeng_memcpy,
3636   yongfeng_memset,
3637   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
3638   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
3639   "16:11:8",                            /* Loop alignment.  */
3640   "16:11:8",                            /* Jump alignment.  */
3641   "0:0:8",                              /* Label alignment.  */
3642   "16",                                 /* Func alignment.  */
3643   4,                                    /* Small unroll limit.  */
3644   2,                                    /* Small unroll factor.  */
3645 };
3646
3647 /* shijidadao_cost should produce code tuned for ZHAOXIN shijidadao CPU.  */
/* memcpy expansion strategies for the shijidadao tuning.  Two entries,
   each a default algorithm (libcall) followed by
   {threshold, algorithm, flag} triples ending in a -1 threshold.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs definition.  */
3648 static stringop_algs shijidadao_memcpy[2] = {
3649   {libcall, {{8, unrolled_loop, true}, {256, unrolled_loop, false},
3650                          {-1, libcall, false}}},
3651   {libcall, {{10, loop, true}, {256, unrolled_loop, false},
3652                          {-1, libcall, false}}}};
/* memset expansion strategies for the shijidadao tuning.  Two entries,
   each a default algorithm (libcall) followed by
   {threshold, algorithm, flag} triples ending in a -1 threshold.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs definition.  */
3653 static stringop_algs shijidadao_memset[2] = {
3654   {libcall, {{4, loop, true}, {128, unrolled_loop, false},
3655                          {-1, libcall, false}}},
3656   {libcall, {{1, rep_prefix_4_byte, false}, {14, loop, true},
3657                          {1024, vector_loop, false},
3658                          {-1, libcall, false}}}};
3659 static const
3660 struct processor_costs shijidadao_cost = {
3661   {
3662   /* Start of register allocator costs.  integer->integer move cost is 2.  */
3663   8,                            /* cost for loading QImode using movzbl.  */
3664   {8, 8, 8},                    /* cost of loading integer registers
3665                                            in QImode, HImode and SImode.
3666                                            Relative to reg-reg move (2).  */
3667   {8, 8, 8},                    /* cost of storing integer registers.  */
3668   2,                                    /* cost of reg,reg fld/fst.  */
3669   {8, 8, 8},                    /* cost of loading fp registers
3670                                 in SFmode, DFmode and XFmode.  */
3671   {8, 8, 8},                    /* cost of storing fp registers
3672                                 in SFmode, DFmode and XFmode.  */
3673   2,                            /* cost of moving MMX register.  */
3674   {8, 8},                       /* cost of loading MMX registers
3675                                 in SImode and DImode.  */
3676   {8, 8},                       /* cost of storing MMX registers
3677                                 in SImode and DImode.  */
3678   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3679   {8, 8, 8, 10, 15},    /* cost of loading SSE registers
3680                                 in 32,64,128,256 and 512-bit.  */
3681   {8, 8, 8, 10, 15},    /* cost of storing SSE registers
3682                                 in 32,64,128,256 and 512-bit.  */
3683   8, 8,                         /* SSE->integer and integer->SSE moves.  */
3684   8, 8,                         /* mask->integer and integer->mask moves.  */
3685   {8, 8, 8},            /* cost of loading mask register
3686                                 in QImode, HImode, SImode.  */
3687   {8, 8, 8},            /* cost if storing mask register
3688                                 in QImode, HImode, SImode.  */
3689   2,                            /* cost of moving mask register.  */
3690   /* End of register allocator costs.  */
3691   },
3692
3693   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
3694   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
3695   COSTS_N_INSNS (1),                    /* variable shift costs.  */
3696   COSTS_N_INSNS (1),                    /* constant shift costs.  */
3697   {COSTS_N_INSNS (2),                   /* cost of starting multiply for QI.  */
3698    COSTS_N_INSNS (3),                   /*                               HI.  */
3699    COSTS_N_INSNS (2),                   /*                               SI.  */
3700    COSTS_N_INSNS (2),                   /*                               DI.  */
3701    COSTS_N_INSNS (3)},          /*                               other.  */
3702   0,                            /* cost of multiply per each bit set.  */
3703   {COSTS_N_INSNS (9),                   /* cost of a divide/mod for QI.  */
3704    COSTS_N_INSNS (10),                  /*                          HI.  */
3705    COSTS_N_INSNS (9),                   /*                          SI.  */
3706    COSTS_N_INSNS (50),                  /*                          DI.  */
3707    COSTS_N_INSNS (50)},         /*                          other.  */
3708   COSTS_N_INSNS (1),                    /* cost of movsx.  */
3709   COSTS_N_INSNS (1),                    /* cost of movzx.  */
3710   8,                                    /* "large" insn.  */
3711   17,                                   /* MOVE_RATIO.  */
3712   6,                                    /* CLEAR_RATIO.  */
3713   {8, 8, 8},                            /* cost of loading integer registers
3714                                            in QImode, HImode and SImode.
3715                                            Relative to reg-reg move (2).  */
3716   {8, 8, 8},                    /* cost of storing integer registers.  */
3717   {8, 8, 8, 12, 15},                    /* cost of loading SSE register
3718                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3719   {8, 8, 8, 12, 15},                    /* cost of storing SSE register
3720                                 in 32bit, 64bit, 128bit, 256bit and 512bit.  */
3721   {8, 8, 8, 12, 15},                    /* cost of unaligned loads.  */
3722   {8, 8, 8, 12, 15},                    /* cost of unaligned storess.  */
3723   2, 3, 4,                      /* cost of moving XMM,YMM,ZMM register.  */
3724   8,                            /* cost of moving SSE register to integer.  */
3725   18, 6,                                /* Gather load static, per_elt.  */
3726   18, 6,                                /* Gather store static, per_elt.  */
3727   32,                                   /* size of l1 cache.  */
3728   256,                                  /* size of l2 cache.  */
3729   64,                                   /* size of prefetch block.  */
3730   12,                                   /* number of parallel prefetches.  */
3731   3,                                    /* Branch cost.  */
3732   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3733   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
3734   COSTS_N_INSNS (13),                   /* cost of FDIV instruction.  */
3735   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
3736   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
3737   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
3738
3739   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3740   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3741   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
3742   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
3743   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
3744   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
3745   COSTS_N_INSNS (11),                   /* cost of DIVSS instruction.  */
3746   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
3747   COSTS_N_INSNS (11),                   /* cost of SQRTSS instruction.  */
3748   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
3749   4, 4, 4, 4,                           /* reassoc int, fp, vec_int, vec_fp.  */
3750   shijidadao_memcpy,
3751   shijidadao_memset,
3752   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
3753   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
3754   "16:11:8",                            /* Loop alignment.  */
3755   "16:11:8",                            /* Jump alignment.  */
3756   "0:0:8",                              /* Label alignment.  */
3757   "16",                         /* Func alignment.  */
3758   4,                                    /* Small unroll limit.  */
3759   2,                                    /* Small unroll factor.  */
3760 };
3761
3762
3763
3764 /* Generic should produce code tuned for Core-i7 (and newer chips)
3765    and btver1 (and newer chips).  */
3766
/* memcpy expansion strategies used by generic_cost.  Both entries start
   with libcall and then list: loop up to 32, a rep-prefixed move up to
   8192, and libcall for -1.  Entry 0 uses rep_prefix_4_byte, entry 1 uses
   rep_prefix_8_byte.
   NOTE(review): presumably the leading algorithm covers unknown sizes,
   each triple is {max size, algorithm, noalign}, -1 means "no upper
   bound", and entries 0/1 are the 32-bit/64-bit variants -- confirm
   against the definition of stringop_algs.  */
3767 static stringop_algs generic_memcpy[2] = {
3768   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3769              {-1, libcall, false}}},
3770   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3771              {-1, libcall, false}}}};
/* memset expansion strategies used by generic_cost.  The contents mirror
   generic_memcpy: libcall first, then loop up to 32, a rep-prefixed store
   up to 8192 (rep_prefix_4_byte in entry 0, rep_prefix_8_byte in entry 1)
   and libcall for -1.
   NOTE(review): presumably entries 0/1 are the 32-bit/64-bit variants and
   -1 means "no upper bound" -- confirm against stringop_algs.  */
3772 static stringop_algs generic_memset[2] = {
3773   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3774              {-1, libcall, false}}},
3775   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3776              {-1, libcall, false}}}};
/* Cost table for generic tuning.  The initializer is positional: every
   value is matched to a field of struct processor_costs purely by its
   order, and the trailing comment on each line names the field it sets.
   The first braced group holds the register allocator costs; the
   remaining entries are instruction costs, cache/prefetch parameters,
   the string operation tables and alignment settings.  Do not reorder
   or insert entries without updating the struct definition.  */
3777 static const
3778 struct processor_costs generic_cost = {
3779   {
3780   /* Start of register allocator costs.  integer->integer move cost is 2. */
3781   6,                                 /* cost for loading QImode using movzbl */
3782   {6, 6, 6},                            /* cost of loading integer registers
3783                                            in QImode, HImode and SImode.
3784                                            Relative to reg-reg move (2).  */
3785   {6, 6, 6},                            /* cost of storing integer registers */
3786   4,                                    /* cost of reg,reg fld/fst */
3787   {6, 6, 12},                           /* cost of loading fp registers
3788                                            in SFmode, DFmode and XFmode */
3789   {6, 6, 12},                           /* cost of storing fp registers
3790                                            in SFmode, DFmode and XFmode */
3791   2,                                    /* cost of moving MMX register */
3792   {6, 6},                               /* cost of loading MMX registers
3793                                            in SImode and DImode */
3794   {6, 6},                               /* cost of storing MMX registers
3795                                            in SImode and DImode */
3796   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
3797   {6, 6, 6, 10, 15},                    /* cost of loading SSE registers
3798                                            in 32,64,128,256 and 512-bit */
3799   {6, 6, 6, 10, 15},                    /* cost of storing SSE registers
3800                                            in 32,64,128,256 and 512-bit */
3801   6, 6,                         /* SSE->integer and integer->SSE moves */
3802   6, 6,                         /* mask->integer and integer->mask moves */
3803   {6, 6, 6},                            /* cost of loading mask register
3804                                            in QImode, HImode, SImode.  */
3805   {6, 6, 6},                    /* cost of storing mask register
3806                                            in QImode, HImode, SImode.  */
3807   2,                                    /* cost of moving mask register.  */
3808   /* End of register allocator costs.  */
3809   },
3810
3811   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3812   /* Setting cost to 2 makes our current implementation of synth_mult result in
3813      use of unnecessary temporary registers causing regression on several
3814      SPECfp benchmarks.  */
3815   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
3816   COSTS_N_INSNS (1),                    /* variable shift costs */
3817   COSTS_N_INSNS (1),                    /* constant shift costs */
3818   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3819    COSTS_N_INSNS (3),                   /*                               HI */
3820    COSTS_N_INSNS (3),                   /*                               SI */
3821    COSTS_N_INSNS (3),                   /*                               DI */
3822    COSTS_N_INSNS (4)},                  /*                            other */
3823   0,                                    /* cost of multiply per each bit set */
3824   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI */
3825    COSTS_N_INSNS (22),                  /*                          HI */
3826    COSTS_N_INSNS (30),                  /*                          SI */
3827    COSTS_N_INSNS (74),                  /*                          DI */
3828    COSTS_N_INSNS (74)},                 /*                          other */
3829   COSTS_N_INSNS (1),                    /* cost of movsx */
3830   COSTS_N_INSNS (1),                    /* cost of movzx */
3831   8,                                    /* "large" insn */
3832   17,                                   /* MOVE_RATIO */
3833   6,                                    /* CLEAR_RATIO */
3834   {6, 6, 6},                            /* cost of loading integer registers
3835                                            in QImode, HImode and SImode.
3836                                            Relative to reg-reg move (2).  */
3837   {6, 6, 6},                            /* cost of storing integer registers */
3838   {6, 6, 6, 10, 15},                    /* cost of loading SSE register
3839                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3840   {6, 6, 6, 10, 15},                    /* cost of storing SSE register
3841                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3842   {6, 6, 6, 10, 15},                    /* cost of unaligned loads.  */
3843   {6, 6, 6, 10, 15},                    /* cost of unaligned stores.  */
3844   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
3845   6,                                    /* cost of moving SSE register to integer.  */
3846   18, 6,                                /* Gather load static, per_elt.  */
3847   18, 6,                                /* Gather store static, per_elt.  */
3848   32,                                   /* size of l1 cache.  */
3849   512,                                  /* size of l2 cache.  */
3850   64,                                   /* size of prefetch block */
3851   6,                                    /* number of parallel prefetches */
3852   /* Benchmarks show large regressions on K8 sixtrack benchmark when this
3853      value is increased to perhaps more appropriate value of 5.  */
3854   3,                                    /* Branch cost */
3855   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3856   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
3857   COSTS_N_INSNS (17),                   /* cost of FDIV instruction.  */
3858   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
3859   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
3860   COSTS_N_INSNS (14),                   /* cost of FSQRT instruction.  */
3861
3862   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3863   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3864   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
3865   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
3866   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
3867   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
3868   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
3869   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
3870   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
3871   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
3872   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
3873   generic_memcpy,
3874   generic_memset,
3875   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
3876   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
3877   "16",                                 /* Loop alignment.  */
3878   "16:11:8",                            /* Jump alignment.  */
3879   "0:0:8",                              /* Label alignment.  */
3880   "16",                                 /* Func alignment.  */
3881   4,                                    /* Small unroll limit.  */
3882   2,                                    /* Small unroll factor.  */
3883 };
3884
3885 /* core_cost should produce code tuned for the Core family of CPUs.  */
/* memcpy expansion strategies used by core_cost.  Entry 0 lists
   rep_prefix_4_byte up to 1024 and libcall for -1; entry 1 lists loop up
   to 24, rep_prefix_8_byte up to 128 and libcall for -1.  Every sized
   step has its third field set to true; only the -1 fallback uses false.
   NOTE(review): presumably the leading libcall covers unknown sizes, the
   third field is the noalign flag, and entries 0/1 are the 32-bit/64-bit
   variants -- confirm against the definition of stringop_algs.  */
3886 static stringop_algs core_memcpy[2] = {
3887   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
3888   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
3889              {-1, libcall, false}}}};
/* memset expansion strategies used by core_cost.  Entry 0 lists
   loop_1_byte up to 6, loop up to 24, rep_prefix_4_byte up to 8192 and
   libcall for -1; entry 1 lists loop up to 24, rep_prefix_8_byte up to
   512 and libcall for -1.  Every sized step has its third field set to
   true; only the -1 fallback uses false.
   NOTE(review): presumably the third field is the noalign flag and
   entries 0/1 are the 32-bit/64-bit variants -- confirm against the
   definition of stringop_algs.  */
3890 static stringop_algs core_memset[2] = {
3891   {libcall, {{6, loop_1_byte, true},
3892              {24, loop, true},
3893              {8192, rep_prefix_4_byte, true},
3894              {-1, libcall, false}}},
3895   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
3896              {-1, libcall, false}}}};
3897
/* Cost table for tuning for the Core family of CPUs.  The initializer is
   positional: every value is matched to a field of struct processor_costs
   purely by its order, and the trailing comment on each line names the
   field it sets.  The first braced group holds the register allocator
   costs; the remaining entries are instruction costs, cache/prefetch
   parameters, the string operation tables (core_memcpy, core_memset) and
   alignment settings.  Do not reorder or insert entries without updating
   the struct definition.  */
3898 static const
3899 struct processor_costs core_cost = {
3900   {
3901   /* Start of register allocator costs.  integer->integer move cost is 2. */
3902   6,                                 /* cost for loading QImode using movzbl */
3903   {4, 4, 4},                            /* cost of loading integer registers
3904                                            in QImode, HImode and SImode.
3905                                            Relative to reg-reg move (2).  */
3906   {6, 6, 6},                            /* cost of storing integer registers */
3907   2,                                    /* cost of reg,reg fld/fst */
3908   {6, 6, 8},                            /* cost of loading fp registers
3909                                            in SFmode, DFmode and XFmode */
3910   {6, 6, 10},                           /* cost of storing fp registers
3911                                            in SFmode, DFmode and XFmode */
3912   2,                                    /* cost of moving MMX register */
3913   {6, 6},                               /* cost of loading MMX registers
3914                                            in SImode and DImode */
3915   {6, 6},                               /* cost of storing MMX registers
3916                                            in SImode and DImode */
3917   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
3918   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
3919                                            in 32,64,128,256 and 512-bit */
3920   {6, 6, 6, 6, 12},                     /* cost of storing SSE registers
3921                                            in 32,64,128,256 and 512-bit */
3922   6, 6,                         /* SSE->integer and integer->SSE moves */
3923   6, 6,                         /* mask->integer and integer->mask moves */
3924   {4, 4, 4},                            /* cost of loading mask register
3925                                            in QImode, HImode, SImode.  */
3926   {6, 6, 6},                            /* cost of storing mask register
3927                                            in QImode, HImode, SImode.  */
3928   2,                                    /* cost of moving mask register.  */
3929   /* End of register allocator costs.  */
3930   },
3931
3932   COSTS_N_INSNS (1),                    /* cost of an add instruction */
3933   /* On all chips taken into consideration lea is 2 cycles and more.  With
3934      this cost however our current implementation of synth_mult results in
3935      use of unnecessary temporary registers causing regression on several
3936      SPECfp benchmarks.  */
3937   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
3938   COSTS_N_INSNS (1),                    /* variable shift costs */
3939   COSTS_N_INSNS (1),                    /* constant shift costs */
3940   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
3941    COSTS_N_INSNS (4),                   /*                               HI */
3942    COSTS_N_INSNS (3),                   /*                               SI */
3943    /* Here we tune for Sandybridge or newer.  */
3944    COSTS_N_INSNS (3),                   /*                               DI */
3945    COSTS_N_INSNS (3)},                  /*                            other */
3946   0,                                    /* cost of multiply per each bit set */
3947   /* Expanding div/mod currently doesn't consider parallelism. So the cost
3948      model is not realistic. We compensate by increasing the latencies a bit.  */
3949   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
3950    COSTS_N_INSNS (11),                  /*                          HI */
3951    COSTS_N_INSNS (14),                  /*                          SI */
3952    COSTS_N_INSNS (81),                  /*                          DI */
3953    COSTS_N_INSNS (81)},                 /*                          other */
3954   COSTS_N_INSNS (1),                    /* cost of movsx */
3955   COSTS_N_INSNS (1),                    /* cost of movzx */
3956   8,                                    /* "large" insn */
3957   17,                                   /* MOVE_RATIO */
3958   6,                                    /* CLEAR_RATIO */
3959   {4, 4, 4},                            /* cost of loading integer registers
3960                                            in QImode, HImode and SImode.
3961                                            Relative to reg-reg move (2).  */
3962   {6, 6, 6},                            /* cost of storing integer registers */
3963   {6, 6, 6, 6, 12},                     /* cost of loading SSE register
3964                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3965   {6, 6, 6, 6, 12},                     /* cost of storing SSE register
3966                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
3967   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
3968   {6, 6, 6, 6, 12},                     /* cost of unaligned stores.  */
3969   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
3970   2,                                    /* cost of moving SSE register to integer.  */
3971   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
3972      rec. throughput 6.
3973      So 5 uops statically and one uop per load.
         NOTE(review): the sentence above names VGATHERDPD twice with
         different figures; the second mnemonic is presumably meant to be
         VGATHERDPS -- confirm against the instruction tables.  */
3974   10, 6,                                /* Gather load static, per_elt.  */
3975   10, 6,                                /* Gather store static, per_elt.  */
3976   64,                                   /* size of l1 cache.  */
3977   512,                                  /* size of l2 cache.  */
3978   64,                                   /* size of prefetch block */
3979   6,                                    /* number of parallel prefetches */
3980   /* FIXME perhaps more appropriate value is 5.  */
3981   3,                                    /* Branch cost */
3982   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
3983   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
3984   /* 10-24 (presumably the FDIV latency range; the upper bound is the
        value used below -- TODO confirm).  */
3985   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
3986   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
3987   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
3988   COSTS_N_INSNS (23),                   /* cost of FSQRT instruction.  */
3989
3990   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
3991   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
3992   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
3993   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
3994   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
3995   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
3996   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
3997   COSTS_N_INSNS (32),                   /* cost of DIVSD instruction.  */
3998   COSTS_N_INSNS (30),                   /* cost of SQRTSS instruction.  */
3999   COSTS_N_INSNS (58),                   /* cost of SQRTSD instruction.  */
4000   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
4001   core_memcpy,
4002   core_memset,
4003   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
4004   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
4005   "16:11:8",                            /* Loop alignment.  */
4006   "16:11:8",                            /* Jump alignment.  */
4007   "0:0:8",                              /* Label alignment.  */
4008   "16",                                 /* Func alignment.  */
4009   4,                                    /* Small unroll limit.  */
4010   2,                                    /* Small unroll factor.  */
4011 };
4012