gcc/config/i386/x86-tune-costs.h

   1 /* Costs of operations of individual x86 CPUs.
   2    Copyright (C) 1988-2017 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3, or (at your option)
   9 any later version.
  10
  11 GCC is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 Under Section 7 of GPL version 3, you are granted additional
  17 permissions described in the GCC Runtime Library Exception, version
  18 3.1, as published by the Free Software Foundation.
  19
  20 You should have received a copy of the GNU General Public License and
  21 a copy of the GCC Runtime Library Exception along with this program;
  22 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 <http://www.gnu.org/licenses/>.  */
  24 /* Processor costs (relative to an add) */
  25 /* Size-based cost of N bytes of code.  We assume COSTS_N_INSNS is defined
        as (N)*4 and an addition is 2 bytes, so that COSTS_N_BYTES (2) equals
        COSTS_N_INSNS (1).  */
  26 #define COSTS_N_BYTES(N) ((N) * 2)
  27
  28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}} /* placeholder stringop_algs initializer that names only libcall */
  29
  30 static stringop_algs ix86_size_memcpy[2] = {  /* memcpy algorithm table referenced by ix86_size_cost */
  31   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},  /* both entries name only rep_prefix_1_byte */
  32   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};  /* NOTE(review): presumably [0] is for 32-bit and [1] for 64-bit code -- confirm */
  33 static stringop_algs ix86_size_memset[2] = {  /* memset algorithm table referenced by ix86_size_cost */
  34   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},  /* both entries name only rep_prefix_1_byte */
  35   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};  /* NOTE(review): presumably [0] is for 32-bit and [1] for 64-bit code -- confirm */
  36
  37 const
  38 struct processor_costs ix86_size_cost = {/* costs for tuning for size; most entries use COSTS_N_BYTES instead of COSTS_N_INSNS */
  39   COSTS_N_BYTES (2),                    /* cost of an add instruction */
  40   COSTS_N_BYTES (3),                    /* cost of a lea instruction */
  41   COSTS_N_BYTES (2),                    /* variable shift costs */
  42   COSTS_N_BYTES (3),                    /* constant shift costs */
  43   {COSTS_N_BYTES (3),                   /* cost of starting multiply for QI */
  44    COSTS_N_BYTES (3),                   /*                               HI */
  45    COSTS_N_BYTES (3),                   /*                               SI */
  46    COSTS_N_BYTES (3),                   /*                               DI */
  47    COSTS_N_BYTES (5)},                  /*                            other */
  48   0,                                    /* cost of multiply per each bit set */
  49   {COSTS_N_BYTES (3),                   /* cost of a divide/mod for QI */
  50    COSTS_N_BYTES (3),                   /*                          HI */
  51    COSTS_N_BYTES (3),                   /*                          SI */
  52    COSTS_N_BYTES (3),                   /*                          DI */
  53    COSTS_N_BYTES (5)},                  /*                          other */
  54   COSTS_N_BYTES (3),                    /* cost of movsx */
  55   COSTS_N_BYTES (3),                    /* cost of movzx */
  56   0,                                    /* "large" insn */
  57   2,                                    /* MOVE_RATIO */
  58
  59   /* All move costs are relative to integer->integer move times 2. */
  60   2,                                 /* cost for loading QImode using movzbl */
  61   {2, 2, 2},                            /* cost of loading integer registers
  62                                            in QImode, HImode and SImode.
  63                                            Relative to reg-reg move (2).  */
  64   {2, 2, 2},                            /* cost of storing integer registers */
  65   2,                                    /* cost of reg,reg fld/fst */
  66   {2, 2, 2},                            /* cost of loading fp registers
  67                                            in SFmode, DFmode and XFmode */
  68   {2, 2, 2},                            /* cost of storing fp registers
  69                                            in SFmode, DFmode and XFmode */
  70   3,                                    /* cost of moving MMX register */
  71   {3, 3},                               /* cost of loading MMX registers
  72                                            in SImode and DImode */
  73   {3, 3},                               /* cost of storing MMX registers
  74                                            in SImode and DImode */
  75   3, 3, 3,                              /* cost of moving XMM,YMM,ZMM register */
  76   {3, 3, 3, 3, 3},                      /* cost of loading SSE registers
  77                                            in 32,64,128,256 and 512-bit */
  78   {3, 3, 3, 3, 3},                      /* cost of unaligned SSE load; five entries,
  79                                            presumably 32,64,128,256 and 512-bit -- TODO confirm */
  80   {3, 3, 3, 3, 3},                      /* cost of storing SSE registers
  81                                            in 32,64,128,256 and 512-bit */
  82   {3, 3, 3, 3, 3},                              /* cost of unaligned SSE store; five entries,
  83                                            presumably 32,64,128,256 and 512-bit -- TODO confirm */
  84   3, 3,                                 /* SSE->integer and integer->SSE moves */
  85   5, 0,                                 /* Gather load static, per_elt.  */
  86   5, 0,                                 /* Gather store static, per_elt.  */
  87   0,                                    /* size of l1 cache  */
  88   0,                                    /* size of l2 cache  */
  89   0,                                    /* size of prefetch block */
  90   0,                                    /* number of parallel prefetches */
  91   2,                                    /* Branch cost */
  92   COSTS_N_BYTES (2),                    /* cost of FADD and FSUB insns.  */
  93   COSTS_N_BYTES (2),                    /* cost of FMUL instruction.  */
  94   COSTS_N_BYTES (2),                    /* cost of FDIV instruction.  */
  95   COSTS_N_BYTES (2),                    /* cost of FABS instruction.  */
  96   COSTS_N_BYTES (2),                    /* cost of FCHS instruction.  */
  97   COSTS_N_BYTES (2),                    /* cost of FSQRT instruction.  */
  98
  99   COSTS_N_BYTES (2),                    /* cost of cheap SSE instruction.  */
 100   COSTS_N_BYTES (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 101   COSTS_N_BYTES (2),                    /* cost of MULSS instruction.  */
 102   COSTS_N_BYTES (2),                    /* cost of MULSD instruction.  */
 103   COSTS_N_BYTES (2),                    /* cost of FMA SS instruction.  */
 104   COSTS_N_BYTES (2),                    /* cost of FMA SD instruction.  */
 105   COSTS_N_BYTES (2),                    /* cost of DIVSS instruction.  */
 106   COSTS_N_BYTES (2),                    /* cost of DIVSD instruction.  */
 107   COSTS_N_BYTES (2),                    /* cost of SQRTSS instruction.  */
 108   COSTS_N_BYTES (2),                    /* cost of SQRTSD instruction.  */
 109   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 110   ix86_size_memcpy,                     /* memcpy algorithm table */
 111   ix86_size_memset,                     /* memset algorithm table */
 112   COSTS_N_BYTES (1),                    /* cond_taken_branch_cost.  */
 113   COSTS_N_BYTES (1),                    /* cond_not_taken_branch_cost.  */
 114 };
 115
 116 /* Processor costs (relative to an add) */
 117 static stringop_algs i386_memcpy[2] = {  /* memcpy algorithm table referenced by i386_cost */
 118   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},  /* first entry names only rep_prefix_1_byte */
 119   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 120 static stringop_algs i386_memset[2] = {  /* memset algorithm table referenced by i386_cost */
 121   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},  /* first entry names only rep_prefix_1_byte */
 122   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 123
 124 static const
 125 struct processor_costs i386_cost = {    /* 386 specific costs; insn costs use COSTS_N_INSNS */
 126   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 127   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 128   COSTS_N_INSNS (3),                    /* variable shift costs */
 129   COSTS_N_INSNS (2),                    /* constant shift costs */
 130   {COSTS_N_INSNS (6),                   /* cost of starting multiply for QI */
 131    COSTS_N_INSNS (6),                   /*                               HI */
 132    COSTS_N_INSNS (6),                   /*                               SI */
 133    COSTS_N_INSNS (6),                   /*                               DI */
 134    COSTS_N_INSNS (6)},                  /*                            other */
 135   COSTS_N_INSNS (1),                    /* cost of multiply per each bit set */
 136   {COSTS_N_INSNS (23),                  /* cost of a divide/mod for QI */
 137    COSTS_N_INSNS (23),                  /*                          HI */
 138    COSTS_N_INSNS (23),                  /*                          SI */
 139    COSTS_N_INSNS (23),                  /*                          DI */
 140    COSTS_N_INSNS (23)},                 /*                          other */
 141   COSTS_N_INSNS (3),                    /* cost of movsx */
 142   COSTS_N_INSNS (2),                    /* cost of movzx */
 143   15,                                   /* "large" insn */
 144   3,                                    /* MOVE_RATIO */
 145
 146   /* All move costs are relative to integer->integer move times 2 and thus
 147      they are latency*2. */
 148   4,                                 /* cost for loading QImode using movzbl */
 149   {2, 4, 2},                            /* cost of loading integer registers
 150                                            in QImode, HImode and SImode.
 151                                            Relative to reg-reg move (2).  */
 152   {2, 4, 2},                            /* cost of storing integer registers */
 153   2,                                    /* cost of reg,reg fld/fst */
 154   {8, 8, 8},                            /* cost of loading fp registers
 155                                            in SFmode, DFmode and XFmode */
 156   {8, 8, 8},                            /* cost of storing fp registers
 157                                            in SFmode, DFmode and XFmode */
 158   2,                                    /* cost of moving MMX register */
 159   {4, 8},                               /* cost of loading MMX registers
 160                                            in SImode and DImode */
 161   {4, 8},                               /* cost of storing MMX registers
 162                                            in SImode and DImode */
 163   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 164   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 165                                            in 32,64,128,256 and 512-bit */
 166   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 167   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 168                                            in 32,64,128,256 and 512-bit */
 169   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 170   3, 3,                                 /* SSE->integer and integer->SSE moves */
 171   4, 4,                                 /* Gather load static, per_elt.  */
 172   4, 4,                                 /* Gather store static, per_elt.  */
 173   0,                                    /* size of l1 cache  */
 174   0,                                    /* size of l2 cache  */
 175   0,                                    /* size of prefetch block */
 176   0,                                    /* number of parallel prefetches */
 177   1,                                    /* Branch cost */
 178   COSTS_N_INSNS (23),                   /* cost of FADD and FSUB insns.  */
 179   COSTS_N_INSNS (27),                   /* cost of FMUL instruction.  */
 180   COSTS_N_INSNS (88),                   /* cost of FDIV instruction.  */
 181   COSTS_N_INSNS (22),                   /* cost of FABS instruction.  */
 182   COSTS_N_INSNS (24),                   /* cost of FCHS instruction.  */
 183   COSTS_N_INSNS (122),                  /* cost of FSQRT instruction.  */
 184
 185   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 186   COSTS_N_INSNS (23),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
 187   COSTS_N_INSNS (27),                   /* cost of MULSS instruction.  */
 188   COSTS_N_INSNS (27),                   /* cost of MULSD instruction.  */
 189   COSTS_N_INSNS (27),                   /* cost of FMA SS instruction.  */
 190   COSTS_N_INSNS (27),                   /* cost of FMA SD instruction.  */
 191   COSTS_N_INSNS (88),                   /* cost of DIVSS instruction.  */
 192   COSTS_N_INSNS (88),                   /* cost of DIVSD instruction.  */
 193   COSTS_N_INSNS (122),                  /* cost of SQRTSS instruction.  */
 194   COSTS_N_INSNS (122),                  /* cost of SQRTSD instruction.  */
 195   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 196   i386_memcpy,                          /* memcpy algorithm table */
 197   i386_memset,                          /* memset algorithm table */
 198   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 199   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 200 };
 201
 202 static stringop_algs i486_memcpy[2] = {  /* memcpy algorithm table referenced by i486_cost */
 203   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},  /* first entry names only rep_prefix_4_byte */
 204   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 205 static stringop_algs i486_memset[2] = {  /* memset algorithm table referenced by i486_cost */
 206   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},  /* first entry names only rep_prefix_4_byte */
 207   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 208
 209 static const
 210 struct processor_costs i486_cost = {    /* 486 specific costs; insn costs use COSTS_N_INSNS */
 211   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 212   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 213   COSTS_N_INSNS (3),                    /* variable shift costs */
 214   COSTS_N_INSNS (2),                    /* constant shift costs */
 215   {COSTS_N_INSNS (12),                  /* cost of starting multiply for QI */
 216    COSTS_N_INSNS (12),                  /*                               HI */
 217    COSTS_N_INSNS (12),                  /*                               SI */
 218    COSTS_N_INSNS (12),                  /*                               DI */
 219    COSTS_N_INSNS (12)},                 /*                            other */
 220   1,                                    /* cost of multiply per each bit set.
                                           NOTE(review): plain 1 here, whereas i386_cost
                                           uses COSTS_N_INSNS (1) -- confirm intended.  */
 221   {COSTS_N_INSNS (40),                  /* cost of a divide/mod for QI */
 222    COSTS_N_INSNS (40),                  /*                          HI */
 223    COSTS_N_INSNS (40),                  /*                          SI */
 224    COSTS_N_INSNS (40),                  /*                          DI */
 225    COSTS_N_INSNS (40)},                 /*                          other */
 226   COSTS_N_INSNS (3),                    /* cost of movsx */
 227   COSTS_N_INSNS (2),                    /* cost of movzx */
 228   15,                                   /* "large" insn */
 229   3,                                    /* MOVE_RATIO */
 230
 231   /* All move costs are relative to integer->integer move times 2 and thus
 232      they are latency*2. */
 233   4,                                 /* cost for loading QImode using movzbl */
 234   {2, 4, 2},                            /* cost of loading integer registers
 235                                            in QImode, HImode and SImode.
 236                                            Relative to reg-reg move (2).  */
 237   {2, 4, 2},                            /* cost of storing integer registers */
 238   2,                                    /* cost of reg,reg fld/fst */
 239   {8, 8, 8},                            /* cost of loading fp registers
 240                                            in SFmode, DFmode and XFmode */
 241   {8, 8, 8},                            /* cost of storing fp registers
 242                                            in SFmode, DFmode and XFmode */
 243   2,                                    /* cost of moving MMX register */
 244   {4, 8},                               /* cost of loading MMX registers
 245                                            in SImode and DImode */
 246   {4, 8},                               /* cost of storing MMX registers
 247                                            in SImode and DImode */
 248   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 249   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 250                                            in 32,64,128,256 and 512-bit */
 251   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 252   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 253                                            in 32,64,128,256 and 512-bit */
 254   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 255   3, 3,                                 /* SSE->integer and integer->SSE moves */
 256   4, 4,                                 /* Gather load static, per_elt.  */
 257   4, 4,                                 /* Gather store static, per_elt.  */
 258   4,                                    /* size of l1 cache.  486 has 8kB cache
 259                                            shared for code and data, so 4kB is
 260                                            not really precise.  */
 261   4,                                    /* size of l2 cache; set to the same value as l1 */
 262   0,                                    /* size of prefetch block */
 263   0,                                    /* number of parallel prefetches */
 264   1,                                    /* Branch cost */
 265   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
 266   COSTS_N_INSNS (16),                   /* cost of FMUL instruction.  */
 267   COSTS_N_INSNS (73),                   /* cost of FDIV instruction.  */
 268   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
 269   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
 270   COSTS_N_INSNS (83),                   /* cost of FSQRT instruction.  */
 271
 272   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 273   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 274   COSTS_N_INSNS (16),                   /* cost of MULSS instruction.  */
 275   COSTS_N_INSNS (16),                   /* cost of MULSD instruction.  */
 276   COSTS_N_INSNS (16),                   /* cost of FMA SS instruction.  */
 277   COSTS_N_INSNS (16),                   /* cost of FMA SD instruction.  */
 278   COSTS_N_INSNS (73),                   /* cost of DIVSS instruction.  */
 279   COSTS_N_INSNS (74),                   /* cost of DIVSD instruction.
                                           NOTE(review): 74, while DIVSS and FDIV
                                           above are 73 -- confirm intended.  */
 280   COSTS_N_INSNS (83),                   /* cost of SQRTSS instruction.  */
 281   COSTS_N_INSNS (83),                   /* cost of SQRTSD instruction.  */
 282   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 283   i486_memcpy,                          /* memcpy algorithm table */
 284   i486_memset,                          /* memset algorithm table */
 285   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 286   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 287 };
 288
 289 static stringop_algs pentium_memcpy[2] = {  /* memcpy algorithm table referenced by pentium_cost and lakemont_cost */
 290   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* presumably rep_prefix_4_byte up to size 256, libcall beyond -- TODO confirm */
 291   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 292 static stringop_algs pentium_memset[2] = {  /* memset algorithm table referenced by pentium_cost and lakemont_cost */
 293   {libcall, {{-1, rep_prefix_4_byte, false}}},  /* single size entry naming rep_prefix_4_byte; leading algorithm is libcall */
 294   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 295
 296 static const
 297 struct processor_costs pentium_cost = {    /* Pentium costs; insn costs use COSTS_N_INSNS */
 298   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 299   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 300   COSTS_N_INSNS (4),                    /* variable shift costs */
 301   COSTS_N_INSNS (1),                    /* constant shift costs */
 302   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 303    COSTS_N_INSNS (11),                  /*                               HI */
 304    COSTS_N_INSNS (11),                  /*                               SI */
 305    COSTS_N_INSNS (11),                  /*                               DI */
 306    COSTS_N_INSNS (11)},                 /*                            other */
 307   0,                                    /* cost of multiply per each bit set */
 308   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 309    COSTS_N_INSNS (25),                  /*                          HI */
 310    COSTS_N_INSNS (25),                  /*                          SI */
 311    COSTS_N_INSNS (25),                  /*                          DI */
 312    COSTS_N_INSNS (25)},                 /*                          other */
 313   COSTS_N_INSNS (3),                    /* cost of movsx */
 314   COSTS_N_INSNS (2),                    /* cost of movzx */
 315   8,                                    /* "large" insn */
 316   6,                                    /* MOVE_RATIO */
 317
 318   /* All move costs are relative to integer->integer move times 2 and thus
 319      they are latency*2. */
 320   6,                                 /* cost for loading QImode using movzbl */
 321   {2, 4, 2},                            /* cost of loading integer registers
 322                                            in QImode, HImode and SImode.
 323                                            Relative to reg-reg move (2).  */
 324   {2, 4, 2},                            /* cost of storing integer registers */
 325   2,                                    /* cost of reg,reg fld/fst */
 326   {2, 2, 6},                            /* cost of loading fp registers
 327                                            in SFmode, DFmode and XFmode */
 328   {4, 4, 6},                            /* cost of storing fp registers
 329                                            in SFmode, DFmode and XFmode */
 330   8,                                    /* cost of moving MMX register */
 331   {8, 8},                               /* cost of loading MMX registers
 332                                            in SImode and DImode */
 333   {8, 8},                               /* cost of storing MMX registers
 334                                            in SImode and DImode */
 335   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 336   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 337                                            in 32,64,128,256 and 512-bit */
 338   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 339   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 340                                            in 32,64,128,256 and 512-bit */
 341   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 342   3, 3,                                 /* SSE->integer and integer->SSE moves */
 343   4, 4,                                 /* Gather load static, per_elt.  */
 344   4, 4,                                 /* Gather store static, per_elt.  */
 345   8,                                    /* size of l1 cache.  */
 346   8,                                    /* size of l2 cache; set to the same value as l1 */
 347   0,                                    /* size of prefetch block */
 348   0,                                    /* number of parallel prefetches */
 349   2,                                    /* Branch cost */
 350   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 351   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 352   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 353   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 354   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 355   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 356
 357   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 358   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 359   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
 360   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
 361   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
 362   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
 363   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
 364   COSTS_N_INSNS (39),                   /* cost of DIVSD instruction.  */
 365   COSTS_N_INSNS (70),                   /* cost of SQRTSS instruction.  */
 366   COSTS_N_INSNS (70),                   /* cost of SQRTSD instruction.  */
 367   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 368   pentium_memcpy,                       /* memcpy algorithm table */
 369   pentium_memset,                       /* memset algorithm table */
 370   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 371   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 372 };
 373
 374 static const
 375 struct processor_costs lakemont_cost = {    /* Lakemont costs; reuses the pentium_memcpy/pentium_memset tables */
 376   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 377   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction; one unit more than an add */
 378   COSTS_N_INSNS (1),                    /* variable shift costs */
 379   COSTS_N_INSNS (1),                    /* constant shift costs */
 380   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 381    COSTS_N_INSNS (11),                  /*                               HI */
 382    COSTS_N_INSNS (11),                  /*                               SI */
 383    COSTS_N_INSNS (11),                  /*                               DI */
 384    COSTS_N_INSNS (11)},                 /*                            other */
 385   0,                                    /* cost of multiply per each bit set */
 386   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 387    COSTS_N_INSNS (25),                  /*                          HI */
 388    COSTS_N_INSNS (25),                  /*                          SI */
 389    COSTS_N_INSNS (25),                  /*                          DI */
 390    COSTS_N_INSNS (25)},                 /*                          other */
 391   COSTS_N_INSNS (3),                    /* cost of movsx */
 392   COSTS_N_INSNS (2),                    /* cost of movzx */
 393   8,                                    /* "large" insn */
 394   17,                                   /* MOVE_RATIO */
 395
 396   /* All move costs are relative to integer->integer move times 2 and thus
 397      they are latency*2. */
 398   6,                                 /* cost for loading QImode using movzbl */
 399   {2, 4, 2},                            /* cost of loading integer registers
 400                                            in QImode, HImode and SImode.
 401                                            Relative to reg-reg move (2).  */
 402   {2, 4, 2},                            /* cost of storing integer registers */
 403   2,                                    /* cost of reg,reg fld/fst */
 404   {2, 2, 6},                            /* cost of loading fp registers
 405                                            in SFmode, DFmode and XFmode */
 406   {4, 4, 6},                            /* cost of storing fp registers
 407                                            in SFmode, DFmode and XFmode */
 408   8,                                    /* cost of moving MMX register */
 409   {8, 8},                               /* cost of loading MMX registers
 410                                            in SImode and DImode */
 411   {8, 8},                               /* cost of storing MMX registers
 412                                            in SImode and DImode */
 413   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 414   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 415                                            in 32,64,128,256 and 512-bit */
 416   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 417   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 418                                            in 32,64,128,256 and 512-bit */
 419   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 420   3, 3,                                 /* SSE->integer and integer->SSE moves */
 421   4, 4,                                 /* Gather load static, per_elt.  */
 422   4, 4,                                 /* Gather store static, per_elt.  */
 423   8,                                    /* size of l1 cache.  */
 424   8,                                    /* size of l2 cache; set to the same value as l1 */
 425   0,                                    /* size of prefetch block */
 426   0,                                    /* number of parallel prefetches */
 427   2,                                    /* Branch cost */
 428   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 429   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 430   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 431   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 432   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 433   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 434
 435   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 436   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 437   COSTS_N_INSNS (5),                    /* cost of MULSS instruction.  */
 438   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
 439   COSTS_N_INSNS (10),                   /* cost of FMA SS instruction.  */
 440   COSTS_N_INSNS (10),                   /* cost of FMA SD instruction.  */
 441   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
 442   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
 443   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 444   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
 445   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 446   pentium_memcpy,                       /* memcpy algorithm table (shared with pentium_cost) */
 447   pentium_memset,                       /* memset algorithm table (shared with pentium_cost) */
 448   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 449   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 450 };
 451
 452 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 453    (we ensure the alignment).  For small blocks an inline loop is still a
 454    noticeable win; for bigger blocks either rep movsl or rep movsb is the
 455    way to go.  Rep movsb apparently has a more expensive startup time in the
 456    CPU, but after 4K the difference is down in the noise.  */
 457 static stringop_algs pentiumpro_memcpy[2] = {
 458   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},  /* size limits ascend: 128, 1024, 8192, then -1 */
 459                        {8192, rep_prefix_4_byte, false},
 460                        {-1, rep_prefix_1_byte, false}}},  /* presumably -1 covers every larger size -- TODO confirm */
 461   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 462 static stringop_algs pentiumpro_memset[2] = {  /* PentiumPro memset algorithm table */
 463   {rep_prefix_4_byte, {{1024, unrolled_loop, false},  /* size limits ascend: 1024, 8192, then -1 */
 464                        {8192, rep_prefix_4_byte, false},
 465                        {-1, libcall, false}}},  /* presumably -1 covers every larger size -- TODO confirm */
 466   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 467 static const
 468 struct processor_costs pentiumpro_cost = {  /* operation costs for PentiumPro */
 469   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 470   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 471   COSTS_N_INSNS (1),                    /* variable shift costs */
 472   COSTS_N_INSNS (1),                    /* constant shift costs */
 473   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
 474    COSTS_N_INSNS (4),                   /*                               HI */
 475    COSTS_N_INSNS (4),                   /*                               SI */
 476    COSTS_N_INSNS (4),                   /*                               DI */
 477    COSTS_N_INSNS (4)},                  /*                            other */
 478   0,                                    /* cost of multiply per each bit set */
 479   {COSTS_N_INSNS (17),                  /* cost of a divide/mod for QI */
 480    COSTS_N_INSNS (17),                  /*                          HI */
 481    COSTS_N_INSNS (17),                  /*                          SI */
 482    COSTS_N_INSNS (17),                  /*                          DI */
 483    COSTS_N_INSNS (17)},                 /*                          other */
 484   COSTS_N_INSNS (1),                    /* cost of movsx */
 485   COSTS_N_INSNS (1),                    /* cost of movzx */
 486   8,                                    /* "large" insn */
 487   6,                                    /* MOVE_RATIO */
 488
 489   /* All move costs are relative to integer->integer move times 2 and thus
 490      they are latency*2. */
 491   2,                                 /* cost for loading QImode using movzbl */
 492   {4, 4, 4},                            /* cost of loading integer registers
 493                                            in QImode, HImode and SImode.
 494                                            Relative to reg-reg move (2).  */
 495   {2, 2, 2},                            /* cost of storing integer registers */
 496   2,                                    /* cost of reg,reg fld/fst */
 497   {2, 2, 6},                            /* cost of loading fp registers
 498                                            in SFmode, DFmode and XFmode */
 499   {4, 4, 6},                            /* cost of storing fp registers
 500                                            in SFmode, DFmode and XFmode */
 501   2,                                    /* cost of moving MMX register */
 502   {2, 2},                               /* cost of loading MMX registers
 503                                            in SImode and DImode */
 504   {2, 2},                               /* cost of storing MMX registers
 505                                            in SImode and DImode */
 506   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 507   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 508                                            in 32,64,128,256 and 512-bit */
 509   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 510   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 511                                            in 32,64,128,256 and 512-bit */
 512   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 513   3, 3,                                 /* SSE->integer and integer->SSE moves */
 514   4, 4,                                 /* Gather load static, per_elt.  */
 515   4, 4,                                 /* Gather store static, per_elt.  */
 516   8,                                    /* size of l1 cache.  */
 517   256,                                  /* size of l2 cache.  */
 518   32,                                   /* size of prefetch block */
 519   6,                                    /* number of parallel prefetches */
 520   2,                                    /* Branch cost */
 521   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 522   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
 523   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 524   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 525   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 526   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 527
 528   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 529   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 530   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 531   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 532   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
 533   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
 534   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
 535   COSTS_N_INSNS (18),                   /* cost of DIVSD instruction.  */
 536   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 537   COSTS_N_INSNS (31),                   /* cost of SQRTSD instruction.  */
 538   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 539   pentiumpro_memcpy,
 540   pentiumpro_memset,
 541   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 542   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 543 };
 544
 545 static stringop_algs geode_memcpy[2] = {  /* Geode memcpy algorithm tables.  */
 546   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 547   DUMMY_STRINGOP_ALGS};  /* [1]: libcall only (see DUMMY_STRINGOP_ALGS).  */
 548 static stringop_algs geode_memset[2] = {  /* Geode memset algorithm tables.  */
 549   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 550   DUMMY_STRINGOP_ALGS};  /* [1]: libcall only (see DUMMY_STRINGOP_ALGS).  */
 551 static const
 552 struct processor_costs geode_cost = {  /* operation costs for Geode */
 553   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 554   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 555   COSTS_N_INSNS (2),                    /* variable shift costs */
 556   COSTS_N_INSNS (1),                    /* constant shift costs */
 557   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 558    COSTS_N_INSNS (4),                   /*                               HI */
 559    COSTS_N_INSNS (7),                   /*                               SI */
 560    COSTS_N_INSNS (7),                   /*                               DI */
 561    COSTS_N_INSNS (7)},                  /*                            other */
 562   0,                                    /* cost of multiply per each bit set */
 563   {COSTS_N_INSNS (15),                  /* cost of a divide/mod for QI */
 564    COSTS_N_INSNS (23),                  /*                          HI */
 565    COSTS_N_INSNS (39),                  /*                          SI */
 566    COSTS_N_INSNS (39),                  /*                          DI */
 567    COSTS_N_INSNS (39)},                 /*                          other */
 568   COSTS_N_INSNS (1),                    /* cost of movsx */
 569   COSTS_N_INSNS (1),                    /* cost of movzx */
 570   8,                                    /* "large" insn */
 571   4,                                    /* MOVE_RATIO */
 572
 573   /* All move costs are relative to integer->integer move times 2 and thus
 574      they are latency*2. */
 575   2,                                 /* cost for loading QImode using movzbl */
 576   {2, 2, 2},                            /* cost of loading integer registers
 577                                            in QImode, HImode and SImode.
 578                                            Relative to reg-reg move (2).  */
 579   {2, 2, 2},                            /* cost of storing integer registers */
 580   2,                                    /* cost of reg,reg fld/fst */
 581   {2, 2, 2},                            /* cost of loading fp registers
 582                                            in SFmode, DFmode and XFmode */
 583   {4, 6, 6},                            /* cost of storing fp registers
 584                                            in SFmode, DFmode and XFmode */
 585
 586   2,                                    /* cost of moving MMX register */
 587   {2, 2},                               /* cost of loading MMX registers
 588                                            in SImode and DImode */
 589   {2, 2},                               /* cost of storing MMX registers
 590                                            in SImode and DImode */
 591   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 592   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 593                                            in 32,64,128,256 and 512-bit */
 594   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 595   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 596                                            in 32,64,128,256 and 512-bit */
 597   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 598   6, 6,                                 /* SSE->integer and integer->SSE moves */
 599   2, 2,                                 /* Gather load static, per_elt.  */
 600   2, 2,                                 /* Gather store static, per_elt.  */
 601   64,                                   /* size of l1 cache.  */
 602   128,                                  /* size of l2 cache.  */
 603   32,                                   /* size of prefetch block */
 604   1,                                    /* number of parallel prefetches */
 605   1,                                    /* Branch cost */
 606   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
 607   COSTS_N_INSNS (11),                   /* cost of FMUL instruction.  */
 608   COSTS_N_INSNS (47),                   /* cost of FDIV instruction.  */
 609   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 610   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 611   COSTS_N_INSNS (54),                   /* cost of FSQRT instruction.  */
 612
 613   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 614   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 615   COSTS_N_INSNS (11),                   /* cost of MULSS instruction.  */
 616   COSTS_N_INSNS (11),                   /* cost of MULSD instruction.  */
 617   COSTS_N_INSNS (17),                   /* cost of FMA SS instruction.  */
 618   COSTS_N_INSNS (17),                   /* cost of FMA SD instruction.  */
 619   COSTS_N_INSNS (47),                   /* cost of DIVSS instruction.  */
 620   COSTS_N_INSNS (47),                   /* cost of DIVSD instruction.  */
 621   COSTS_N_INSNS (54),                   /* cost of SQRTSS instruction.  */
 622   COSTS_N_INSNS (54),                   /* cost of SQRTSD instruction.  */
 623   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 624   geode_memcpy,
 625   geode_memset,
 626   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 627   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 628 };
 629
 630 static stringop_algs k6_memcpy[2] = {  /* K6 memcpy algorithm tables.  */
 631   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 632   DUMMY_STRINGOP_ALGS};  /* [1]: libcall only (see DUMMY_STRINGOP_ALGS).  */
 633 static stringop_algs k6_memset[2] = {  /* K6 memset algorithm tables.  */
 634   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 635   DUMMY_STRINGOP_ALGS};  /* [1]: libcall only (see DUMMY_STRINGOP_ALGS).  */
 636 static const
 637 struct processor_costs k6_cost = {  /* operation costs for K6 */
 638   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 639   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 640   COSTS_N_INSNS (1),                    /* variable shift costs */
 641   COSTS_N_INSNS (1),                    /* constant shift costs */
 642   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 643    COSTS_N_INSNS (3),                   /*                               HI */
 644    COSTS_N_INSNS (3),                   /*                               SI */
 645    COSTS_N_INSNS (3),                   /*                               DI */
 646    COSTS_N_INSNS (3)},                  /*                            other */
 647   0,                                    /* cost of multiply per each bit set */
 648   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 649    COSTS_N_INSNS (18),                  /*                          HI */
 650    COSTS_N_INSNS (18),                  /*                          SI */
 651    COSTS_N_INSNS (18),                  /*                          DI */
 652    COSTS_N_INSNS (18)},                 /*                          other */
 653   COSTS_N_INSNS (2),                    /* cost of movsx */
 654   COSTS_N_INSNS (2),                    /* cost of movzx */
 655   8,                                    /* "large" insn */
 656   4,                                    /* MOVE_RATIO */
 657
 658   /* All move costs are relative to integer->integer move times 2 and thus
 659      they are latency*2. */
 660   3,                                 /* cost for loading QImode using movzbl */
 661   {4, 5, 4},                            /* cost of loading integer registers
 662                                            in QImode, HImode and SImode.
 663                                            Relative to reg-reg move (2).  */
 664   {2, 3, 2},                            /* cost of storing integer registers */
 665   4,                                    /* cost of reg,reg fld/fst */
 666   {6, 6, 6},                            /* cost of loading fp registers
 667                                            in SFmode, DFmode and XFmode */
 668   {4, 4, 4},                            /* cost of storing fp registers
 669                                            in SFmode, DFmode and XFmode */
 670   2,                                    /* cost of moving MMX register */
 671   {2, 2},                               /* cost of loading MMX registers
 672                                            in SImode and DImode */
 673   {2, 2},                               /* cost of storing MMX registers
 674                                            in SImode and DImode */
 675   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 676   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 677                                            in 32,64,128,256 and 512-bit */
 678   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 679   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 680                                            in 32,64,128,256 and 512-bit */
 681   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 682   6, 6,                                 /* SSE->integer and integer->SSE moves */
 683   2, 2,                                 /* Gather load static, per_elt.  */
 684   2, 2,                                 /* Gather store static, per_elt.  */
 685   32,                                   /* size of l1 cache.  */
 686   32,                                   /* size of l2 cache.  Some models
 687                                            have integrated l2 cache, but
 688                                            optimizing for k6 is not important
 689                                            enough to worry about that.  */
 690   32,                                   /* size of prefetch block */
 691   1,                                    /* number of parallel prefetches */
 692   1,                                    /* Branch cost */
 693   COSTS_N_INSNS (2),                    /* cost of FADD and FSUB insns.  */
 694   COSTS_N_INSNS (2),                    /* cost of FMUL instruction.  */
 695   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 696   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 697   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 698   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 699
 700   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 701   COSTS_N_INSNS (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 702   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
 703   COSTS_N_INSNS (2),                    /* cost of MULSD instruction.  */
 704   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
 705   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
 706   COSTS_N_INSNS (56),                   /* cost of DIVSS instruction.  */
 707   COSTS_N_INSNS (56),                   /* cost of DIVSD instruction.  */
 708   COSTS_N_INSNS (56),                   /* cost of SQRTSS instruction.  */
 709   COSTS_N_INSNS (56),                   /* cost of SQRTSD instruction.  */
 710   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 711   k6_memcpy,
 712   k6_memset,
 713   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 714   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 715 };
 716
 717 /* For some reason, Athlon deals better with the REP prefix (relative to
 718    loops) than K8 does.  Alignment becomes important after 8 bytes for memcpy
 719    and 128 bytes for memset.  */
 720 static stringop_algs athlon_memcpy[2] = {  /* Athlon memcpy algorithm tables.  */
 721   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 722   DUMMY_STRINGOP_ALGS};  /* [1]: libcall only (see DUMMY_STRINGOP_ALGS).  */
 723 static stringop_algs athlon_memset[2] = {  /* Athlon memset algorithm tables.  */
 724   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 725   DUMMY_STRINGOP_ALGS};  /* [1]: libcall only (see DUMMY_STRINGOP_ALGS).  */
 726 static const
 727 struct processor_costs athlon_cost = {  /* operation costs for Athlon */
 728   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 729   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 730   COSTS_N_INSNS (1),                    /* variable shift costs */
 731   COSTS_N_INSNS (1),                    /* constant shift costs */
 732   {COSTS_N_INSNS (5),                   /* cost of starting multiply for QI */
 733    COSTS_N_INSNS (5),                   /*                               HI */
 734    COSTS_N_INSNS (5),                   /*                               SI */
 735    COSTS_N_INSNS (5),                   /*                               DI */
 736    COSTS_N_INSNS (5)},                  /*                            other */
 737   0,                                    /* cost of multiply per each bit set */
 738   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 739    COSTS_N_INSNS (26),                  /*                          HI */
 740    COSTS_N_INSNS (42),                  /*                          SI */
 741    COSTS_N_INSNS (74),                  /*                          DI */
 742    COSTS_N_INSNS (74)},                 /*                          other */
 743   COSTS_N_INSNS (1),                    /* cost of movsx */
 744   COSTS_N_INSNS (1),                    /* cost of movzx */
 745   8,                                    /* "large" insn */
 746   9,                                    /* MOVE_RATIO */
 747
 748   /* All move costs are relative to integer->integer move times 2 and thus
 749      they are latency*2. */
 750   4,                                 /* cost for loading QImode using movzbl */
 751   {3, 4, 3},                            /* cost of loading integer registers
 752                                            in QImode, HImode and SImode.
 753                                            Relative to reg-reg move (2).  */
 754   {3, 4, 3},                            /* cost of storing integer registers */
 755   4,                                    /* cost of reg,reg fld/fst */
 756   {4, 4, 12},                           /* cost of loading fp registers
 757                                            in SFmode, DFmode and XFmode */
 758   {6, 6, 8},                            /* cost of storing fp registers
 759                                            in SFmode, DFmode and XFmode */
 760   2,                                    /* cost of moving MMX register */
 761   {4, 4},                               /* cost of loading MMX registers
 762                                            in SImode and DImode */
 763   {4, 4},                               /* cost of storing MMX registers
 764                                            in SImode and DImode */
 765   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 766   {4, 4, 6, 12, 24},                    /* cost of loading SSE registers
 767                                            in 32,64,128,256 and 512-bit */
 768   {4, 4, 6, 12, 24},                    /* cost of unaligned loads.  */
 769   {4, 4, 5, 10, 20},                    /* cost of storing SSE registers
 770                                            in 32,64,128,256 and 512-bit */
 771   {4, 4, 5, 10, 20},                    /* cost of unaligned stores.  */
 772   5, 5,                                 /* SSE->integer and integer->SSE moves */
 773   4, 4,                                 /* Gather load static, per_elt.  */
 774   4, 4,                                 /* Gather store static, per_elt.  */
 775   64,                                   /* size of l1 cache.  */
 776   256,                                  /* size of l2 cache.  */
 777   64,                                   /* size of prefetch block */
 778   6,                                    /* number of parallel prefetches */
 779   5,                                    /* Branch cost */
 780   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 781   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 782   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
 783   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 784   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 785   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 786
 787   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
 788   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 789   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 790   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 791   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
 792   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
 793   /* 11-16; presumably the DIVSS latency range in cycles -- confirm.  */
 794   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
 795   COSTS_N_INSNS (24),                   /* cost of DIVSD instruction.  */
 796   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
 797   COSTS_N_INSNS (19),                   /* cost of SQRTSD instruction.  */
 798   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 799   athlon_memcpy,
 800   athlon_memset,
 801   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 802   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 803 };
 804
 805 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
 806    small blocks it is better to use a loop.  For large blocks, a libcall can
 807    do non-temporal accesses and beat inline code considerably.  */
 808 static stringop_algs k8_memcpy[2] = {  /* K8 memcpy algorithm tables.  */
 809   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
 810              {-1, rep_prefix_4_byte, false}}},
 811   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
 812              {-1, libcall, false}}}};  /* [1] uses 8-byte rep; presumably the 64-bit table -- confirm.  */
 813 static stringop_algs k8_memset[2] = {  /* K8 memset algorithm tables.  */
 814   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 815              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 816   {libcall, {{48, unrolled_loop, false},
 817              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* [1] uses 8-byte rep; presumably the 64-bit table -- confirm.  */
 818 static const
 819 struct processor_costs k8_cost = {  /* operation costs for K8 */
 820   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 821   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 822   COSTS_N_INSNS (1),                    /* variable shift costs */
 823   COSTS_N_INSNS (1),                    /* constant shift costs */
 824   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 825    COSTS_N_INSNS (4),                   /*                               HI */
 826    COSTS_N_INSNS (3),                   /*                               SI */
 827    COSTS_N_INSNS (4),                   /*                               DI */
 828    COSTS_N_INSNS (5)},                  /*                            other */
 829   0,                                    /* cost of multiply per each bit set */
 830   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 831    COSTS_N_INSNS (26),                  /*                          HI */
 832    COSTS_N_INSNS (42),                  /*                          SI */
 833    COSTS_N_INSNS (74),                  /*                          DI */
 834    COSTS_N_INSNS (74)},                 /*                          other */
 835   COSTS_N_INSNS (1),                    /* cost of movsx */
 836   COSTS_N_INSNS (1),                    /* cost of movzx */
 837   8,                                    /* "large" insn */
 838   9,                                    /* MOVE_RATIO */
 839
 840   /* All move costs are relative to integer->integer move times 2 and thus
 841      they are latency*2. */
 842   4,                                 /* cost for loading QImode using movzbl */
 843   {3, 4, 3},                            /* cost of loading integer registers
 844                                            in QImode, HImode and SImode.
 845                                            Relative to reg-reg move (2).  */
 846   {3, 4, 3},                            /* cost of storing integer registers */
 847   4,                                    /* cost of reg,reg fld/fst */
 848   {4, 4, 12},                           /* cost of loading fp registers
 849                                            in SFmode, DFmode and XFmode */
 850   {6, 6, 8},                            /* cost of storing fp registers
 851                                            in SFmode, DFmode and XFmode */
 852   2,                                    /* cost of moving MMX register */
 853   {3, 3},                               /* cost of loading MMX registers
 854                                            in SImode and DImode */
 855   {4, 4},                               /* cost of storing MMX registers
 856                                            in SImode and DImode */
 857   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 858   {4, 3, 6, 12, 24},                    /* cost of loading SSE registers
 859                                            in 32,64,128,256 and 512-bit */
 860   {4, 3, 6, 12, 24},                    /* cost of unaligned loads.  */
 861   {4, 4, 5, 10, 20},                    /* cost of storing SSE registers
 862                                            in 32,64,128,256 and 512-bit */
 863   {4, 4, 5, 10, 20},                    /* cost of unaligned stores.  */
 864   5, 5,                                 /* SSE->integer and integer->SSE moves */
 865   4, 4,                                 /* Gather load static, per_elt.  */
 866   4, 4,                                 /* Gather store static, per_elt.  */
 867   64,                                   /* size of l1 cache.  */
 868   512,                                  /* size of l2 cache.  */
 869   64,                                   /* size of prefetch block */
 870   /* New AMD processors never drop prefetches; if they cannot be performed
 871      immediately, they are queued.  We set the number of simultaneous
 872      prefetches to a large constant to reflect this (it is probably not a
 873      good idea to leave the number of prefetches unlimited, as their
 874      execution also takes some time).  */
 875   100,                                  /* number of parallel prefetches */
 876   3,                                    /* Branch cost */
 877   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 878   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 879   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
 880   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 881   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 882   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 883
 884   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
 885   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 886   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 887   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 888   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
 889   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
 890   /* 11-16; presumably the DIVSS latency range in cycles -- confirm.  */
 891   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
 892   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
 893   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
 894   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
 895   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 896   k8_memcpy,
 897   k8_memset,
 898   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 899   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
 900 };
 901
 902 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
 903    very small blocks it is better to use a loop.  For large blocks, a libcall
 904    can do non-temporal accesses and beat inline code considerably.  */
 905 static stringop_algs amdfam10_memcpy[2] = {  /* AMDFAM10 memcpy algorithm tables.  */
 906   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
 907              {-1, rep_prefix_4_byte, false}}},
 908   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
 909              {-1, libcall, false}}}};  /* [1] uses 8-byte rep; presumably the 64-bit table -- confirm.  */
 910 static stringop_algs amdfam10_memset[2] = {  /* AMDFAM10 memset algorithm tables.  */
 911   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 912              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 913   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
 914              {-1, libcall, false}}}};  /* [1] uses 8-byte rep; presumably the 64-bit table -- confirm.  */
 915 struct processor_costs amdfam10_cost = {  /* AMDFAM10 cost table; initializers are positional, keep their order.  */
 916   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 917   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 918   COSTS_N_INSNS (1),                    /* variable shift costs */
 919   COSTS_N_INSNS (1),                    /* constant shift costs */
 920   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 921    COSTS_N_INSNS (4),                   /*                               HI */
 922    COSTS_N_INSNS (3),                   /*                               SI */
 923    COSTS_N_INSNS (4),                   /*                               DI */
 924    COSTS_N_INSNS (5)},                  /*                            other */
 925   0,                                    /* cost of multiply per each bit set */
 926   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
 927    COSTS_N_INSNS (35),                  /*                          HI */
 928    COSTS_N_INSNS (51),                  /*                          SI */
 929    COSTS_N_INSNS (83),                  /*                          DI */
 930    COSTS_N_INSNS (83)},                 /*                          other */
 931   COSTS_N_INSNS (1),                    /* cost of movsx */
 932   COSTS_N_INSNS (1),                    /* cost of movzx */
 933   8,                                    /* "large" insn */
 934   9,                                    /* MOVE_RATIO */
 935
 936   /* All move costs are relative to integer->integer move times 2 and thus
 937      they are latency*2.  These are plain numbers, not COSTS_N_INSNS.  */
 938   4,                                 /* cost for loading QImode using movzbl */
 939   {3, 4, 3},                            /* cost of loading integer registers
 940                                            in QImode, HImode and SImode.
 941                                            Relative to reg-reg move (2).  */
 942   {3, 4, 3},                            /* cost of storing integer registers */
 943   4,                                    /* cost of reg,reg fld/fst */
 944   {4, 4, 12},                           /* cost of loading fp registers
 945                                            in SFmode, DFmode and XFmode */
 946   {6, 6, 8},                            /* cost of storing fp registers
 947                                            in SFmode, DFmode and XFmode */
 948   2,                                    /* cost of moving MMX register */
 949   {3, 3},                               /* cost of loading MMX registers
 950                                            in SImode and DImode */
 951   {4, 4},                               /* cost of storing MMX registers
 952                                            in SImode and DImode */
 953   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 954   {4, 4, 3, 6, 12},                     /* cost of loading SSE registers
 955                                            in 32,64,128,256 and 512-bit */
 956   {4, 4, 3, 7, 12},                     /* cost of unaligned loads.  */
 957   {4, 4, 5, 10, 20},                    /* cost of storing SSE registers
 958                                            in 32,64,128,256 and 512-bit */
 959   {4, 4, 5, 10, 20},                    /* cost of unaligned stores.  */
 960   3, 3,                                 /* SSE->integer and integer->SSE moves */
 961                                         /* On K8:
 962                                             MOVD reg64, xmmreg Double FSTORE 4
 963                                             MOVD reg32, xmmreg Double FSTORE 4
 964                                            On AMDFAM10:
 965                                             MOVD reg64, xmmreg Double FADD 3
 966                                                                1/1  1/1
 967                                             MOVD reg32, xmmreg Double FADD 3
 968                                                                1/1  1/1 */
 969   4, 4,                                 /* Gather load static, per_elt.  */
 970   4, 4,                                 /* Gather store static, per_elt.  */
 971   64,                                   /* size of l1 cache.  */
 972   512,                                  /* size of l2 cache.  */
 973   64,                                   /* size of prefetch block */
 974   /* New AMD processors never drop prefetches; if they cannot be performed
 975      immediately, they are queued.  We set number of simultaneous prefetches
 976      to a large constant to reflect this (it probably is not a good idea not
 977      to limit number of prefetches at all, as their execution also takes some
 978      time).  */
 979   100,                                  /* number of parallel prefetches */
 980   2,                                    /* Branch cost */
 981   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 982   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 983   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
 984   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 985   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 986   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 987
 988   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
 989   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 990   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 991   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 992   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
 993   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
 994   /* DIVSS: 11-16 (presumably a latency range -- TODO confirm); 16 is used.  */
 995   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
 996   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
 997   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
 998   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
 999   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1000   amdfam10_memcpy,                      /* memcpy stringop_algs table defined above.  */
1001   amdfam10_memset,                      /* memset stringop_algs table defined above.  */
1002   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1003   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1004 };  /* NOTE(review): not const, unlike bdver1_cost and bdver2_cost -- confirm this is intended.  */
1005
1006 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but
1007    for very small blocks it is better to use a loop.  For large blocks, a
1008    libcall can do non-temporal accesses and beat inline code considerably.  */
1009 static stringop_algs bdver1_memcpy[2] = {  /* BDVER1 memcpy algorithm table, two entries.  */
1010   {libcall, {{6, loop, false}, {14, unrolled_loop, false},  /* {N, alg, flag}: presumably alg is used up to size N -- TODO confirm.  */
1011              {-1, rep_prefix_4_byte, false}}},  /* -1 presumably means no upper bound.  */
1012   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},  /* 8-byte REP variant; presumably the 64-bit entry -- TODO confirm.  */
1013              {-1, libcall, false}}}};  /* presumably libcall for anything above 8192.  */
1014 static stringop_algs bdver1_memset[2] = {  /* BDVER1 memset algorithm table, two entries.  */
1015   {libcall, {{8, loop, false}, {24, unrolled_loop, false},  /* {N, alg, flag}: presumably alg is used up to size N -- TODO confirm.  */
1016              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* -1 presumably means no upper bound.  */
1017   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},  /* 8-byte REP variant; presumably the 64-bit entry -- TODO confirm.  */
1018              {-1, libcall, false}}}};  /* presumably libcall for anything above 8192.  */
1019
1020 const struct processor_costs bdver1_cost = {  /* BDVER1 cost table; initializers are positional, keep their order.  */
1021   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1022   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1023   COSTS_N_INSNS (1),                    /* variable shift costs */
1024   COSTS_N_INSNS (1),                    /* constant shift costs */
1025   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1026    COSTS_N_INSNS (4),                   /*                               HI */
1027    COSTS_N_INSNS (4),                   /*                               SI */
1028    COSTS_N_INSNS (6),                   /*                               DI */
1029    COSTS_N_INSNS (6)},                  /*                            other */
1030   0,                                    /* cost of multiply per each bit set */
1031   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1032    COSTS_N_INSNS (35),                  /*                          HI */
1033    COSTS_N_INSNS (51),                  /*                          SI */
1034    COSTS_N_INSNS (83),                  /*                          DI */
1035    COSTS_N_INSNS (83)},                 /*                          other */
1036   COSTS_N_INSNS (1),                    /* cost of movsx */
1037   COSTS_N_INSNS (1),                    /* cost of movzx */
1038   8,                                    /* "large" insn */
1039   9,                                    /* MOVE_RATIO */
1040
1041   /* All move costs are relative to integer->integer move times 2 and thus
1042      they are latency*2.  These are plain numbers, not COSTS_N_INSNS.  */
1043   8,                                 /* cost for loading QImode using movzbl */
1044   {8, 8, 8},                            /* cost of loading integer registers
1045                                            in QImode, HImode and SImode.
1046                                            Relative to reg-reg move (2).  */
1047   {8, 8, 8},                            /* cost of storing integer registers */
1048   4,                                    /* cost of reg,reg fld/fst */
1049   {12, 12, 28},                         /* cost of loading fp registers
1050                                            in SFmode, DFmode and XFmode */
1051   {10, 10, 18},                         /* cost of storing fp registers
1052                                            in SFmode, DFmode and XFmode */
1053   4,                                    /* cost of moving MMX register */
1054   {12, 12},                             /* cost of loading MMX registers
1055                                            in SImode and DImode */
1056   {10, 10},                             /* cost of storing MMX registers
1057                                            in SImode and DImode */
1058   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1059   {12, 12, 10, 20, 30},                 /* cost of loading SSE registers
1060                                            in 32,64,128,256 and 512-bit */
1061   {12, 12, 10, 20, 30},                 /* cost of unaligned loads.  */
1062   {10, 10, 10, 20, 30},                 /* cost of storing SSE registers
1063                                            in 32,64,128,256 and 512-bit */
1064   {10, 10, 10, 20, 30},                 /* cost of unaligned stores.  */
1065   16, 20,                               /* SSE->integer and integer->SSE moves */
1066   12, 12,                               /* Gather load static, per_elt.  */
1067   10, 10,                               /* Gather store static, per_elt.  */
1068   16,                                   /* size of l1 cache.  */
1069   2048,                                 /* size of l2 cache.  */
1070   64,                                   /* size of prefetch block */
1071   /* New AMD processors never drop prefetches; if they cannot be performed
1072      immediately, they are queued.  We set number of simultaneous prefetches
1073      to a large constant to reflect this (it probably is not a good idea not
1074      to limit number of prefetches at all, as their execution also takes some
1075      time).  */
1076   100,                                  /* number of parallel prefetches */
1077   2,                                    /* Branch cost */
1078   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1079   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1080   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1081   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1082   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1083   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1084
1085   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1086   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1087   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1088   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1089   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1090   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1091   /* DIVSS: 9-24 (presumably a latency range -- TODO confirm); 24 is used.  */
1092   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1093   /* DIVSD: 9-27 (presumably a latency range -- TODO confirm); 27 is used.  */
1094   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1095   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1096   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1097   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1098   bdver1_memcpy,                        /* memcpy stringop_algs table defined above.  */
1099   bdver1_memset,                        /* memset stringop_algs table defined above.  */
1100   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1101   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1102 };
1103
1104 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but
1105    for very small blocks it is better to use a loop.  For large blocks, a
1106    libcall can do non-temporal accesses and beat inline code considerably.  */
1107
1108 static stringop_algs bdver2_memcpy[2] = {  /* BDVER2 memcpy algorithm table, two entries.  */
1109   {libcall, {{6, loop, false}, {14, unrolled_loop, false},  /* {N, alg, flag}: presumably alg is used up to size N -- TODO confirm.  */
1110              {-1, rep_prefix_4_byte, false}}},  /* -1 presumably means no upper bound.  */
1111   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},  /* 8-byte REP variant; presumably the 64-bit entry -- TODO confirm.  */
1112              {-1, libcall, false}}}};  /* presumably libcall for anything above 8192.  */
1113 static stringop_algs bdver2_memset[2] = {  /* BDVER2 memset algorithm table, two entries.  */
1114   {libcall, {{8, loop, false}, {24, unrolled_loop, false},  /* {N, alg, flag}: presumably alg is used up to size N -- TODO confirm.  */
1115              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* -1 presumably means no upper bound.  */
1116   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},  /* 8-byte REP variant; presumably the 64-bit entry -- TODO confirm.  */
1117              {-1, libcall, false}}}};  /* presumably libcall for anything above 8192.  */
1118
1119 const struct processor_costs bdver2_cost = {  /* BDVER2 cost table; initializers are positional, keep their order.  */
1120   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1121   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1122   COSTS_N_INSNS (1),                    /* variable shift costs */
1123   COSTS_N_INSNS (1),                    /* constant shift costs */
1124   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1125    COSTS_N_INSNS (4),                   /*                               HI */
1126    COSTS_N_INSNS (4),                   /*                               SI */
1127    COSTS_N_INSNS (6),                   /*                               DI */
1128    COSTS_N_INSNS (6)},                  /*                            other */
1129   0,                                    /* cost of multiply per each bit set */
1130   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1131    COSTS_N_INSNS (35),                  /*                          HI */
1132    COSTS_N_INSNS (51),                  /*                          SI */
1133    COSTS_N_INSNS (83),                  /*                          DI */
1134    COSTS_N_INSNS (83)},                 /*                          other */
1135   COSTS_N_INSNS (1),                    /* cost of movsx */
1136   COSTS_N_INSNS (1),                    /* cost of movzx */
1137   8,                                    /* "large" insn */
1138   9,                                    /* MOVE_RATIO */
1139
1140   /* All move costs are relative to integer->integer move times 2 and thus
1141      they are latency*2.  These are plain numbers, not COSTS_N_INSNS.  */
1142   8,                                 /* cost for loading QImode using movzbl */
1143   {8, 8, 8},                            /* cost of loading integer registers
1144                                            in QImode, HImode and SImode.
1145                                            Relative to reg-reg move (2).  */
1146   {8, 8, 8},                            /* cost of storing integer registers */
1147   4,                                    /* cost of reg,reg fld/fst */
1148   {12, 12, 28},                         /* cost of loading fp registers
1149                                            in SFmode, DFmode and XFmode */
1150   {10, 10, 18},                         /* cost of storing fp registers
1151                                            in SFmode, DFmode and XFmode */
1152   4,                                    /* cost of moving MMX register */
1153   {12, 12},                             /* cost of loading MMX registers
1154                                            in SImode and DImode */
1155   {10, 10},                             /* cost of storing MMX registers
1156                                            in SImode and DImode */
1157   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1158   {12, 12, 10, 20, 30},                 /* cost of loading SSE registers
1159                                            in 32,64,128,256 and 512-bit */
1160   {12, 12, 10, 20, 30},                 /* cost of unaligned loads.  */
1161   {10, 10, 10, 20, 30},                 /* cost of storing SSE registers
1162                                            in 32,64,128,256 and 512-bit */
1163   {10, 10, 10, 20, 30},                 /* cost of unaligned stores.  */
1164   16, 20,                               /* SSE->integer and integer->SSE moves */
1165   12, 12,                               /* Gather load static, per_elt.  */
1166   10, 10,                               /* Gather store static, per_elt.  */
1167   16,                                   /* size of l1 cache.  */
1168   2048,                                 /* size of l2 cache.  */
1169   64,                                   /* size of prefetch block */
1170   /* New AMD processors never drop prefetches; if they cannot be performed
1171      immediately, they are queued.  We set number of simultaneous prefetches
1172      to a large constant to reflect this (it probably is not a good idea not
1173      to limit number of prefetches at all, as their execution also takes some
1174      time).  */
1175   100,                                  /* number of parallel prefetches */
1176   2,                                    /* Branch cost */
1177   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1178   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1179   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1180   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1181   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1182   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1183
1184   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1185   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1186   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1187   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1188   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1189   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1190   /* DIVSS: 9-24 (presumably a latency range -- TODO confirm); 24 is used.  */
1191   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1192   /* DIVSD: 9-27 (presumably a latency range -- TODO confirm); 27 is used.  */
1193   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1194   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1195   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1196   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1197   bdver2_memcpy,                        /* memcpy stringop_algs table defined above.  */
1198   bdver2_memset,                        /* memset stringop_algs table defined above.  */
1199   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1200   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1201 };
1202
1203
1204 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but
1205    for very small blocks it is better to use a loop.  For large blocks, a
1206    libcall can do non-temporal accesses and beat inline code considerably.  */
1207 static stringop_algs bdver3_memcpy[2] = {  /* BDVER3 memcpy algorithm table, two entries.  */
1208   {libcall, {{6, loop, false}, {14, unrolled_loop, false},  /* {N, alg, flag}: presumably alg is used up to size N -- TODO confirm.  */
1209              {-1, rep_prefix_4_byte, false}}},  /* -1 presumably means no upper bound.  */
1210   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},  /* 8-byte REP variant; presumably the 64-bit entry -- TODO confirm.  */
1211              {-1, libcall, false}}}};  /* presumably libcall for anything above 8192.  */
1212 static stringop_algs bdver3_memset[2] = {  /* BDVER3 memset algorithm table, two entries.  */
1213   {libcall, {{8, loop, false}, {24, unrolled_loop, false},  /* {N, alg, flag}: presumably alg is used up to size N -- TODO confirm.  */
1214              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* -1 presumably means no upper bound.  */
1215   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},  /* 8-byte REP variant; presumably the 64-bit entry -- TODO confirm.  */
1216              {-1, libcall, false}}}};  /* presumably libcall for anything above 8192.  */
1217 struct processor_costs bdver3_cost = {  /* BDVER3 cost table; initializers are positional, keep their order.  */
1218   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1219   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1220   COSTS_N_INSNS (1),                    /* variable shift costs */
1221   COSTS_N_INSNS (1),                    /* constant shift costs */
1222   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1223    COSTS_N_INSNS (4),                   /*                               HI */
1224    COSTS_N_INSNS (4),                   /*                               SI */
1225    COSTS_N_INSNS (6),                   /*                               DI */
1226    COSTS_N_INSNS (6)},                  /*                            other */
1227   0,                                    /* cost of multiply per each bit set */
1228   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1229    COSTS_N_INSNS (35),                  /*                          HI */
1230    COSTS_N_INSNS (51),                  /*                          SI */
1231    COSTS_N_INSNS (83),                  /*                          DI */
1232    COSTS_N_INSNS (83)},                 /*                          other */
1233   COSTS_N_INSNS (1),                    /* cost of movsx */
1234   COSTS_N_INSNS (1),                    /* cost of movzx */
1235   8,                                    /* "large" insn */
1236   9,                                    /* MOVE_RATIO */
1237
1238   /* All move costs are relative to integer->integer move times 2 and thus
1239      they are latency*2.  These are plain numbers, not COSTS_N_INSNS.  */
1240   8,                                 /* cost for loading QImode using movzbl */
1241   {8, 8, 8},                            /* cost of loading integer registers
1242                                            in QImode, HImode and SImode.
1243                                            Relative to reg-reg move (2).  */
1244   {8, 8, 8},                            /* cost of storing integer registers */
1245   4,                                    /* cost of reg,reg fld/fst */
1246   {12, 12, 28},                         /* cost of loading fp registers
1247                                            in SFmode, DFmode and XFmode */
1248   {10, 10, 18},                         /* cost of storing fp registers
1249                                            in SFmode, DFmode and XFmode */
1250   4,                                    /* cost of moving MMX register */
1251   {12, 12},                             /* cost of loading MMX registers
1252                                            in SImode and DImode */
1253   {10, 10},                             /* cost of storing MMX registers
1254                                            in SImode and DImode */
1255   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1256   {12, 12, 10, 20, 30},                 /* cost of loading SSE registers
1257                                            in 32,64,128,256 and 512-bit */
1258   {12, 12, 10, 20, 30},                 /* cost of unaligned loads.  */
1259   {10, 10, 10, 20, 30},                 /* cost of storing SSE registers
1260                                            in 32,64,128,256 and 512-bit */
1261   {10, 10, 10, 20, 30},                 /* cost of unaligned stores.  */
1262   16, 20,                               /* SSE->integer and integer->SSE moves */
1263   12, 12,                               /* Gather load static, per_elt.  */
1264   10, 10,                               /* Gather store static, per_elt.  */
1265   16,                                   /* size of l1 cache.  */
1266   2048,                                 /* size of l2 cache.  */
1267   64,                                   /* size of prefetch block */
1268   /* New AMD processors never drop prefetches; if they cannot be performed
1269      immediately, they are queued.  We set number of simultaneous prefetches
1270      to a large constant to reflect this (it probably is not a good idea not
1271      to limit number of prefetches at all, as their execution also takes some
1272      time).  */
1273   100,                                  /* number of parallel prefetches */
1274   2,                                    /* Branch cost */
1275   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1276   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1277   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1278   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1279   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1280   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1281
1282   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1283   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1284   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1285   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1286   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1287   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1288   /* DIVSS: 9-24 (presumably a latency range -- TODO confirm); 24 is used.  */
1289   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1290   /* DIVSD: 9-27 (presumably a latency range -- TODO confirm); 27 is used.  */
1291   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1292   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1293   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1294   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1295   bdver3_memcpy,                        /* memcpy stringop_algs table defined above.  */
1296   bdver3_memset,                        /* memset stringop_algs table defined above.  */
1297   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1298   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1299 };  /* NOTE(review): not const, unlike bdver1_cost and bdver2_cost -- confirm this is intended.  */
1300
1301 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but
1302    for very small blocks it is better to use a loop.  For large blocks, a
1303    libcall can do non-temporal accesses and beat inline code considerably.  */
1304 static stringop_algs bdver4_memcpy[2] = {  /* BDVER4 memcpy algorithm table, two entries.  */
1305   {libcall, {{6, loop, false}, {14, unrolled_loop, false},  /* {N, alg, flag}: presumably alg is used up to size N -- TODO confirm.  */
1306              {-1, rep_prefix_4_byte, false}}},  /* -1 presumably means no upper bound.  */
1307   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},  /* 8-byte REP variant; presumably the 64-bit entry -- TODO confirm.  */
1308              {-1, libcall, false}}}};  /* presumably libcall for anything above 8192.  */
1309 static stringop_algs bdver4_memset[2] = {  /* BDVER4 memset algorithm table, two entries.  */
1310   {libcall, {{8, loop, false}, {24, unrolled_loop, false},  /* {N, alg, flag}: presumably alg is used up to size N -- TODO confirm.  */
1311              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* -1 presumably means no upper bound.  */
1312   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},  /* 8-byte REP variant; presumably the 64-bit entry -- TODO confirm.  */
1313              {-1, libcall, false}}}};  /* presumably libcall for anything above 8192.  */
1314 struct processor_costs bdver4_cost = {  /* BDVER4 cost table; initializers are positional, keep their order.  */
1315   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1316   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1317   COSTS_N_INSNS (1),                    /* variable shift costs */
1318   COSTS_N_INSNS (1),                    /* constant shift costs */
1319   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1320    COSTS_N_INSNS (4),                   /*                               HI */
1321    COSTS_N_INSNS (4),                   /*                               SI */
1322    COSTS_N_INSNS (6),                   /*                               DI */
1323    COSTS_N_INSNS (6)},                  /*                            other */
1324   0,                                    /* cost of multiply per each bit set */
1325   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1326    COSTS_N_INSNS (35),                  /*                          HI */
1327    COSTS_N_INSNS (51),                  /*                          SI */
1328    COSTS_N_INSNS (83),                  /*                          DI */
1329    COSTS_N_INSNS (83)},                 /*                          other */
1330   COSTS_N_INSNS (1),                    /* cost of movsx */
1331   COSTS_N_INSNS (1),                    /* cost of movzx */
1332   8,                                    /* "large" insn */
1333   9,                                    /* MOVE_RATIO */
1334
1335   /* All move costs are relative to integer->integer move times 2 and thus
1336      they are latency*2.  These are plain numbers, not COSTS_N_INSNS.  */
1337   8,                                 /* cost for loading QImode using movzbl */
1338   {8, 8, 8},                            /* cost of loading integer registers
1339                                            in QImode, HImode and SImode.
1340                                            Relative to reg-reg move (2).  */
1341   {8, 8, 8},                            /* cost of storing integer registers */
1342   4,                                    /* cost of reg,reg fld/fst */
1343   {12, 12, 28},                         /* cost of loading fp registers
1344                                            in SFmode, DFmode and XFmode */
1345   {10, 10, 18},                         /* cost of storing fp registers
1346                                            in SFmode, DFmode and XFmode */
1347   4,                                    /* cost of moving MMX register */
1348   {12, 12},                             /* cost of loading MMX registers
1349                                            in SImode and DImode */
1350   {10, 10},                             /* cost of storing MMX registers
1351                                            in SImode and DImode */
1352   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1353   {12, 12, 10, 20, 30},                 /* cost of loading SSE registers
1354                                            in 32,64,128,256 and 512-bit */
1355   {12, 12, 10, 20, 30},                 /* cost of unaligned loads.  */
1356   {10, 10, 10, 20, 30},                 /* cost of storing SSE registers
1357                                            in 32,64,128,256 and 512-bit */
1358   {10, 10, 10, 20, 30},                 /* cost of unaligned stores.  */
1359   16, 20,                               /* SSE->integer and integer->SSE moves */
1360   12, 12,                               /* Gather load static, per_elt.  */
1361   10, 10,                               /* Gather store static, per_elt.  */
1362   16,                                   /* size of l1 cache.  */
1363   2048,                                 /* size of l2 cache.  */
1364   64,                                   /* size of prefetch block */
1365   /* New AMD processors never drop prefetches; if they cannot be performed
1366      immediately, they are queued.  We set number of simultaneous prefetches
1367      to a large constant to reflect this (it probably is not a good idea not
1368      to limit number of prefetches at all, as their execution also takes some
1369      time).  */
1370   100,                                  /* number of parallel prefetches */
1371   2,                                    /* Branch cost */
1372   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1373   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1374   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1375   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1376   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1377   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1378
1379   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1380   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1381   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1382   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1383   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1384   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1385   /* DIVSS: 9-24 (presumably a latency range -- TODO confirm); 24 is used.  */
1386   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1387   /* DIVSD: 9-27 (presumably a latency range -- TODO confirm); 27 is used.  */
1388   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1389   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1390   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1391   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1392   bdver4_memcpy,                        /* memcpy stringop_algs table defined above.  */
1393   bdver4_memset,                        /* memset stringop_algs table defined above.  */
1394   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1395   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1396 };  /* NOTE(review): not const, unlike bdver1_cost and bdver2_cost -- confirm this is intended.  */
1397
1398
1399 /*  ZNVER1 has optimized REP instruction for medium sized blocks, but for
1400     very small blocks it is better to use loop.  For large blocks, libcall
1401     can do non-temporal accesses and beat inline considerably.  */
/* memcpy strategy table referenced from znver1_cost.  Each inner
   {bound, algorithm, flag} triple pairs a size bound with the algorithm to
   use: loops for the smallest bounds, then a REP prefix, then libcall.
   Presumably the bound is a byte count, -1 means "no upper bound" and the
   two array elements are for 32-bit and 64-bit code (only the second uses
   rep_prefix_8_byte) -- TODO confirm against the stringop_algs definition.  */
1402 static stringop_algs znver1_memcpy[2] = {
1403   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1404              {-1, rep_prefix_4_byte, false}}},
1405   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1406              {-1, libcall, false}}}};
/* memset strategy table referenced from znver1_cost.  Same layout as
   znver1_memcpy: a list of {bound, algorithm, flag} triples per element,
   ending in libcall for the unbounded (-1) case in both elements.
   Presumably element 0 is for 32-bit and element 1 for 64-bit code
   -- TODO confirm against the stringop_algs definition.  */
1407 static stringop_algs znver1_memset[2] = {
1408   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1409              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1410   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1411              {-1, libcall, false}}}};
/* Operation cost table for znver1.  Entries wrapped in COSTS_N_INSNS are
   instruction costs; the bare integers are move costs (latency*2, see
   below), cache and prefetch parameters, branch cost and reassociation
   widths, as labelled on each line.  String operations use znver1_memcpy
   and znver1_memset.
   NOTE(review): unlike btver1_cost and the other cost tables nearby, this
   object is not declared const -- confirm that this is intentional.  */
1412 struct processor_costs znver1_cost = {
1413   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1414   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1415   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1416   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1417   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1418    COSTS_N_INSNS (3),                   /*                               HI.  */
1419    COSTS_N_INSNS (3),                   /*                               SI.  */
1420    COSTS_N_INSNS (3),                   /*                               DI.  */
1421    COSTS_N_INSNS (3)},                  /*                            other.  */
1422   0,                                    /* cost of multiply per each bit
1423                                             set.  */
1424    /* Depending on parameters, idiv can get faster on ryzen.  This is upper
1425       bound.  */
1426   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI.  */
1427    COSTS_N_INSNS (22),                  /*                          HI.  */
1428    COSTS_N_INSNS (30),                  /*                          SI.  */
1429    COSTS_N_INSNS (45),                  /*                          DI.  */
1430    COSTS_N_INSNS (45)},                 /*                          other.  */
1431   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1432   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1433   8,                                    /* "large" insn.  */
1434   9,                                    /* MOVE_RATIO.  */
1435
1436   /* All move costs are relative to integer->integer move times 2 and thus
1437      they are latency*2. */
1438
1439   /* reg-reg moves are done by renaming and thus they are even cheaper than
1440      1 cycle.  Because reg-reg move cost is 2 and the following tables correspond
1441      to doubles of latencies, we do not model this correctly.  It does not
1442      seem to make practical difference to bump prices up even more.  */
1443   6,                                    /* cost for loading QImode using
1444                                            movzbl.  */
1445   {6, 6, 6},                            /* cost of loading integer registers
1446                                            in QImode, HImode and SImode.
1447                                            Relative to reg-reg move (2).  */
1448   {8, 8, 8},                            /* cost of storing integer
1449                                            registers.  */
1450   2,                                    /* cost of reg,reg fld/fst.  */
1451   {6, 6, 16},                           /* cost of loading fp registers
1452                                            in SFmode, DFmode and XFmode.  */
1453   {8, 8, 16},                           /* cost of storing fp registers
1454                                            in SFmode, DFmode and XFmode.  */
1455   2,                                    /* cost of moving MMX register.  */
1456   {6, 6},                               /* cost of loading MMX registers
1457                                            in SImode and DImode.  */
1458   {8, 8},                               /* cost of storing MMX registers
1459                                            in SImode and DImode.  */
1460   2, 3, 6,                              /* cost of moving XMM,YMM,ZMM register.  */
1461   {6, 6, 6, 10, 20},                    /* cost of loading SSE registers
1462                                            in 32,64,128,256 and 512-bit.  */
1463   {6, 6, 6, 10, 20},                    /* cost of unaligned loads.  */
1464   {8, 8, 8, 8, 16},                     /* cost of storing SSE registers
1465                                            in 32,64,128,256 and 512-bit.  */
1466   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
1467   6, 6,                                 /* SSE->integer and integer->SSE moves.  */
1468   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1469      throughput 12.  Approx 9 uops do not depend on vector size and every load
1470      is 7 uops.
     NOTE(review): VGATHERDPD is named twice with different figures; the
     second mention presumably means another gather form (e.g. VGATHERDPS)
     -- confirm against the instruction tables.  */
1471   18, 8,                                /* Gather load static, per_elt.  */
1472   18, 10,                               /* Gather store static, per_elt.  */
1473   32,                                   /* size of l1 cache.  */
1474   512,                                  /* size of l2 cache.  */
1475   64,                                   /* size of prefetch block.  */
1476   /* New AMD processors never drop prefetches; if they cannot be performed
1477      immediately, they are queued.  We set number of simultaneous prefetches
1478      to a large constant to reflect this (it probably is not a good idea not
1479      to limit number of prefetches at all, as their execution also takes some
1480      time).  */
1481   100,                                  /* number of parallel prefetches.  */
1482   3,                                    /* Branch cost.  */
1483   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1484   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1485   /* Latency of fdiv is 8-15.  */
1486   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1487   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1488   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1489   /* Latency of fsqrt is 4-10.  */
1490   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1491
1492   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1493   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1494   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1495   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1496   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1497   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1498   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1499   /* 9-13  */
1500   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1501   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1502   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1503   /* Zen can execute 4 integer operations per cycle.  FP operations take 3 cycles
1504      and it can execute 2 integer additions and 2 multiplications thus
1505      reassociation may make sense up to width of 6.  SPEC2k6 benchmarks suggest
1506      that 4 works better than 6 probably due to register pressure.
1507
1508      Integer vector operations are taken by FP unit and execute 3 vector
1509      plus/minus operations per cycle but only one multiply.  This is adjusted
1510      in ix86_reassociation_width.  */
1511   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1512   znver1_memcpy,
1513   znver1_memset,
1514   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1515   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1516 };
1517
1518 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1519    very small blocks it is better to use loop.  For large blocks, libcall can
1520    do non-temporal accesses and beat inline considerably.  */
/* memcpy strategy table referenced from btver1_cost.  Each inner
   {bound, algorithm, flag} triple pairs a size bound with the algorithm to
   use; the entries are identical to those of znver1_memcpy.  Presumably -1
   means "no upper bound" and the two elements are for 32-bit and 64-bit
   code -- TODO confirm against the stringop_algs definition.  */
1521 static stringop_algs btver1_memcpy[2] = {
1522   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1523              {-1, rep_prefix_4_byte, false}}},
1524   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1525              {-1, libcall, false}}}};
/* memset strategy table referenced from btver1_cost.  Both elements end in
   libcall for the unbounded (-1) case; smaller bounds select loops or a REP
   prefix.  Presumably element 0 is for 32-bit and element 1 for 64-bit code
   -- TODO confirm against the stringop_algs definition.  */
1526 static stringop_algs btver1_memset[2] = {
1527   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1528              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1529   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1530              {-1, libcall, false}}}};
/* Operation cost table for btver1.  Entries wrapped in COSTS_N_INSNS are
   instruction costs; the bare integers are move costs (latency*2, see
   below), cache and prefetch parameters, branch cost and reassociation
   widths, as labelled on each line.  String operations use btver1_memcpy
   and btver1_memset.  */
1531 const struct processor_costs btver1_cost = {
1532   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1533   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1534   COSTS_N_INSNS (1),                    /* variable shift costs */
1535   COSTS_N_INSNS (1),                    /* constant shift costs */
1536   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1537    COSTS_N_INSNS (4),                   /*                               HI */
1538    COSTS_N_INSNS (3),                   /*                               SI */
1539    COSTS_N_INSNS (4),                   /*                               DI */
1540    COSTS_N_INSNS (5)},                  /*                            other */
1541   0,                                    /* cost of multiply per each bit set */
1542   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1543    COSTS_N_INSNS (35),                  /*                          HI */
1544    COSTS_N_INSNS (51),                  /*                          SI */
1545    COSTS_N_INSNS (83),                  /*                          DI */
1546    COSTS_N_INSNS (83)},                 /*                          other */
1547   COSTS_N_INSNS (1),                    /* cost of movsx */
1548   COSTS_N_INSNS (1),                    /* cost of movzx */
1549   8,                                    /* "large" insn */
1550   9,                                    /* MOVE_RATIO */
1551
1552   /* All move costs are relative to integer->integer move times 2 and thus
1553      they are latency*2. */
1554   8,                                 /* cost for loading QImode using movzbl */
1555   {6, 8, 6},                            /* cost of loading integer registers
1556                                            in QImode, HImode and SImode.
1557                                            Relative to reg-reg move (2).  */
1558   {6, 8, 6},                            /* cost of storing integer registers */
1559   4,                                    /* cost of reg,reg fld/fst */
1560   {12, 12, 28},                         /* cost of loading fp registers
1561                                            in SFmode, DFmode and XFmode */
1562   {12, 12, 38},                         /* cost of storing fp registers
1563                                            in SFmode, DFmode and XFmode */
1564   4,                                    /* cost of moving MMX register */
1565   {10, 10},                             /* cost of loading MMX registers
1566                                            in SImode and DImode */
1567   {12, 12},                             /* cost of storing MMX registers
1568                                            in SImode and DImode */
1569   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1570   {10, 10, 12, 24, 48},                 /* cost of loading SSE registers
1571                                            in 32,64,128,256 and 512-bit */
1572   {10, 10, 12, 24, 48},                 /* cost of unaligned loads.  */
1573   {10, 10, 12, 24, 48},                 /* cost of storing SSE registers
1574                                            in 32,64,128,256 and 512-bit */
1575   {10, 10, 12, 24, 48},                 /* cost of unaligned stores.  */
1576   14, 14,                               /* SSE->integer and integer->SSE moves */
1577   10, 10,                               /* Gather load static, per_elt.  */
1578   10, 10,                               /* Gather store static, per_elt.  */
1579   32,                                   /* size of l1 cache.  */
1580   512,                                  /* size of l2 cache.  */
1581   64,                                   /* size of prefetch block */
1582   100,                                  /* number of parallel prefetches */
1583   2,                                    /* Branch cost */
1584   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1585   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1586   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1587   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1588   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1589   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1590
1591   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1592   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1593   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
1594   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1595   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1596   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1597   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
1598   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
1599   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
1600   COSTS_N_INSNS (48),                   /* cost of SQRTSD instruction.  */
1601   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1602   btver1_memcpy,
1603   btver1_memset,
1604   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1605   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1606 };
1607
/* memcpy strategy table referenced from btver2_cost.  Each inner
   {bound, algorithm, flag} triple pairs a size bound with the algorithm to
   use; the entries are identical to those of btver1_memcpy.  Presumably -1
   means "no upper bound" and the two elements are for 32-bit and 64-bit
   code -- TODO confirm against the stringop_algs definition.  */
1608 static stringop_algs btver2_memcpy[2] = {
1609   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1610              {-1, rep_prefix_4_byte, false}}},
1611   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1612              {-1, libcall, false}}}};
/* memset strategy table referenced from btver2_cost.  Both elements end in
   libcall for the unbounded (-1) case; smaller bounds select loops or a REP
   prefix.  Presumably element 0 is for 32-bit and element 1 for 64-bit code
   -- TODO confirm against the stringop_algs definition.  */
1613 static stringop_algs btver2_memset[2] = {
1614   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1615              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1616   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1617              {-1, libcall, false}}}};
/* Operation cost table for btver2.  Entries wrapped in COSTS_N_INSNS are
   instruction costs; the bare integers are move costs (latency*2, see
   below), cache and prefetch parameters, branch cost and reassociation
   widths, as labelled on each line.  String operations use btver2_memcpy
   and btver2_memset.  */
1618 const struct processor_costs btver2_cost = {
1619   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1620   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1621   COSTS_N_INSNS (1),                    /* variable shift costs */
1622   COSTS_N_INSNS (1),                    /* constant shift costs */
1623   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1624    COSTS_N_INSNS (4),                   /*                               HI */
1625    COSTS_N_INSNS (3),                   /*                               SI */
1626    COSTS_N_INSNS (4),                   /*                               DI */
1627    COSTS_N_INSNS (5)},                  /*                            other */
1628   0,                                    /* cost of multiply per each bit set */
1629   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1630    COSTS_N_INSNS (35),                  /*                          HI */
1631    COSTS_N_INSNS (51),                  /*                          SI */
1632    COSTS_N_INSNS (83),                  /*                          DI */
1633    COSTS_N_INSNS (83)},                 /*                          other */
1634   COSTS_N_INSNS (1),                    /* cost of movsx */
1635   COSTS_N_INSNS (1),                    /* cost of movzx */
1636   8,                                    /* "large" insn */
1637   9,                                    /* MOVE_RATIO */
1638
1639   /* All move costs are relative to integer->integer move times 2 and thus
1640      they are latency*2. */
1641   8,                                 /* cost for loading QImode using movzbl */
1642   {8, 8, 6},                            /* cost of loading integer registers
1643                                            in QImode, HImode and SImode.
1644                                            Relative to reg-reg move (2).  */
1645   {8, 8, 6},                            /* cost of storing integer registers */
1646   4,                                    /* cost of reg,reg fld/fst */
1647   {12, 12, 28},                         /* cost of loading fp registers
1648                                            in SFmode, DFmode and XFmode */
1649   {12, 12, 38},                         /* cost of storing fp registers
1650                                            in SFmode, DFmode and XFmode */
1651   4,                                    /* cost of moving MMX register */
1652   {10, 10},                             /* cost of loading MMX registers
1653                                            in SImode and DImode */
1654   {12, 12},                             /* cost of storing MMX registers
1655                                            in SImode and DImode */
1656   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1657   {10, 10, 12, 24, 48},                 /* cost of loading SSE registers
1658                                            in 32,64,128,256 and 512-bit */
1659   {10, 10, 12, 24, 48},                 /* cost of unaligned loads.  */
1660   {10, 10, 12, 24, 48},                 /* cost of storing SSE registers
1661                                            in 32,64,128,256 and 512-bit */
1662   {10, 10, 12, 24, 48},                 /* cost of unaligned stores.  */
1663   14, 14,                               /* SSE->integer and integer->SSE moves */
1664   10, 10,                               /* Gather load static, per_elt.  */
1665   10, 10,                               /* Gather store static, per_elt.  */
1666   32,                                   /* size of l1 cache.  */
1667   2048,                                 /* size of l2 cache.  */
1668   64,                                   /* size of prefetch block */
1669   100,                                  /* number of parallel prefetches */
1670   2,                                    /* Branch cost */
1671   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1672   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1673   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1674   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1675   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1676   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1677
1678   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1679   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1680   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
1681   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1682   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1683   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1684   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
1685   COSTS_N_INSNS (19),                   /* cost of DIVSD instruction.  */
1686   COSTS_N_INSNS (16),                   /* cost of SQRTSS instruction.  */
1687   COSTS_N_INSNS (21),                   /* cost of SQRTSD instruction.  */
1688   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1689   btver2_memcpy,
1690   btver2_memset,
1691   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1692   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1693 };
1694
/* memcpy strategy table referenced from pentium4_cost.  The first element
   uses a byte loop up to the first bound and a 4-byte REP prefix beyond it;
   the second element is DUMMY_STRINGOP_ALGS, i.e. always libcall.
   Presumably the elements are for 32-bit and 64-bit code respectively
   -- TODO confirm against the stringop_algs definition.  */
1695 static stringop_algs pentium4_memcpy[2] = {
1696   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1697   DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced from pentium4_cost.  The first element
   steps from a byte loop through a loop and a 4-byte REP prefix to libcall
   as the bound grows; the second element is DUMMY_STRINGOP_ALGS, i.e. always
   libcall.  Presumably the elements are for 32-bit and 64-bit code
   respectively -- TODO confirm against the stringop_algs definition.  */
1698 static stringop_algs pentium4_memset[2] = {
1699   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1700              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1701   DUMMY_STRINGOP_ALGS};
1702
/* Operation cost table for pentium4.  Entries wrapped in COSTS_N_INSNS are
   instruction costs; the bare integers are move costs (latency*2, see
   below), cache and prefetch parameters, branch cost and reassociation
   widths, as labelled on each line.  String operations use pentium4_memcpy
   and pentium4_memset.  */
1703 static const
1704 struct processor_costs pentium4_cost = {
1705   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1706   COSTS_N_INSNS (3),                    /* cost of a lea instruction */
1707   COSTS_N_INSNS (4),                    /* variable shift costs */
1708   COSTS_N_INSNS (4),                    /* constant shift costs */
1709   {COSTS_N_INSNS (15),                  /* cost of starting multiply for QI */
1710    COSTS_N_INSNS (15),                  /*                               HI */
1711    COSTS_N_INSNS (15),                  /*                               SI */
1712    COSTS_N_INSNS (15),                  /*                               DI */
1713    COSTS_N_INSNS (15)},                 /*                            other */
1714   0,                                    /* cost of multiply per each bit set */
1715   {COSTS_N_INSNS (56),                  /* cost of a divide/mod for QI */
1716    COSTS_N_INSNS (56),                  /*                          HI */
1717    COSTS_N_INSNS (56),                  /*                          SI */
1718    COSTS_N_INSNS (56),                  /*                          DI */
1719    COSTS_N_INSNS (56)},                 /*                          other */
1720   COSTS_N_INSNS (1),                    /* cost of movsx */
1721   COSTS_N_INSNS (1),                    /* cost of movzx */
1722   16,                                   /* "large" insn */
1723   6,                                    /* MOVE_RATIO */
1724
1725   /* All move costs are relative to integer->integer move times 2 and thus
1726      they are latency*2. */
1727   5,                                 /* cost for loading QImode using movzbl */
1728   {4, 5, 4},                            /* cost of loading integer registers
1729                                            in QImode, HImode and SImode.
1730                                            Relative to reg-reg move (2).  */
1731   {2, 3, 2},                            /* cost of storing integer registers */
1732   12,                                   /* cost of reg,reg fld/fst */
1733   {14, 14, 14},                         /* cost of loading fp registers
1734                                            in SFmode, DFmode and XFmode */
1735   {14, 14, 14},                         /* cost of storing fp registers
1736                                            in SFmode, DFmode and XFmode */
1737   12,                                   /* cost of moving MMX register */
1738   {16, 16},                             /* cost of loading MMX registers
1739                                            in SImode and DImode */
1740   {16, 16},                             /* cost of storing MMX registers
1741                                            in SImode and DImode */
1742   12, 24, 48,                           /* cost of moving XMM,YMM,ZMM register */
1743   {16, 16, 16, 32, 64},                 /* cost of loading SSE registers
1744                                            in 32,64,128,256 and 512-bit */
1745   {32, 32, 32, 64, 128},                /* cost of unaligned loads.  */
1746   {16, 16, 16, 32, 64},                 /* cost of storing SSE registers
1747                                            in 32,64,128,256 and 512-bit */
1748   {32, 32, 32, 64, 128},                /* cost of unaligned stores.  */
1749   20, 12,                               /* SSE->integer and integer->SSE moves */
1750   16, 16,                               /* Gather load static, per_elt.  */
1751   16, 16,                               /* Gather store static, per_elt.  */
1752   8,                                    /* size of l1 cache.  */
1753   256,                                  /* size of l2 cache.  */
1754   64,                                   /* size of prefetch block */
1755   6,                                    /* number of parallel prefetches */
1756   2,                                    /* Branch cost */
1757   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1758   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
1759   COSTS_N_INSNS (43),                   /* cost of FDIV instruction.  */
1760   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1761   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1762   COSTS_N_INSNS (43),                   /* cost of FSQRT instruction.  */
1763
1764   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1765   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1766   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1767   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1768   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1769   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1770   COSTS_N_INSNS (23),                   /* cost of DIVSS instruction.  */
1771   COSTS_N_INSNS (38),                   /* cost of DIVSD instruction.  */
1772   COSTS_N_INSNS (23),                   /* cost of SQRTSS instruction.  */
1773   COSTS_N_INSNS (38),                   /* cost of SQRTSD instruction.  */
1774   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1775   pentium4_memcpy,
1776   pentium4_memset,
1777   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1778   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1779 };
1780
/* memcpy strategy table referenced from nocona_cost.  Each inner
   {bound, algorithm, flag} triple pairs a size bound with the algorithm to
   use.  Note that the second element goes back to unrolled_loop for bounds
   between 20000 and 100000 before falling through to libcall.  Presumably
   -1 means "no upper bound" and the two elements are for 32-bit and 64-bit
   code -- TODO confirm against the stringop_algs definition.  */
1781 static stringop_algs nocona_memcpy[2] = {
1782   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1783   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1784              {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1785
/* memset strategy table referenced from nocona_cost.  Both elements end in
   libcall for the unbounded (-1) case; smaller bounds select loops or a REP
   prefix.  Presumably element 0 is for 32-bit and element 1 for 64-bit code
   -- TODO confirm against the stringop_algs definition.  */
1786 static stringop_algs nocona_memset[2] = {
1787   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1788              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1789   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1790              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1791
/* Operation cost table for nocona.  Entries wrapped in COSTS_N_INSNS are
   instruction costs; the bare integers are move costs (latency*2, see
   below), cache and prefetch parameters, branch cost and reassociation
   widths, as labelled on each line.  String operations use nocona_memcpy
   and nocona_memset.  */
1792 static const
1793 struct processor_costs nocona_cost = {
1794   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1795   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1796   COSTS_N_INSNS (1),                    /* variable shift costs */
1797   COSTS_N_INSNS (1),                    /* constant shift costs */
1798   {COSTS_N_INSNS (10),                  /* cost of starting multiply for QI */
1799    COSTS_N_INSNS (10),                  /*                               HI */
1800    COSTS_N_INSNS (10),                  /*                               SI */
1801    COSTS_N_INSNS (10),                  /*                               DI */
1802    COSTS_N_INSNS (10)},                 /*                            other */
1803   0,                                    /* cost of multiply per each bit set */
1804   {COSTS_N_INSNS (66),                  /* cost of a divide/mod for QI */
1805    COSTS_N_INSNS (66),                  /*                          HI */
1806    COSTS_N_INSNS (66),                  /*                          SI */
1807    COSTS_N_INSNS (66),                  /*                          DI */
1808    COSTS_N_INSNS (66)},                 /*                          other */
1809   COSTS_N_INSNS (1),                    /* cost of movsx */
1810   COSTS_N_INSNS (1),                    /* cost of movzx */
1811   16,                                   /* "large" insn */
1812   17,                                   /* MOVE_RATIO */
1813
1814   /* All move costs are relative to integer->integer move times 2 and thus
1815      they are latency*2. */
1816   4,                                 /* cost for loading QImode using movzbl */
1817   {4, 4, 4},                            /* cost of loading integer registers
1818                                            in QImode, HImode and SImode.
1819                                            Relative to reg-reg move (2).  */
1820   {4, 4, 4},                            /* cost of storing integer registers */
1821   12,                                   /* cost of reg,reg fld/fst */
1822   {14, 14, 14},                         /* cost of loading fp registers
1823                                            in SFmode, DFmode and XFmode */
1824   {14, 14, 14},                         /* cost of storing fp registers
1825                                            in SFmode, DFmode and XFmode */
1826   14,                                   /* cost of moving MMX register */
1827   {12, 12},                             /* cost of loading MMX registers
1828                                            in SImode and DImode */
1829   {12, 12},                             /* cost of storing MMX registers
1830                                            in SImode and DImode */
1831   6, 12, 24,                            /* cost of moving XMM,YMM,ZMM register */
1832   {12, 12, 12, 24, 48},                 /* cost of loading SSE registers
1833                                            in 32,64,128,256 and 512-bit */
1834   {24, 24, 24, 48, 96},                 /* cost of unaligned loads.  */
1835   {12, 12, 12, 24, 48},                 /* cost of storing SSE registers
1836                                            in 32,64,128,256 and 512-bit */
1837   {24, 24, 24, 48, 96},                 /* cost of unaligned stores.  */
1838   20, 12,                               /* SSE->integer and integer->SSE moves */
1839   12, 12,                               /* Gather load static, per_elt.  */
1840   12, 12,                               /* Gather store static, per_elt.  */
1841   8,                                    /* size of l1 cache.  */
1842   1024,                                 /* size of l2 cache.  */
1843   64,                                   /* size of prefetch block */
1844   8,                                    /* number of parallel prefetches */
1845   1,                                    /* Branch cost */
1846   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1847   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
1848   COSTS_N_INSNS (40),                   /* cost of FDIV instruction.  */
1849   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
1850   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
1851   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
1852
1853   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1854   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1855   COSTS_N_INSNS (7),                    /* cost of MULSS instruction.  */
1856   COSTS_N_INSNS (7),                    /* cost of MULSD instruction.  */
1857   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
1858   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
1859   COSTS_N_INSNS (32),                   /* cost of DIVSS instruction.  */
1860   COSTS_N_INSNS (40),                   /* cost of DIVSD instruction.  */
1861   COSTS_N_INSNS (32),                   /* cost of SQRTSS instruction.  */
1862   COSTS_N_INSNS (41),                   /* cost of SQRTSD instruction.  */
1863   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1864   nocona_memcpy,
1865   nocona_memset,
1866   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1867   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1868 };
1869
/* memcpy strategy table referenced by atom_cost.
   NOTE(review): stringop_algs is declared outside this file.  Presumably
   entry [0] applies to 32-bit and entry [1] to 64-bit code, the leading
   algorithm of each entry is used when the block size is unknown, and each
   {max, alg, noalign} triple selects ALG for blocks of up to MAX bytes with
   -1 meaning "no upper bound" -- confirm against the declaration.  */
1870 static stringop_algs atom_memcpy[2] = {
1871   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1872   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1873              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table referenced by atom_cost.
   NOTE(review): stringop_algs is declared outside this file.  Presumably
   entry [0] applies to 32-bit and entry [1] to 64-bit code, and each
   {max, alg, noalign} triple selects ALG for blocks of up to MAX bytes with
   -1 meaning "no upper bound" -- confirm against the declaration.  */
1874 static stringop_algs atom_memset[2] = {
1875   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1876              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1877   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1878              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table atom_cost (presumably the Intel Atom tuning -- confirm against
   the table that references it).  The initializers are positional, so their
   order must match the field order of struct processor_costs.  Instruction
   costs are written with COSTS_N_INSNS; the move costs further down are
   plain integers relative to a reg-reg integer move, which counts as 2.  */
1879 static const
1880 struct processor_costs atom_cost = {
1881   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1882   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
1883   COSTS_N_INSNS (1),                    /* variable shift costs */
1884   COSTS_N_INSNS (1),                    /* constant shift costs */
1885   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1886    COSTS_N_INSNS (4),                   /*                               HI */
1887    COSTS_N_INSNS (3),                   /*                               SI */
1888    COSTS_N_INSNS (4),                   /*                               DI */
1889    COSTS_N_INSNS (2)},                  /*                            other */
1890   0,                                    /* cost of multiply per each bit set */
1891   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
1892    COSTS_N_INSNS (26),                  /*                          HI */
1893    COSTS_N_INSNS (42),                  /*                          SI */
1894    COSTS_N_INSNS (74),                  /*                          DI */
1895    COSTS_N_INSNS (74)},                 /*                          other */
1896   COSTS_N_INSNS (1),                    /* cost of movsx */
1897   COSTS_N_INSNS (1),                    /* cost of movzx */
1898   8,                                    /* "large" insn */
1899   17,                                   /* MOVE_RATIO */
1900
1901   /* All move costs are relative to integer->integer move times 2 and thus
1902      they are latency*2. */
1903   6,                                    /* cost for loading QImode using movzbl */
1904   {6, 6, 6},                            /* cost of loading integer registers
1905                                            in QImode, HImode and SImode.
1906                                            Relative to reg-reg move (2).  */
1907   {6, 6, 6},                            /* cost of storing integer registers */
1908   4,                                    /* cost of reg,reg fld/fst */
1909   {6, 6, 18},                           /* cost of loading fp registers
1910                                            in SFmode, DFmode and XFmode */
1911   {14, 14, 24},                         /* cost of storing fp registers
1912                                            in SFmode, DFmode and XFmode */
1913   2,                                    /* cost of moving MMX register */
1914   {8, 8},                               /* cost of loading MMX registers
1915                                            in SImode and DImode */
1916   {10, 10},                             /* cost of storing MMX registers
1917                                            in SImode and DImode */
1918   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1919   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
1920                                            in 32,64,128,256 and 512-bit */
1921   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
1922   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
1923                                            in 32,64,128,256 and 512-bit */
1924   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
1925   8, 6,                                 /* SSE->integer and integer->SSE moves */
1926   8, 8,                                 /* Gather load static, per_elt.  */
1927   8, 8,                                 /* Gather store static, per_elt.  */
1928   32,                                   /* size of l1 cache.  */
1929   256,                                  /* size of l2 cache.  */
1930   64,                                   /* size of prefetch block */
1931   6,                                    /* number of parallel prefetches */
1932   3,                                    /* Branch cost */
1933   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
1934   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
1935   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
1936   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
1937   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
1938   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
1939
1940   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1941   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1942   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1943   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
1944   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1945   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1946   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
1947   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
1948   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
1949   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
1950   2, 2, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
1951   atom_memcpy,
1952   atom_memset,
1953   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1954   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1955 };
1956
/* memcpy strategy table referenced by slm_cost.
   NOTE(review): stringop_algs is declared outside this file.  Presumably
   entry [0] applies to 32-bit and entry [1] to 64-bit code, and each
   {max, alg, noalign} triple selects ALG for blocks of up to MAX bytes with
   -1 meaning "no upper bound" -- confirm against the declaration.  */
1957 static stringop_algs slm_memcpy[2] = {
1958   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1959   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1960              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table referenced by slm_cost.
   NOTE(review): stringop_algs is declared outside this file.  Presumably
   entry [0] applies to 32-bit and entry [1] to 64-bit code, and each
   {max, alg, noalign} triple selects ALG for blocks of up to MAX bytes with
   -1 meaning "no upper bound" -- confirm against the declaration.  */
1961 static stringop_algs slm_memset[2] = {
1962   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1963              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1964   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1965              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table slm_cost (presumably the Silvermont tuning -- confirm against
   the table that references it).  The initializers are positional, so their
   order must match the field order of struct processor_costs.  Instruction
   costs are written with COSTS_N_INSNS; the move costs further down are
   plain integers relative to a reg-reg integer move, which counts as 2.  */
1966 static const
1967 struct processor_costs slm_cost = {
1968   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1969   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
1970   COSTS_N_INSNS (1),                    /* variable shift costs */
1971   COSTS_N_INSNS (1),                    /* constant shift costs */
1972   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1973    COSTS_N_INSNS (3),                   /*                               HI */
1974    COSTS_N_INSNS (3),                   /*                               SI */
1975    COSTS_N_INSNS (4),                   /*                               DI */
1976    COSTS_N_INSNS (2)},                  /*                            other */
1977   0,                                    /* cost of multiply per each bit set */
1978   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
1979    COSTS_N_INSNS (26),                  /*                          HI */
1980    COSTS_N_INSNS (42),                  /*                          SI */
1981    COSTS_N_INSNS (74),                  /*                          DI */
1982    COSTS_N_INSNS (74)},                 /*                          other */
1983   COSTS_N_INSNS (1),                    /* cost of movsx */
1984   COSTS_N_INSNS (1),                    /* cost of movzx */
1985   8,                                    /* "large" insn */
1986   17,                                   /* MOVE_RATIO */
1987
1988   /* All move costs are relative to integer->integer move times 2 and thus
1989      they are latency*2. */
1990   8,                                    /* cost for loading QImode using movzbl */
1991   {8, 8, 8},                            /* cost of loading integer registers
1992                                            in QImode, HImode and SImode.
1993                                            Relative to reg-reg move (2).  */
1994   {6, 6, 6},                            /* cost of storing integer registers */
1995   2,                                    /* cost of reg,reg fld/fst */
1996   {8, 8, 18},                           /* cost of loading fp registers
1997                                            in SFmode, DFmode and XFmode */
1998   {6, 6, 18},                           /* cost of storing fp registers
1999                                            in SFmode, DFmode and XFmode */
2000   2,                                    /* cost of moving MMX register */
2001   {8, 8},                               /* cost of loading MMX registers
2002                                            in SImode and DImode */
2003   {6, 6},                               /* cost of storing MMX registers
2004                                            in SImode and DImode */
2005   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2006   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
2007                                            in 32,64,128,256 and 512-bit */
2008   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
2009   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
2010                                            in 32,64,128,256 and 512-bit */
2011   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
2012   8, 6,                                 /* SSE->integer and integer->SSE moves */
2013   8, 8,                                 /* Gather load static, per_elt.  */
2014   8, 8,                                 /* Gather store static, per_elt.  */
2015   32,                                   /* size of l1 cache.  */
2016   256,                                  /* size of l2 cache.  */
2017   64,                                   /* size of prefetch block */
2018   6,                                    /* number of parallel prefetches */
2019   3,                                    /* Branch cost */
2020   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2021   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2022   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2023   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2024   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2025   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2026
2027   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2028   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2029   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2030   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2031   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2032   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2033   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
2034   COSTS_N_INSNS (69),                   /* cost of DIVSD instruction.  */
2035   COSTS_N_INSNS (20),                   /* cost of SQRTSS instruction.  */
2036   COSTS_N_INSNS (35),                   /* cost of SQRTSD instruction.  */
2037   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2038   slm_memcpy,
2039   slm_memset,
2040   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2041   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2042 };
2043
/* memcpy strategy table referenced by intel_cost.
   NOTE(review): stringop_algs is declared outside this file.  Presumably
   entry [0] applies to 32-bit and entry [1] to 64-bit code, and each
   {max, alg, noalign} triple selects ALG for blocks of up to MAX bytes with
   -1 meaning "no upper bound" -- confirm against the declaration.  */
2044 static stringop_algs intel_memcpy[2] = {
2045   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2046   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2047              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table referenced by intel_cost.
   NOTE(review): stringop_algs is declared outside this file.  Presumably
   entry [0] applies to 32-bit and entry [1] to 64-bit code, and each
   {max, alg, noalign} triple selects ALG for blocks of up to MAX bytes with
   -1 meaning "no upper bound" -- confirm against the declaration.  */
2048 static stringop_algs intel_memset[2] = {
2049   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2050              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2051   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2052              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table intel_cost (presumably a tuning shared by several Intel
   processors -- confirm against the table that references it).  The
   initializers are positional, so their order must match the field order of
   struct processor_costs.  Instruction costs are written with COSTS_N_INSNS;
   the move costs further down are plain integers relative to a reg-reg
   integer move, which counts as 2.  */
2053 static const
2054 struct processor_costs intel_cost = {
2055   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2056   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2057   COSTS_N_INSNS (1),                    /* variable shift costs */
2058   COSTS_N_INSNS (1),                    /* constant shift costs */
2059   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2060    COSTS_N_INSNS (3),                   /*                               HI */
2061    COSTS_N_INSNS (3),                   /*                               SI */
2062    COSTS_N_INSNS (4),                   /*                               DI */
2063    COSTS_N_INSNS (2)},                  /*                            other */
2064   0,                                    /* cost of multiply per each bit set */
2065   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2066    COSTS_N_INSNS (26),                  /*                          HI */
2067    COSTS_N_INSNS (42),                  /*                          SI */
2068    COSTS_N_INSNS (74),                  /*                          DI */
2069    COSTS_N_INSNS (74)},                 /*                          other */
2070   COSTS_N_INSNS (1),                    /* cost of movsx */
2071   COSTS_N_INSNS (1),                    /* cost of movzx */
2072   8,                                    /* "large" insn */
2073   17,                                   /* MOVE_RATIO */
2074
2075   /* All move costs are relative to integer->integer move times 2 and thus
2076      they are latency*2. */
2077   6,                                 /* cost for loading QImode using movzbl */
2078   {4, 4, 4},                            /* cost of loading integer registers
2079                                            in QImode, HImode and SImode.
2080                                            Relative to reg-reg move (2).  */
2081   {6, 6, 6},                            /* cost of storing integer registers */
2082   2,                                    /* cost of reg,reg fld/fst */
2083   {6, 6, 8},                            /* cost of loading fp registers
2084                                            in SFmode, DFmode and XFmode */
2085   {6, 6, 10},                           /* cost of storing fp registers
2086                                            in SFmode, DFmode and XFmode */
2087   2,                                    /* cost of moving MMX register */
2088   {6, 6},                               /* cost of loading MMX registers
2089                                            in SImode and DImode */
2090   {6, 6},                               /* cost of storing MMX registers
2091                                            in SImode and DImode */
2092   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM register */
2093   {6, 6, 6, 6, 6},                      /* cost of loading SSE registers
2094                                            in 32,64,128,256 and 512-bit */
2095   {10, 10, 10, 10, 10},                 /* cost of unaligned loads.  */
2096   {6, 6, 6, 6, 6},                      /* cost of storing SSE registers
2097                                            in 32,64,128,256 and 512-bit */
2098   {10, 10, 10, 10, 10},                 /* cost of unaligned stores.  */
2099   4, 4,                                 /* SSE->integer and integer->SSE moves */
2100   6, 6,                                 /* Gather load static, per_elt.  */
2101   6, 6,                                 /* Gather store static, per_elt.  */
2102   32,                                   /* size of l1 cache.  */
2103   256,                                  /* size of l2 cache.  */
2104   64,                                   /* size of prefetch block */
2105   6,                                    /* number of parallel prefetches */
2106   3,                                    /* Branch cost */
2107   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2108   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2109   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2110   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2111   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2112   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2113
2114   COSTS_N_INSNS (8),                    /* cost of cheap SSE instruction.  */
2115   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2116   COSTS_N_INSNS (8),                    /* cost of MULSS instruction.  */
2117   COSTS_N_INSNS (8),                    /* cost of MULSD instruction.  */
2118   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2119   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2120   COSTS_N_INSNS (20),                   /* cost of DIVSS instruction.  */
2121   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
2122   COSTS_N_INSNS (40),                   /* cost of SQRTSS instruction.  */
2123   COSTS_N_INSNS (40),                   /* cost of SQRTSD instruction.  */
2124   1, 4, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2125   intel_memcpy,
2126   intel_memset,
2127   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2128   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2129 };
2130
2131 /* Generic should produce code tuned for Core-i7 (and newer chips)
2132    and btver1 (and newer chips).  */
2133
/* memcpy strategy table referenced by generic_cost.
   NOTE(review): stringop_algs is declared outside this file.  Presumably
   entry [0] applies to 32-bit and entry [1] to 64-bit code, and each
   {max, alg, noalign} triple selects ALG for blocks of up to MAX bytes with
   -1 meaning "no upper bound" -- confirm against the declaration.  */
2134 static stringop_algs generic_memcpy[2] = {
2135   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2136              {-1, libcall, false}}},
2137   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2138              {-1, libcall, false}}}};
/* memset strategy table referenced by generic_cost.
   NOTE(review): stringop_algs is declared outside this file.  Presumably
   entry [0] applies to 32-bit and entry [1] to 64-bit code, and each
   {max, alg, noalign} triple selects ALG for blocks of up to MAX bytes with
   -1 meaning "no upper bound" -- confirm against the declaration.  */
2139 static stringop_algs generic_memset[2] = {
2140   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2141              {-1, libcall, false}}},
2142   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2143              {-1, libcall, false}}}};
/* Cost table generic_cost, meant to suit Core-i7 and btver1 and their
   successors.  The initializers are positional, so their order must match
   the field order of struct processor_costs.  Instruction costs are written
   with COSTS_N_INSNS; the move costs further down are plain integers
   relative to a reg-reg integer move, which counts as 2.  */
2144 static const
2145 struct processor_costs generic_cost = {
2146   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2147   /* Setting cost to 2 makes our current implementation of synth_mult result in
2148      use of unnecessary temporary registers causing regression on several
2149      SPECfp benchmarks.  */
2150   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2151   COSTS_N_INSNS (1),                    /* variable shift costs */
2152   COSTS_N_INSNS (1),                    /* constant shift costs */
2153   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2154    COSTS_N_INSNS (4),                   /*                               HI */
2155    COSTS_N_INSNS (3),                   /*                               SI */
2156    COSTS_N_INSNS (4),                   /*                               DI */
2157    COSTS_N_INSNS (2)},                  /*                            other */
2158   0,                                    /* cost of multiply per each bit set */
2159   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2160    COSTS_N_INSNS (26),                  /*                          HI */
2161    COSTS_N_INSNS (42),                  /*                          SI */
2162    COSTS_N_INSNS (74),                  /*                          DI */
2163    COSTS_N_INSNS (74)},                 /*                          other */
2164   COSTS_N_INSNS (1),                    /* cost of movsx */
2165   COSTS_N_INSNS (1),                    /* cost of movzx */
2166   8,                                    /* "large" insn */
2167   17,                                   /* MOVE_RATIO */
2168
2169   /* All move costs are relative to integer->integer move times 2 and thus
2170      they are latency*2. */
2171   4,                                 /* cost for loading QImode using movzbl */
2172   {4, 4, 4},                            /* cost of loading integer registers
2173                                            in QImode, HImode and SImode.
2174                                            Relative to reg-reg move (2).  */
2175   {6, 6, 6},                            /* cost of storing integer registers */
2176   4,                                    /* cost of reg,reg fld/fst */
2177   {6, 6, 12},                           /* cost of loading fp registers
2178                                            in SFmode, DFmode and XFmode */
2179   {6, 6, 12},                           /* cost of storing fp registers
2180                                            in SFmode, DFmode and XFmode */
2181   2,                                    /* cost of moving MMX register */
2182   {6, 6},                               /* cost of loading MMX registers
2183                                            in SImode and DImode */
2184   {6, 6},                               /* cost of storing MMX registers
2185                                            in SImode and DImode */
2186   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
2187   {6, 6, 6, 10, 15},                    /* cost of loading SSE registers
2188                                            in 32,64,128,256 and 512-bit */
2189   {10, 10, 10, 15, 20},                 /* cost of unaligned loads.  */
2190   {6, 6, 6, 10, 15},                    /* cost of storing SSE registers
2191                                            in 32,64,128,256 and 512-bit */
2192   {10, 10, 10, 15, 20},                 /* cost of unaligned stores.  */
2193   20, 20,                               /* SSE->integer and integer->SSE moves */
2194   6, 6,                                 /* Gather load static, per_elt.  */
2195   6, 6,                                 /* Gather store static, per_elt.  */
2196   32,                                   /* size of l1 cache.  */
2197   512,                                  /* size of l2 cache.  */
2198   64,                                   /* size of prefetch block */
2199   6,                                    /* number of parallel prefetches */
2200   /* Benchmarks show large regressions on K8 sixtrack benchmark when this
2201      value is increased to perhaps more appropriate value of 5.  */
2202   3,                                    /* Branch cost */
2203   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2204   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
2205   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2206   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2207   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2208   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2209
2210   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2211   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2212   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2213   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2214   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2215   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2216   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
2217   COSTS_N_INSNS (32),                   /* cost of DIVSD instruction.  */
2218   COSTS_N_INSNS (30),                   /* cost of SQRTSS instruction.  */
2219   COSTS_N_INSNS (58),                   /* cost of SQRTSD instruction.  */
2220   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2221   generic_memcpy,
2222   generic_memset,
2223   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2224   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2225 };
2226
2227 /* core_cost should produce code tuned for the Core family of CPUs.  */
/* memcpy strategy table referenced by core_cost.  Unlike the other tables
   in this file, most triples here set the third member to true.
   NOTE(review): stringop_algs is declared outside this file.  Presumably
   entry [0] applies to 32-bit and entry [1] to 64-bit code, each
   {max, alg, noalign} triple selects ALG for blocks of up to MAX bytes with
   -1 meaning "no upper bound", and a true third member skips the alignment
   prologue -- confirm against the declaration.  */
2228 static stringop_algs core_memcpy[2] = {
2229   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2230   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2231              {-1, libcall, false}}}};
/* memset strategy table referenced by core_cost.  Most triples here set the
   third member to true.
   NOTE(review): stringop_algs is declared outside this file.  Presumably
   entry [0] applies to 32-bit and entry [1] to 64-bit code, each
   {max, alg, noalign} triple selects ALG for blocks of up to MAX bytes with
   -1 meaning "no upper bound", and a true third member skips the alignment
   prologue -- confirm against the declaration.  */
2232 static stringop_algs core_memset[2] = {
2233   {libcall, {{6, loop_1_byte, true},
2234              {24, loop, true},
2235              {8192, rep_prefix_4_byte, true},
2236              {-1, libcall, false}}},
2237   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2238              {-1, libcall, false}}}};
2239
/* Cost table core_cost, for code tuned for the Core family of CPUs.  The
   initializers are positional, so their order must match the field order of
   struct processor_costs.  Instruction costs are written with COSTS_N_INSNS;
   the move costs further down are plain integers relative to a reg-reg
   integer move, which counts as 2.  Where a range such as "8-11" precedes an
   entry, the value used is the upper end of that range.  */
2240 static const
2241 struct processor_costs core_cost = {
2242   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2243   /* On all chips taken into consideration lea is 2 cycles and more.  With
2244      this cost however our current implementation of synth_mult results in
2245      use of unnecessary temporary registers causing regression on several
2246      SPECfp benchmarks.  */
2247   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2248   COSTS_N_INSNS (1),                    /* variable shift costs */
2249   COSTS_N_INSNS (1),                    /* constant shift costs */
2250   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2251    COSTS_N_INSNS (4),                   /*                               HI */
2252    COSTS_N_INSNS (3),                   /*                               SI */
2253    COSTS_N_INSNS (4),                   /*                               DI */
2254    COSTS_N_INSNS (4)},                  /*                            other */
2255   0,                                    /* cost of multiply per each bit set */
2256   {COSTS_N_INSNS (8),                   /* cost of a divide/mod for QI */
2257    COSTS_N_INSNS (8),                   /*                          HI */
2258    /* 8-11 */
2259    COSTS_N_INSNS (11),                  /*                          SI */
2260    /* 24-81 */
2261    COSTS_N_INSNS (81),                  /*                          DI */
2262    COSTS_N_INSNS (81)},                 /*                          other */
2263   COSTS_N_INSNS (1),                    /* cost of movsx */
2264   COSTS_N_INSNS (1),                    /* cost of movzx */
2265   8,                                    /* "large" insn */
2266   17,                                   /* MOVE_RATIO */
2267
2268   /* All move costs are relative to integer->integer move times 2 and thus
2269      they are latency*2. */
2270   6,                                 /* cost for loading QImode using movzbl */
2271   {4, 4, 4},                            /* cost of loading integer registers
2272                                            in QImode, HImode and SImode.
2273                                            Relative to reg-reg move (2).  */
2274   {6, 6, 6},                            /* cost of storing integer registers */
2275   2,                                    /* cost of reg,reg fld/fst */
2276   {6, 6, 8},                            /* cost of loading fp registers
2277                                            in SFmode, DFmode and XFmode */
2278   {6, 6, 10},                           /* cost of storing fp registers
2279                                            in SFmode, DFmode and XFmode */
2280   2,                                    /* cost of moving MMX register */
2281   {6, 6},                               /* cost of loading MMX registers
2282                                            in SImode and DImode */
2283   {6, 6},                               /* cost of storing MMX registers
2284                                            in SImode and DImode */
2285   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2286   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
2287                                            in 32,64,128,256 and 512-bit */
2288   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
2289   {6, 6, 6, 6, 12},                     /* cost of storing SSE registers
2290                                            in 32,64,128,256 and 512-bit */
2291   {6, 6, 6, 6, 12},                     /* cost of unaligned stores.  */
2292   2, 2,                                 /* SSE->integer and integer->SSE moves */
2293   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
2294      rec. throughput 6.
2295      So 5 uops statically and one uop per load.
        NOTE(review): VGATHERDPD is named twice above; the second figure
        presumably describes a different gather variant -- confirm.  */
2296   10, 6,                                /* Gather load static, per_elt.  */
2297   10, 6,                                /* Gather store static, per_elt.  */
2298   64,                                   /* size of l1 cache.  */
2299   512,                                  /* size of l2 cache.  */
2300   64,                                   /* size of prefetch block */
2301   6,                                    /* number of parallel prefetches */
2302   /* FIXME perhaps more appropriate value is 5.  */
2303   3,                                    /* Branch cost */
2304   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2305   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
2306   /* 10-24 */
2307   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
2308   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2309   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2310   COSTS_N_INSNS (23),                   /* cost of FSQRT instruction.  */
2311
2312   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2313   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2314   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2315   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2316   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2317   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2318   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
2319   COSTS_N_INSNS (32),                   /* cost of DIVSD instruction.  */
2320   COSTS_N_INSNS (30),                   /* cost of SQRTSS instruction.  */
2321   COSTS_N_INSNS (58),                   /* cost of SQRTSD instruction.  */
2322   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2323   core_memcpy,
2324   core_memset,
2325   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2326   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2327 };
2328