gcc/config/i386/x86-tune-costs.h

   1 /* Costs of operations of individual x86 CPUs.
   2    Copyright (C) 1988-2018 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3, or (at your option)
   9 any later version.
  10
  11 GCC is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 Under Section 7 of GPL version 3, you are granted additional
  17 permissions described in the GCC Runtime Library Exception, version
  18 3.1, as published by the Free Software Foundation.
  19
  20 You should have received a copy of the GNU General Public License and
  21 a copy of the GCC Runtime Library Exception along with this program;
  22 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 <http://www.gnu.org/licenses/>.  */
  24 /* Processor costs (relative to an add) */
  25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.
        COSTS_N_BYTES (N) multiplies a byte count N by 2, so under that
        assumption a 2-byte addition costs COSTS_N_BYTES (2) == 4, the same
        value as COSTS_N_INSNS (1).  */
  26 #define COSTS_N_BYTES(N) ((N) * 2)
  27
  28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
     /* DUMMY_STRINGOP_ALGS is a filler stringop_algs initializer whose only
        algorithm is libcall.  The string-operation tables in this file use
        it as their second array element when only the first element carries
        a tuned strategy.  */
  29
  30 static stringop_algs ix86_size_memcpy[2] = {
       /* memcpy strategy referenced by ix86_size_cost.  Both elements are
          identical: rep_prefix_1_byte as the leading algorithm and for the
          single {-1, ...} range entry.
          NOTE(review): presumably element 0 is the 32-bit and element 1 the
          64-bit strategy -- confirm against the stringop_algs users.  */
  31   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  32   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
  33 static stringop_algs ix86_size_memset[2] = {
       /* memset strategy referenced by ix86_size_cost.  Both elements are
          identical: rep_prefix_1_byte as the leading algorithm and for the
          single {-1, ...} range entry.
          NOTE(review): presumably element 0 is the 32-bit and element 1 the
          64-bit strategy -- confirm against the stringop_algs users.  */
  34   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  35   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
  36
  37 const
  38 struct processor_costs ix86_size_cost = {/* costs for tuning for size.
                                             The instruction entries below are
                                             built with COSTS_N_BYTES, i.e. on
                                             the byte-count scale of that macro,
                                             not with COSTS_N_INSNS.  The
                                             initializer is positional, so the
                                             order of the values must not
                                             change.  */
  39   COSTS_N_BYTES (2),                    /* cost of an add instruction */
  40   COSTS_N_BYTES (3),                    /* cost of a lea instruction */
  41   COSTS_N_BYTES (2),                    /* variable shift costs */
  42   COSTS_N_BYTES (3),                    /* constant shift costs */
  43   {COSTS_N_BYTES (3),                   /* cost of starting multiply for QI */
  44    COSTS_N_BYTES (3),                   /*                               HI */
  45    COSTS_N_BYTES (3),                   /*                               SI */
  46    COSTS_N_BYTES (3),                   /*                               DI */
  47    COSTS_N_BYTES (5)},                  /*                            other */
  48   0,                                    /* cost of multiply per each bit set */
  49   {COSTS_N_BYTES (3),                   /* cost of a divide/mod for QI */
  50    COSTS_N_BYTES (3),                   /*                          HI */
  51    COSTS_N_BYTES (3),                   /*                          SI */
  52    COSTS_N_BYTES (3),                   /*                          DI */
  53    COSTS_N_BYTES (5)},                  /*                          other */
  54   COSTS_N_BYTES (3),                    /* cost of movsx */
  55   COSTS_N_BYTES (3),                    /* cost of movzx */
  56   0,                                    /* "large" insn */
  57   2,                                    /* MOVE_RATIO */
  58
  59   /* All move costs are relative to integer->integer move times 2. */
  60   2,                                 /* cost for loading QImode using movzbl */
  61   {2, 2, 2},                            /* cost of loading integer registers
  62                                            in QImode, HImode and SImode.
  63                                            Relative to reg-reg move (2).  */
  64   {2, 2, 2},                            /* cost of storing integer registers */
  65   2,                                    /* cost of reg,reg fld/fst */
  66   {2, 2, 2},                            /* cost of loading fp registers
  67                                            in SFmode, DFmode and XFmode */
  68   {2, 2, 2},                            /* cost of storing fp registers
  69                                            in SFmode, DFmode and XFmode */
  70   3,                                    /* cost of moving MMX register */
  71   {3, 3},                               /* cost of loading MMX registers
  72                                            in SImode and DImode */
  73   {3, 3},                               /* cost of storing MMX registers
  74                                            in SImode and DImode */
  75   3, 3, 3,                              /* cost of moving XMM,YMM,ZMM register */
  76   {3, 3, 3, 3, 3},                      /* cost of loading SSE registers
  77                                            in 32,64,128,256 and 512-bit */
  78   {3, 3, 3, 3, 3},                      /* cost of unaligned SSE load.
  79                                            Five entries; presumably the same
                                              32,64,128,256 and 512-bit widths
                                              as the aligned load table above
                                              -- TODO confirm */
  80   {3, 3, 3, 3, 3},                      /* cost of storing SSE registers
  81                                            in 32,64,128,256 and 512-bit */
  82   {3, 3, 3, 3, 3},                              /* cost of unaligned SSE store.
  83                                            Five entries; presumably the same
                                              32,64,128,256 and 512-bit widths
                                              as the aligned store table above
                                              -- TODO confirm */
  84   3, 3,                                 /* SSE->integer and integer->SSE moves */
  85   5, 0,                                 /* Gather load static, per_elt.  */
  86   5, 0,                                 /* Gather store static, per_elt.  */
  87   0,                                    /* size of l1 cache  */
  88   0,                                    /* size of l2 cache  */
  89   0,                                    /* size of prefetch block */
  90   0,                                    /* number of parallel prefetches */
  91   2,                                    /* Branch cost */
  92   COSTS_N_BYTES (2),                    /* cost of FADD and FSUB insns.  */
  93   COSTS_N_BYTES (2),                    /* cost of FMUL instruction.  */
  94   COSTS_N_BYTES (2),                    /* cost of FDIV instruction.  */
  95   COSTS_N_BYTES (2),                    /* cost of FABS instruction.  */
  96   COSTS_N_BYTES (2),                    /* cost of FCHS instruction.  */
  97   COSTS_N_BYTES (2),                    /* cost of FSQRT instruction.  */
  98
  99   COSTS_N_BYTES (2),                    /* cost of cheap SSE instruction.  */
 100   COSTS_N_BYTES (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 101   COSTS_N_BYTES (2),                    /* cost of MULSS instruction.  */
 102   COSTS_N_BYTES (2),                    /* cost of MULSD instruction.  */
 103   COSTS_N_BYTES (2),                    /* cost of FMA SS instruction.  */
 104   COSTS_N_BYTES (2),                    /* cost of FMA SD instruction.  */
 105   COSTS_N_BYTES (2),                    /* cost of DIVSS instruction.  */
 106   COSTS_N_BYTES (2),                    /* cost of DIVSD instruction.  */
 107   COSTS_N_BYTES (2),                    /* cost of SQRTSS instruction.  */
 108   COSTS_N_BYTES (2),                    /* cost of SQRTSD instruction.  */
 109   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 110   ix86_size_memcpy,
 111   ix86_size_memset,
 112   COSTS_N_BYTES (1),                    /* cond_taken_branch_cost.  */
 113   COSTS_N_BYTES (1),                    /* cond_not_taken_branch_cost.  */
 114   NULL,                                 /* Loop alignment.  */
 115   NULL,                                 /* Jump alignment.  */
 116   NULL,                                 /* Label alignment.  */
 117   NULL,                                 /* Func alignment.  */
 118 };
 119
 120 /* Processor costs (relative to an add) */
 121 static stringop_algs i386_memcpy[2] = {
       /* memcpy strategy referenced by i386_cost.  The first element uses
          rep_prefix_1_byte as the leading algorithm and for the single
          {-1, ...} range entry; the second element is the libcall-only
          DUMMY_STRINGOP_ALGS filler.
          NOTE(review): presumably element 0 is the 32-bit and element 1 the
          64-bit strategy -- confirm against the stringop_algs users.  */
 122   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
 123   DUMMY_STRINGOP_ALGS};
 124 static stringop_algs i386_memset[2] = {
       /* memset strategy referenced by i386_cost.  The first element uses
          rep_prefix_1_byte as the leading algorithm and for the single
          {-1, ...} range entry; the second element is the libcall-only
          DUMMY_STRINGOP_ALGS filler.
          NOTE(review): presumably element 0 is the 32-bit and element 1 the
          64-bit strategy -- confirm against the stringop_algs users.  */
 125   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
 126   DUMMY_STRINGOP_ALGS};
 127
 128 static const
 129 struct processor_costs i386_cost = {    /* 386 specific costs.  Instruction
                                             entries are built with
                                             COSTS_N_INSNS.  The initializer is
                                             positional, so the order of the
                                             values must not change.  */
 130   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 131   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 132   COSTS_N_INSNS (3),                    /* variable shift costs */
 133   COSTS_N_INSNS (2),                    /* constant shift costs */
 134   {COSTS_N_INSNS (6),                   /* cost of starting multiply for QI */
 135    COSTS_N_INSNS (6),                   /*                               HI */
 136    COSTS_N_INSNS (6),                   /*                               SI */
 137    COSTS_N_INSNS (6),                   /*                               DI */
 138    COSTS_N_INSNS (6)},                  /*                            other */
 139   COSTS_N_INSNS (1),                    /* cost of multiply per each bit set */
 140   {COSTS_N_INSNS (23),                  /* cost of a divide/mod for QI */
 141    COSTS_N_INSNS (23),                  /*                          HI */
 142    COSTS_N_INSNS (23),                  /*                          SI */
 143    COSTS_N_INSNS (23),                  /*                          DI */
 144    COSTS_N_INSNS (23)},                 /*                          other */
 145   COSTS_N_INSNS (3),                    /* cost of movsx */
 146   COSTS_N_INSNS (2),                    /* cost of movzx */
 147   15,                                   /* "large" insn */
 148   3,                                    /* MOVE_RATIO */
 149
 150   /* All move costs are relative to integer->integer move times 2 and thus
 151      they are latency*2. */
 152   4,                                 /* cost for loading QImode using movzbl */
 153   {2, 4, 2},                            /* cost of loading integer registers
 154                                            in QImode, HImode and SImode.
 155                                            Relative to reg-reg move (2).  */
 156   {2, 4, 2},                            /* cost of storing integer registers */
 157   2,                                    /* cost of reg,reg fld/fst */
 158   {8, 8, 8},                            /* cost of loading fp registers
 159                                            in SFmode, DFmode and XFmode */
 160   {8, 8, 8},                            /* cost of storing fp registers
 161                                            in SFmode, DFmode and XFmode */
 162   2,                                    /* cost of moving MMX register */
 163   {4, 8},                               /* cost of loading MMX registers
 164                                            in SImode and DImode */
 165   {4, 8},                               /* cost of storing MMX registers
 166                                            in SImode and DImode */
 167   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 168   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 169                                            in 32,64,128,256 and 512-bit */
 170   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 171   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 172                                            in 32,64,128,256 and 512-bit */
 173   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 174   3, 3,                                 /* SSE->integer and integer->SSE moves */
 175   4, 4,                                 /* Gather load static, per_elt.  */
 176   4, 4,                                 /* Gather store static, per_elt.  */
 177   0,                                    /* size of l1 cache  */
 178   0,                                    /* size of l2 cache  */
 179   0,                                    /* size of prefetch block */
 180   0,                                    /* number of parallel prefetches */
 181   1,                                    /* Branch cost */
 182   COSTS_N_INSNS (23),                   /* cost of FADD and FSUB insns.  */
 183   COSTS_N_INSNS (27),                   /* cost of FMUL instruction.  */
 184   COSTS_N_INSNS (88),                   /* cost of FDIV instruction.  */
 185   COSTS_N_INSNS (22),                   /* cost of FABS instruction.  */
 186   COSTS_N_INSNS (24),                   /* cost of FCHS instruction.  */
 187   COSTS_N_INSNS (122),                  /* cost of FSQRT instruction.  */
 188
       /* Apart from the "cheap SSE" entry, the scalar SSE and FMA entries
          below repeat the x87 values above: add 23, mul/FMA 27, div 88,
          sqrt 122.  */
 189   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 190   COSTS_N_INSNS (23),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
 191   COSTS_N_INSNS (27),                   /* cost of MULSS instruction.  */
 192   COSTS_N_INSNS (27),                   /* cost of MULSD instruction.  */
 193   COSTS_N_INSNS (27),                   /* cost of FMA SS instruction.  */
 194   COSTS_N_INSNS (27),                   /* cost of FMA SD instruction.  */
 195   COSTS_N_INSNS (88),                   /* cost of DIVSS instruction.  */
 196   COSTS_N_INSNS (88),                   /* cost of DIVSD instruction.  */
 197   COSTS_N_INSNS (122),                  /* cost of SQRTSS instruction.  */
 198   COSTS_N_INSNS (122),                  /* cost of SQRTSD instruction.  */
 199   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 200   i386_memcpy,
 201   i386_memset,
 202   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 203   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 204   "4",                                  /* Loop alignment.  */
 205   "4",                                  /* Jump alignment.  */
 206   NULL,                                 /* Label alignment.  */
 207   "4",                                  /* Func alignment.  */
 208 };
 209
 210 static stringop_algs i486_memcpy[2] = {
       /* memcpy strategy referenced by i486_cost.  The first element uses
          rep_prefix_4_byte as the leading algorithm and for the single
          {-1, ...} range entry; the second element is the libcall-only
          DUMMY_STRINGOP_ALGS filler.
          NOTE(review): presumably element 0 is the 32-bit and element 1 the
          64-bit strategy -- confirm against the stringop_algs users.  */
 211   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 212   DUMMY_STRINGOP_ALGS};
 213 static stringop_algs i486_memset[2] = {
       /* memset strategy referenced by i486_cost.  The first element uses
          rep_prefix_4_byte as the leading algorithm and for the single
          {-1, ...} range entry; the second element is the libcall-only
          DUMMY_STRINGOP_ALGS filler.
          NOTE(review): presumably element 0 is the 32-bit and element 1 the
          64-bit strategy -- confirm against the stringop_algs users.  */
 214   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 215   DUMMY_STRINGOP_ALGS};
 216
 217 static const
 218 struct processor_costs i486_cost = {    /* 486 specific costs.  Instruction
                                             entries are built with
                                             COSTS_N_INSNS.  The initializer is
                                             positional, so the order of the
                                             values must not change.  */
 219   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 220   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 221   COSTS_N_INSNS (3),                    /* variable shift costs */
 222   COSTS_N_INSNS (2),                    /* constant shift costs */
 223   {COSTS_N_INSNS (12),                  /* cost of starting multiply for QI */
 224    COSTS_N_INSNS (12),                  /*                               HI */
 225    COSTS_N_INSNS (12),                  /*                               SI */
 226    COSTS_N_INSNS (12),                  /*                               DI */
 227    COSTS_N_INSNS (12)},                 /*                            other */
 228   1,                                    /* cost of multiply per each bit set.
                                              NOTE(review): a bare 1 here,
                                              whereas i386_cost uses
                                              COSTS_N_INSNS (1) for the same
                                              slot -- confirm the scale is
                                              intended.  */
 229   {COSTS_N_INSNS (40),                  /* cost of a divide/mod for QI */
 230    COSTS_N_INSNS (40),                  /*                          HI */
 231    COSTS_N_INSNS (40),                  /*                          SI */
 232    COSTS_N_INSNS (40),                  /*                          DI */
 233    COSTS_N_INSNS (40)},                 /*                          other */
 234   COSTS_N_INSNS (3),                    /* cost of movsx */
 235   COSTS_N_INSNS (2),                    /* cost of movzx */
 236   15,                                   /* "large" insn */
 237   3,                                    /* MOVE_RATIO */
 238
 239   /* All move costs are relative to integer->integer move times 2 and thus
 240      they are latency*2. */
 241   4,                                 /* cost for loading QImode using movzbl */
 242   {2, 4, 2},                            /* cost of loading integer registers
 243                                            in QImode, HImode and SImode.
 244                                            Relative to reg-reg move (2).  */
 245   {2, 4, 2},                            /* cost of storing integer registers */
 246   2,                                    /* cost of reg,reg fld/fst */
 247   {8, 8, 8},                            /* cost of loading fp registers
 248                                            in SFmode, DFmode and XFmode */
 249   {8, 8, 8},                            /* cost of storing fp registers
 250                                            in SFmode, DFmode and XFmode */
 251   2,                                    /* cost of moving MMX register */
 252   {4, 8},                               /* cost of loading MMX registers
 253                                            in SImode and DImode */
 254   {4, 8},                               /* cost of storing MMX registers
 255                                            in SImode and DImode */
 256   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 257   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 258                                            in 32,64,128,256 and 512-bit */
 259   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 260   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 261                                            in 32,64,128,256 and 512-bit */
 262   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 263   3, 3,                                 /* SSE->integer and integer->SSE moves */
 264   4, 4,                                 /* Gather load static, per_elt.  */
 265   4, 4,                                 /* Gather store static, per_elt.  */
 266   4,                                    /* size of l1 cache.  486 has 8kB cache
 267                                            shared for code and data, so 4kB is
 268                                            not really precise.  */
 269   4,                                    /* size of l2 cache  */
 270   0,                                    /* size of prefetch block */
 271   0,                                    /* number of parallel prefetches */
 272   1,                                    /* Branch cost */
 273   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
 274   COSTS_N_INSNS (16),                   /* cost of FMUL instruction.  */
 275   COSTS_N_INSNS (73),                   /* cost of FDIV instruction.  */
 276   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
 277   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
 278   COSTS_N_INSNS (83),                   /* cost of FSQRT instruction.  */
 279
 280   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 281   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 282   COSTS_N_INSNS (16),                   /* cost of MULSS instruction.  */
 283   COSTS_N_INSNS (16),                   /* cost of MULSD instruction.  */
 284   COSTS_N_INSNS (16),                   /* cost of FMA SS instruction.  */
 285   COSTS_N_INSNS (16),                   /* cost of FMA SD instruction.  */
 286   COSTS_N_INSNS (73),                   /* cost of DIVSS instruction.  */
 287   COSTS_N_INSNS (74),                   /* cost of DIVSD instruction.
                                              NOTE(review): 74, while FDIV and
                                              DIVSS in this table are 73; the
                                              i386 and pentium tables give
                                              DIVSS and DIVSD the same value as
                                              FDIV -- possibly a typo,
                                              confirm.  */
 288   COSTS_N_INSNS (83),                   /* cost of SQRTSS instruction.  */
 289   COSTS_N_INSNS (83),                   /* cost of SQRTSD instruction.  */
 290   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 291   i486_memcpy,
 292   i486_memset,
 293   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 294   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 295   "16",                                 /* Loop alignment.  */
 296   "16",                                 /* Jump alignment.  */
 297   "0:0:8",                              /* Label alignment.  */
 298   "16",                                 /* Func alignment.  */
 299 };
 300
 301 static stringop_algs pentium_memcpy[2] = {
       /* memcpy strategy referenced by pentium_cost and lakemont_cost.  The
          first element names libcall as the leading algorithm, followed by a
          {256, rep_prefix_4_byte} entry and a closing {-1, libcall} entry;
          the second element is the libcall-only DUMMY_STRINGOP_ALGS filler.
          NOTE(review): 256 presumably is an upper block-size bound and -1
          "no limit" -- confirm against the stringop_algs definition.  */
 302   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 303   DUMMY_STRINGOP_ALGS};
 304 static stringop_algs pentium_memset[2] = {
       /* memset strategy referenced by pentium_cost and lakemont_cost.  The
          first element names libcall as the leading algorithm and has a
          single {-1, rep_prefix_4_byte} range entry; the second element is
          the libcall-only DUMMY_STRINGOP_ALGS filler.
          NOTE(review): -1 presumably means "no size limit" -- confirm
          against the stringop_algs definition.  */
 305   {libcall, {{-1, rep_prefix_4_byte, false}}},
 306   DUMMY_STRINGOP_ALGS};
 307
 308 static const
 309 struct processor_costs pentium_cost = { /* Pentium cost table.  Instruction
                                             entries are built with
                                             COSTS_N_INSNS.  The initializer is
                                             positional, so the order of the
                                             values must not change.  */
 310   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 311   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 312   COSTS_N_INSNS (4),                    /* variable shift costs */
 313   COSTS_N_INSNS (1),                    /* constant shift costs */
 314   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 315    COSTS_N_INSNS (11),                  /*                               HI */
 316    COSTS_N_INSNS (11),                  /*                               SI */
 317    COSTS_N_INSNS (11),                  /*                               DI */
 318    COSTS_N_INSNS (11)},                 /*                            other */
 319   0,                                    /* cost of multiply per each bit set */
 320   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 321    COSTS_N_INSNS (25),                  /*                          HI */
 322    COSTS_N_INSNS (25),                  /*                          SI */
 323    COSTS_N_INSNS (25),                  /*                          DI */
 324    COSTS_N_INSNS (25)},                 /*                          other */
 325   COSTS_N_INSNS (3),                    /* cost of movsx */
 326   COSTS_N_INSNS (2),                    /* cost of movzx */
 327   8,                                    /* "large" insn */
 328   6,                                    /* MOVE_RATIO */
 329
 330   /* All move costs are relative to integer->integer move times 2 and thus
 331      they are latency*2. */
 332   6,                                 /* cost for loading QImode using movzbl */
 333   {2, 4, 2},                            /* cost of loading integer registers
 334                                            in QImode, HImode and SImode.
 335                                            Relative to reg-reg move (2).  */
 336   {2, 4, 2},                            /* cost of storing integer registers */
 337   2,                                    /* cost of reg,reg fld/fst */
 338   {2, 2, 6},                            /* cost of loading fp registers
 339                                            in SFmode, DFmode and XFmode */
 340   {4, 4, 6},                            /* cost of storing fp registers
 341                                            in SFmode, DFmode and XFmode */
 342   8,                                    /* cost of moving MMX register */
 343   {8, 8},                               /* cost of loading MMX registers
 344                                            in SImode and DImode */
 345   {8, 8},                               /* cost of storing MMX registers
 346                                            in SImode and DImode */
 347   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 348   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 349                                            in 32,64,128,256 and 512-bit */
 350   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 351   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 352                                            in 32,64,128,256 and 512-bit */
 353   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 354   3, 3,                                 /* SSE->integer and integer->SSE moves */
 355   4, 4,                                 /* Gather load static, per_elt.  */
 356   4, 4,                                 /* Gather store static, per_elt.  */
 357   8,                                    /* size of l1 cache.  */
 358   8,                                    /* size of l2 cache  */
 359   0,                                    /* size of prefetch block */
 360   0,                                    /* number of parallel prefetches */
 361   2,                                    /* Branch cost */
 362   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 363   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 364   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 365   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 366   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 367   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 368
       /* The scalar SSE add, mul, div and sqrt entries below repeat the x87
          values above (3, 3, 39, 70); the FMA entries are 6.  */
 369   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 370   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 371   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
 372   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
 373   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
 374   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
 375   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
 376   COSTS_N_INSNS (39),                   /* cost of DIVSD instruction.  */
 377   COSTS_N_INSNS (70),                   /* cost of SQRTSS instruction.  */
 378   COSTS_N_INSNS (70),                   /* cost of SQRTSD instruction.  */
 379   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 380   pentium_memcpy,
 381   pentium_memset,
 382   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 383   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 384   "16:8:8",                             /* Loop alignment.  */
 385   "16:8:8",                             /* Jump alignment.  */
 386   "0:0:8",                              /* Label alignment.  */
 387   "16",                                 /* Func alignment.  */
 388 };
 389
 390 static const
 391 struct processor_costs lakemont_cost = { /* Lakemont cost table.
                                             Instruction entries are built with
                                             COSTS_N_INSNS.  The string
                                             operation strategies are the
                                             pentium_memcpy and pentium_memset
                                             tables.  The initializer is
                                             positional, so the order of the
                                             values must not change.  */
 392   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 393   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction; one
                                              cost unit above the add entry */
 394   COSTS_N_INSNS (1),                    /* variable shift costs */
 395   COSTS_N_INSNS (1),                    /* constant shift costs */
 396   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 397    COSTS_N_INSNS (11),                  /*                               HI */
 398    COSTS_N_INSNS (11),                  /*                               SI */
 399    COSTS_N_INSNS (11),                  /*                               DI */
 400    COSTS_N_INSNS (11)},                 /*                            other */
 401   0,                                    /* cost of multiply per each bit set */
 402   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 403    COSTS_N_INSNS (25),                  /*                          HI */
 404    COSTS_N_INSNS (25),                  /*                          SI */
 405    COSTS_N_INSNS (25),                  /*                          DI */
 406    COSTS_N_INSNS (25)},                 /*                          other */
 407   COSTS_N_INSNS (3),                    /* cost of movsx */
 408   COSTS_N_INSNS (2),                    /* cost of movzx */
 409   8,                                    /* "large" insn */
 410   17,                                   /* MOVE_RATIO */
 411
 412   /* All move costs are relative to integer->integer move times 2 and thus
 413      they are latency*2. */
 414   6,                                 /* cost for loading QImode using movzbl */
 415   {2, 4, 2},                            /* cost of loading integer registers
 416                                            in QImode, HImode and SImode.
 417                                            Relative to reg-reg move (2).  */
 418   {2, 4, 2},                            /* cost of storing integer registers */
 419   2,                                    /* cost of reg,reg fld/fst */
 420   {2, 2, 6},                            /* cost of loading fp registers
 421                                            in SFmode, DFmode and XFmode */
 422   {4, 4, 6},                            /* cost of storing fp registers
 423                                            in SFmode, DFmode and XFmode */
 424   8,                                    /* cost of moving MMX register */
 425   {8, 8},                               /* cost of loading MMX registers
 426                                            in SImode and DImode */
 427   {8, 8},                               /* cost of storing MMX registers
 428                                            in SImode and DImode */
 429   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 430   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 431                                            in 32,64,128,256 and 512-bit */
 432   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 433   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 434                                            in 32,64,128,256 and 512-bit */
 435   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 436   3, 3,                                 /* SSE->integer and integer->SSE moves */
 437   4, 4,                                 /* Gather load static, per_elt.  */
 438   4, 4,                                 /* Gather store static, per_elt.  */
 439   8,                                    /* size of l1 cache.  */
 440   8,                                    /* size of l2 cache  */
 441   0,                                    /* size of prefetch block */
 442   0,                                    /* number of parallel prefetches */
 443   2,                                    /* Branch cost */
 444   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 445   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 446   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 447   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 448   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 449   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 450
       /* Unlike the x87 block above, the scalar SSE entries below carry their
          own values and distinguish single from double precision for divide
          and square root.  */
 451   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 452   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 453   COSTS_N_INSNS (5),                    /* cost of MULSS instruction.  */
 454   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
 455   COSTS_N_INSNS (10),                   /* cost of FMA SS instruction.  */
 456   COSTS_N_INSNS (10),                   /* cost of FMA SD instruction.  */
 457   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
 458   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
 459   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 460   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
 461   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 462   pentium_memcpy,
 463   pentium_memset,
 464   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 465   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 466   "16:8:8",                             /* Loop alignment.  */
 467   "16:8:8",                             /* Jump alignment.  */
 468   "0:0:8",                              /* Label alignment.  */
 469   "16",                                 /* Func alignment.  */
 470 };
 471
 472 /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
 473    (we ensure the alignment).  For small blocks an inline loop is still a
 474    noticeable win; for bigger blocks either rep movsl or rep movsb is the
 475    way to go.  Rep movsb apparently has a more expensive startup time in the
 476    CPU, but after 4K the difference is down in the noise.  */
 477 static stringop_algs pentiumpro_memcpy[2] = {
 478   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
 479                        {8192, rep_prefix_4_byte, false},
 480                        {-1, rep_prefix_1_byte, false}}},
 481   DUMMY_STRINGOP_ALGS};  /* Second entry: libcall-only placeholder.  */
 482 static stringop_algs pentiumpro_memset[2] = {  /* PentiumPro memset algorithms; referenced by pentiumpro_cost.  */
 483   {rep_prefix_4_byte, {{1024, unrolled_loop, false},
 484                        {8192, rep_prefix_4_byte, false},
 485                        {-1, libcall, false}}},
 486   DUMMY_STRINGOP_ALGS};  /* Second entry: libcall-only placeholder.  */
 487 static const
 488 struct processor_costs pentiumpro_cost = {  /* PentiumPro operation costs; initializers are positional.  */
 489   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 490   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 491   COSTS_N_INSNS (1),                    /* variable shift costs */
 492   COSTS_N_INSNS (1),                    /* constant shift costs */
 493   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
 494    COSTS_N_INSNS (4),                   /*                               HI */
 495    COSTS_N_INSNS (4),                   /*                               SI */
 496    COSTS_N_INSNS (4),                   /*                               DI */
 497    COSTS_N_INSNS (4)},                  /*                            other */
 498   0,                                    /* cost of multiply per each bit set */
 499   {COSTS_N_INSNS (17),                  /* cost of a divide/mod for QI */
 500    COSTS_N_INSNS (17),                  /*                          HI */
 501    COSTS_N_INSNS (17),                  /*                          SI */
 502    COSTS_N_INSNS (17),                  /*                          DI */
 503    COSTS_N_INSNS (17)},                 /*                          other */
 504   COSTS_N_INSNS (1),                    /* cost of movsx */
 505   COSTS_N_INSNS (1),                    /* cost of movzx */
 506   8,                                    /* "large" insn */
 507   6,                                    /* MOVE_RATIO */
 508
 509   /* All move costs are relative to integer->integer move times 2 and thus
 510      they are latency*2. */
 511   2,                                 /* cost for loading QImode using movzbl */
 512   {4, 4, 4},                            /* cost of loading integer registers
 513                                            in QImode, HImode and SImode.
 514                                            Relative to reg-reg move (2).  */
 515   {2, 2, 2},                            /* cost of storing integer registers */
 516   2,                                    /* cost of reg,reg fld/fst */
 517   {2, 2, 6},                            /* cost of loading fp registers
 518                                            in SFmode, DFmode and XFmode */
 519   {4, 4, 6},                            /* cost of storing fp registers
 520                                            in SFmode, DFmode and XFmode */
 521   2,                                    /* cost of moving MMX register */
 522   {2, 2},                               /* cost of loading MMX registers
 523                                            in SImode and DImode */
 524   {2, 2},                               /* cost of storing MMX registers
 525                                            in SImode and DImode */
 526   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 527   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 528                                            in 32,64,128,256 and 512-bit */
 529   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 530   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 531                                            in 32,64,128,256 and 512-bit */
 532   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 533   3, 3,                                 /* SSE->integer and integer->SSE moves */
 534   4, 4,                                 /* Gather load static, per_elt.  */
 535   4, 4,                                 /* Gather store static, per_elt.  */
 536   8,                                    /* size of l1 cache.  */
 537   256,                                  /* size of l2 cache.  */
 538   32,                                   /* size of prefetch block */
 539   6,                                    /* number of parallel prefetches */
 540   2,                                    /* Branch cost */
 541   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 542   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
 543   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 544   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 545   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 546   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 547
 548   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 549   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 550   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 551   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 552   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
 553   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
 554   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
 555   COSTS_N_INSNS (18),                   /* cost of DIVSD instruction.  */
 556   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 557   COSTS_N_INSNS (31),                   /* cost of SQRTSD instruction.  */
 558   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 559   pentiumpro_memcpy,                    /* memcpy algorithm table defined above.  */
 560   pentiumpro_memset,                    /* memset algorithm table defined above.  */
 561   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 562   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 563   "16",                                 /* Loop alignment.  */
 564   "16:11:8",                            /* Jump alignment.  */
 565   "0:0:8",                              /* Label alignment.  */
 566   "16",                                 /* Func alignment.  */
 567 };
 568
 569 static stringop_algs geode_memcpy[2] = {  /* Geode memcpy algorithms; referenced by geode_cost.  */
 570   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 571   DUMMY_STRINGOP_ALGS};  /* Second entry: libcall-only placeholder.  */
 572 static stringop_algs geode_memset[2] = {  /* Geode memset algorithms; referenced by geode_cost.  */
 573   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 574   DUMMY_STRINGOP_ALGS};  /* Second entry: libcall-only placeholder.  */
 575 static const
 576 struct processor_costs geode_cost = {  /* Geode operation costs; initializers are positional.  */
 577   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 578   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 579   COSTS_N_INSNS (2),                    /* variable shift costs */
 580   COSTS_N_INSNS (1),                    /* constant shift costs */
 581   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 582    COSTS_N_INSNS (4),                   /*                               HI */
 583    COSTS_N_INSNS (7),                   /*                               SI */
 584    COSTS_N_INSNS (7),                   /*                               DI */
 585    COSTS_N_INSNS (7)},                  /*                            other */
 586   0,                                    /* cost of multiply per each bit set */
 587   {COSTS_N_INSNS (15),                  /* cost of a divide/mod for QI */
 588    COSTS_N_INSNS (23),                  /*                          HI */
 589    COSTS_N_INSNS (39),                  /*                          SI */
 590    COSTS_N_INSNS (39),                  /*                          DI */
 591    COSTS_N_INSNS (39)},                 /*                          other */
 592   COSTS_N_INSNS (1),                    /* cost of movsx */
 593   COSTS_N_INSNS (1),                    /* cost of movzx */
 594   8,                                    /* "large" insn */
 595   4,                                    /* MOVE_RATIO */
 596
 597   /* All move costs are relative to integer->integer move times 2 and thus
 598      they are latency*2. */
 599   2,                                 /* cost for loading QImode using movzbl */
 600   {2, 2, 2},                            /* cost of loading integer registers
 601                                            in QImode, HImode and SImode.
 602                                            Relative to reg-reg move (2).  */
 603   {2, 2, 2},                            /* cost of storing integer registers */
 604   2,                                    /* cost of reg,reg fld/fst */
 605   {2, 2, 2},                            /* cost of loading fp registers
 606                                            in SFmode, DFmode and XFmode */
 607   {4, 6, 6},                            /* cost of storing fp registers
 608                                            in SFmode, DFmode and XFmode */
 609
 610   2,                                    /* cost of moving MMX register */
 611   {2, 2},                               /* cost of loading MMX registers
 612                                            in SImode and DImode */
 613   {2, 2},                               /* cost of storing MMX registers
 614                                            in SImode and DImode */
 615   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 616   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 617                                            in 32,64,128,256 and 512-bit */
 618   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 619   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 620                                            in 32,64,128,256 and 512-bit */
 621   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 622   6, 6,                                 /* SSE->integer and integer->SSE moves */
 623   2, 2,                                 /* Gather load static, per_elt.  */
 624   2, 2,                                 /* Gather store static, per_elt.  */
 625   64,                                   /* size of l1 cache.  */
 626   128,                                  /* size of l2 cache.  */
 627   32,                                   /* size of prefetch block */
 628   1,                                    /* number of parallel prefetches */
 629   1,                                    /* Branch cost */
 630   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
 631   COSTS_N_INSNS (11),                   /* cost of FMUL instruction.  */
 632   COSTS_N_INSNS (47),                   /* cost of FDIV instruction.  */
 633   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 634   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 635   COSTS_N_INSNS (54),                   /* cost of FSQRT instruction.  */
 636
 637   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 638   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 639   COSTS_N_INSNS (11),                   /* cost of MULSS instruction.  */
 640   COSTS_N_INSNS (11),                   /* cost of MULSD instruction.  */
 641   COSTS_N_INSNS (17),                   /* cost of FMA SS instruction.  */
 642   COSTS_N_INSNS (17),                   /* cost of FMA SD instruction.  */
 643   COSTS_N_INSNS (47),                   /* cost of DIVSS instruction.  */
 644   COSTS_N_INSNS (47),                   /* cost of DIVSD instruction.  */
 645   COSTS_N_INSNS (54),                   /* cost of SQRTSS instruction.  */
 646   COSTS_N_INSNS (54),                   /* cost of SQRTSD instruction.  */
 647   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 648   geode_memcpy,                         /* memcpy algorithm table defined above.  */
 649   geode_memset,                         /* memset algorithm table defined above.  */
 650   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 651   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 652   NULL,                                 /* Loop alignment (none specified).  */
 653   NULL,                                 /* Jump alignment (none specified).  */
 654   NULL,                                 /* Label alignment (none specified).  */
 655   NULL,                                 /* Func alignment (none specified).  */
 656 };
 657
 658 static stringop_algs k6_memcpy[2] = {  /* K6 memcpy algorithms; referenced by k6_cost.  */
 659   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 660   DUMMY_STRINGOP_ALGS};  /* Second entry: libcall-only placeholder.  */
 661 static stringop_algs k6_memset[2] = {  /* K6 memset algorithms; referenced by k6_cost.  */
 662   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 663   DUMMY_STRINGOP_ALGS};  /* Second entry: libcall-only placeholder.  */
 664 static const
 665 struct processor_costs k6_cost = {  /* K6 operation costs; initializers are positional.  */
 666   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 667   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 668   COSTS_N_INSNS (1),                    /* variable shift costs */
 669   COSTS_N_INSNS (1),                    /* constant shift costs */
 670   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 671    COSTS_N_INSNS (3),                   /*                               HI */
 672    COSTS_N_INSNS (3),                   /*                               SI */
 673    COSTS_N_INSNS (3),                   /*                               DI */
 674    COSTS_N_INSNS (3)},                  /*                            other */
 675   0,                                    /* cost of multiply per each bit set */
 676   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 677    COSTS_N_INSNS (18),                  /*                          HI */
 678    COSTS_N_INSNS (18),                  /*                          SI */
 679    COSTS_N_INSNS (18),                  /*                          DI */
 680    COSTS_N_INSNS (18)},                 /*                          other */
 681   COSTS_N_INSNS (2),                    /* cost of movsx */
 682   COSTS_N_INSNS (2),                    /* cost of movzx */
 683   8,                                    /* "large" insn */
 684   4,                                    /* MOVE_RATIO */
 685
 686   /* All move costs are relative to integer->integer move times 2 and thus
 687      they are latency*2. */
 688   3,                                 /* cost for loading QImode using movzbl */
 689   {4, 5, 4},                            /* cost of loading integer registers
 690                                            in QImode, HImode and SImode.
 691                                            Relative to reg-reg move (2).  */
 692   {2, 3, 2},                            /* cost of storing integer registers */
 693   4,                                    /* cost of reg,reg fld/fst */
 694   {6, 6, 6},                            /* cost of loading fp registers
 695                                            in SFmode, DFmode and XFmode */
 696   {4, 4, 4},                            /* cost of storing fp registers
 697                                            in SFmode, DFmode and XFmode */
 698   2,                                    /* cost of moving MMX register */
 699   {2, 2},                               /* cost of loading MMX registers
 700                                            in SImode and DImode */
 701   {2, 2},                               /* cost of storing MMX registers
 702                                            in SImode and DImode */
 703   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 704   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 705                                            in 32,64,128,256 and 512-bit */
 706   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 707   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 708                                            in 32,64,128,256 and 512-bit */
 709   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 710   6, 6,                                 /* SSE->integer and integer->SSE moves */
 711   2, 2,                                 /* Gather load static, per_elt.  */
 712   2, 2,                                 /* Gather store static, per_elt.  */
 713   32,                                   /* size of l1 cache.  */
 714   32,                                   /* size of l2 cache.  Some models
 715                                            have an integrated l2 cache, but
 716                                            optimizing for k6 is not important
 717                                            enough to worry about that.  */
 718   32,                                   /* size of prefetch block */
 719   1,                                    /* number of parallel prefetches */
 720   1,                                    /* Branch cost */
 721   COSTS_N_INSNS (2),                    /* cost of FADD and FSUB insns.  */
 722   COSTS_N_INSNS (2),                    /* cost of FMUL instruction.  */
 723   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 724   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 725   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 726   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 727
 728   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 729   COSTS_N_INSNS (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 730   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
 731   COSTS_N_INSNS (2),                    /* cost of MULSD instruction.  */
 732   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
 733   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
 734   COSTS_N_INSNS (56),                   /* cost of DIVSS instruction.  */
 735   COSTS_N_INSNS (56),                   /* cost of DIVSD instruction.  */
 736   COSTS_N_INSNS (56),                   /* cost of SQRTSS instruction.  */
 737   COSTS_N_INSNS (56),                   /* cost of SQRTSD instruction.  */
 738   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 739   k6_memcpy,                            /* memcpy algorithm table defined above.  */
 740   k6_memset,                            /* memset algorithm table defined above.  */
 741   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 742   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 743   "32:8:8",                             /* Loop alignment.  */
 744   "32:8:8",                             /* Jump alignment.  */
 745   "0:0:8",                              /* Label alignment.  */
 746   "32",                                 /* Func alignment.  */
 747 };
 748
 749 /* For some reason, Athlon deals better with the REP prefix (relative to
 750    loops) than K8 does.  Alignment becomes important after 8 bytes for memcpy
 751    and 128 bytes for memset.  */
 752 static stringop_algs athlon_memcpy[2] = {
 753   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 754   DUMMY_STRINGOP_ALGS};  /* Second entry: libcall-only placeholder.  */
 755 static stringop_algs athlon_memset[2] = {  /* Athlon memset algorithms; referenced by athlon_cost.  */
 756   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 757   DUMMY_STRINGOP_ALGS};  /* Second entry: libcall-only placeholder.  */
 758 static const
 759 struct processor_costs athlon_cost = {  /* Athlon operation costs; initializers are positional.  */
 760   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 761   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 762   COSTS_N_INSNS (1),                    /* variable shift costs */
 763   COSTS_N_INSNS (1),                    /* constant shift costs */
 764   {COSTS_N_INSNS (5),                   /* cost of starting multiply for QI */
 765    COSTS_N_INSNS (5),                   /*                               HI */
 766    COSTS_N_INSNS (5),                   /*                               SI */
 767    COSTS_N_INSNS (5),                   /*                               DI */
 768    COSTS_N_INSNS (5)},                  /*                            other */
 769   0,                                    /* cost of multiply per each bit set */
 770   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 771    COSTS_N_INSNS (26),                  /*                          HI */
 772    COSTS_N_INSNS (42),                  /*                          SI */
 773    COSTS_N_INSNS (74),                  /*                          DI */
 774    COSTS_N_INSNS (74)},                 /*                          other */
 775   COSTS_N_INSNS (1),                    /* cost of movsx */
 776   COSTS_N_INSNS (1),                    /* cost of movzx */
 777   8,                                    /* "large" insn */
 778   9,                                    /* MOVE_RATIO */
 779
 780   /* All move costs are relative to integer->integer move times 2 and thus
 781      they are latency*2. */
 782   4,                                 /* cost for loading QImode using movzbl */
 783   {3, 4, 3},                            /* cost of loading integer registers
 784                                            in QImode, HImode and SImode.
 785                                            Relative to reg-reg move (2).  */
 786   {3, 4, 3},                            /* cost of storing integer registers */
 787   4,                                    /* cost of reg,reg fld/fst */
 788   {4, 4, 12},                           /* cost of loading fp registers
 789                                            in SFmode, DFmode and XFmode */
 790   {6, 6, 8},                            /* cost of storing fp registers
 791                                            in SFmode, DFmode and XFmode */
 792   2,                                    /* cost of moving MMX register */
 793   {4, 4},                               /* cost of loading MMX registers
 794                                            in SImode and DImode */
 795   {4, 4},                               /* cost of storing MMX registers
 796                                            in SImode and DImode */
 797   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 798   {4, 4, 6, 12, 24},                    /* cost of loading SSE registers
 799                                            in 32,64,128,256 and 512-bit */
 800   {4, 4, 6, 12, 24},                    /* cost of unaligned loads.  */
 801   {4, 4, 5, 10, 20},                    /* cost of storing SSE registers
 802                                            in 32,64,128,256 and 512-bit */
 803   {4, 4, 5, 10, 20},                    /* cost of unaligned stores.  */
 804   5, 5,                                 /* SSE->integer and integer->SSE moves */
 805   4, 4,                                 /* Gather load static, per_elt.  */
 806   4, 4,                                 /* Gather store static, per_elt.  */
 807   64,                                   /* size of l1 cache.  */
 808   256,                                  /* size of l2 cache.  */
 809   64,                                   /* size of prefetch block */
 810   6,                                    /* number of parallel prefetches */
 811   5,                                    /* Branch cost */
 812   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 813   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 814   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
 815   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 816   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 817   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 818
 819   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
 820   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 821   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 822   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 823   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
 824   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
 825   /* 11-16 -- NOTE(review): presumably the DIVSS latency range in cycles; confirm.  */
 826   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
 827   COSTS_N_INSNS (24),                   /* cost of DIVSD instruction.  */
 828   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
 829   COSTS_N_INSNS (19),                   /* cost of SQRTSD instruction.  */
 830   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 831   athlon_memcpy,                        /* memcpy algorithm table defined above.  */
 832   athlon_memset,                        /* memset algorithm table defined above.  */
 833   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 834   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 835   "16:8:8",                             /* Loop alignment.  */
 836   "16:8:8",                             /* Jump alignment.  */
 837   "0:0:8",                              /* Label alignment.  */
 838   "16",                                 /* Func alignment.  */
 839 };
 840
 841 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
 842    small blocks it is better to use a loop.  For large blocks, a libcall can
 843    use nontemporal accesses and beat inline code considerably.  */
 844 static stringop_algs k8_memcpy[2] = {
 845   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
 846              {-1, rep_prefix_4_byte, false}}},
 847   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},  /* NOTE(review): 8-byte rep prefix, presumably the 64-bit entry -- confirm.  */
 848              {-1, libcall, false}}}};
 849 static stringop_algs k8_memset[2] = {  /* K8 memset algorithms; referenced by k8_cost.  */
 850   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 851              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 852   {libcall, {{48, unrolled_loop, false},  /* NOTE(review): this entry uses the 8-byte rep prefix, presumably 64-bit -- confirm.  */
 853              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 854 static const
 855 struct processor_costs k8_cost = {  /* K8 operation costs; initializers are positional.  */
 856   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 857   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 858   COSTS_N_INSNS (1),                    /* variable shift costs */
 859   COSTS_N_INSNS (1),                    /* constant shift costs */
 860   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 861    COSTS_N_INSNS (4),                   /*                               HI */
 862    COSTS_N_INSNS (3),                   /*                               SI */
 863    COSTS_N_INSNS (4),                   /*                               DI */
 864    COSTS_N_INSNS (5)},                  /*                            other */
 865   0,                                    /* cost of multiply per each bit set */
 866   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 867    COSTS_N_INSNS (26),                  /*                          HI */
 868    COSTS_N_INSNS (42),                  /*                          SI */
 869    COSTS_N_INSNS (74),                  /*                          DI */
 870    COSTS_N_INSNS (74)},                 /*                          other */
 871   COSTS_N_INSNS (1),                    /* cost of movsx */
 872   COSTS_N_INSNS (1),                    /* cost of movzx */
 873   8,                                    /* "large" insn */
 874   9,                                    /* MOVE_RATIO */
 875
 876   /* All move costs are relative to integer->integer move times 2 and thus
 877      they are latency*2. */
 878   4,                                 /* cost for loading QImode using movzbl */
 879   {3, 4, 3},                            /* cost of loading integer registers
 880                                            in QImode, HImode and SImode.
 881                                            Relative to reg-reg move (2).  */
 882   {3, 4, 3},                            /* cost of storing integer registers */
 883   4,                                    /* cost of reg,reg fld/fst */
 884   {4, 4, 12},                           /* cost of loading fp registers
 885                                            in SFmode, DFmode and XFmode */
 886   {6, 6, 8},                            /* cost of storing fp registers
 887                                            in SFmode, DFmode and XFmode */
 888   2,                                    /* cost of moving MMX register */
 889   {3, 3},                               /* cost of loading MMX registers
 890                                            in SImode and DImode */
 891   {4, 4},                               /* cost of storing MMX registers
 892                                            in SImode and DImode */
 893   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 894   {4, 3, 6, 12, 24},                    /* cost of loading SSE registers
 895                                            in 32,64,128,256 and 512-bit */
 896   {4, 3, 6, 12, 24},                    /* cost of unaligned loads.  */
 897   {4, 4, 5, 10, 20},                    /* cost of storing SSE registers
 898                                            in 32,64,128,256 and 512-bit */
 899   {4, 4, 5, 10, 20},                    /* cost of unaligned stores.  */
 900   5, 5,                                 /* SSE->integer and integer->SSE moves */
 901   4, 4,                                 /* Gather load static, per_elt.  */
 902   4, 4,                                 /* Gather store static, per_elt.  */
 903   64,                                   /* size of l1 cache.  */
 904   512,                                  /* size of l2 cache.  */
 905   64,                                   /* size of prefetch block */
 906   /* New AMD processors never drop prefetches; if they cannot be performed
 907      immediately, they are queued.  We set the number of simultaneous
 908      prefetches to a large constant to reflect this (leaving the number of
 909      prefetches completely unlimited is probably not a good idea, as
 910      executing them also takes some time).  */
 911   100,                                  /* number of parallel prefetches */
 912   3,                                    /* Branch cost */
 913   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 914   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 915   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
 916   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 917   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 918   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 919
 920   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
 921   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 922   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 923   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 924   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
 925   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
 926   /* 11-16 -- NOTE(review): presumably the DIVSS latency range in cycles; confirm.  */
 927   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
 928   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
 929   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
 930   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
 931   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 932   k8_memcpy,                            /* memcpy algorithm table defined above.  */
 933   k8_memset,                            /* memset algorithm table defined above.  */
 934   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 935   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
 936   "16:8:8",                             /* Loop alignment.  */
 937   "16:8:8",                             /* Jump alignment.  */
 938   "0:0:8",                              /* Label alignment.  */
 939   "16",                                 /* Func alignment.  */
 940 };
 941
 942 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
 943    very small blocks it is better to use a loop.  For large blocks, a libcall
 944    can do non-temporal accesses and beat inline code considerably.  */
 945 static stringop_algs amdfam10_memcpy[2] = {  /* memcpy strategies for AMDFAM10 */
 946   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
 947              {-1, rep_prefix_4_byte, false}}},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
 948   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
 949              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
 950 static stringop_algs amdfam10_memset[2] = {  /* memset strategies for AMDFAM10; same layout as amdfam10_memcpy */
 951   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 952              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
 953   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
 954              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
 955 struct processor_costs amdfam10_cost = {  /* costs for tuning for AMDFAM10 */
 956   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 957   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 958   COSTS_N_INSNS (1),                    /* variable shift costs */
 959   COSTS_N_INSNS (1),                    /* constant shift costs */
 960   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 961    COSTS_N_INSNS (4),                   /*                               HI */
 962    COSTS_N_INSNS (3),                   /*                               SI */
 963    COSTS_N_INSNS (4),                   /*                               DI */
 964    COSTS_N_INSNS (5)},                  /*                            other */
 965   0,                                    /* cost of multiply per each bit set */
 966   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
 967    COSTS_N_INSNS (35),                  /*                          HI */
 968    COSTS_N_INSNS (51),                  /*                          SI */
 969    COSTS_N_INSNS (83),                  /*                          DI */
 970    COSTS_N_INSNS (83)},                 /*                          other */
 971   COSTS_N_INSNS (1),                    /* cost of movsx */
 972   COSTS_N_INSNS (1),                    /* cost of movzx */
 973   8,                                    /* "large" insn */
 974   9,                                    /* MOVE_RATIO */
 975
 976   /* All move costs are relative to an integer->integer move, which is
 977      counted as 2; the values are therefore latency*2.  */
 978   4,                                 /* cost for loading QImode using movzbl */
 979   {3, 4, 3},                            /* cost of loading integer registers
 980                                            in QImode, HImode and SImode.
 981                                            Relative to reg-reg move (2).  */
 982   {3, 4, 3},                            /* cost of storing integer registers */
 983   4,                                    /* cost of reg,reg fld/fst */
 984   {4, 4, 12},                           /* cost of loading fp registers
 985                                            in SFmode, DFmode and XFmode */
 986   {6, 6, 8},                            /* cost of storing fp registers
 987                                            in SFmode, DFmode and XFmode */
 988   2,                                    /* cost of moving MMX register */
 989   {3, 3},                               /* cost of loading MMX registers
 990                                            in SImode and DImode */
 991   {4, 4},                               /* cost of storing MMX registers
 992                                            in SImode and DImode */
 993   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 994   {4, 4, 3, 6, 12},                     /* cost of loading SSE registers
 995                                            in 32,64,128,256 and 512-bit */
 996   {4, 4, 3, 7, 12},                     /* cost of unaligned loads.  */
 997   {4, 4, 5, 10, 20},                    /* cost of storing SSE registers
 998                                            in 32,64,128,256 and 512-bit */
 999   {4, 4, 5, 10, 20},                    /* cost of unaligned stores.  */
1000   3, 3,                                 /* SSE->integer and integer->SSE moves */
1001                                         /* On K8:
1002                                             MOVD reg64, xmmreg Double FSTORE 4
1003                                             MOVD reg32, xmmreg Double FSTORE 4
1004                                            On AMDFAM10:
1005                                             MOVD reg64, xmmreg Double FADD 3
1006                                                                1/1  1/1
1007                                             MOVD reg32, xmmreg Double FADD 3
1008                                                                1/1  1/1 */
1009   4, 4,                                 /* Gather load static, per_elt.  */
1010   4, 4,                                 /* Gather store static, per_elt.  */
1011   64,                                   /* size of l1 cache (presumably in KB -- TODO confirm).  */
1012   512,                                  /* size of l2 cache (presumably in KB -- TODO confirm).  */
1013   64,                                   /* size of prefetch block */
1014   /* New AMD processors never drop prefetches; if they cannot be performed
1015      immediately, they are queued.  We set the number of simultaneous
1016      prefetches to a large constant to reflect this (leaving the number of
1017      prefetches completely unlimited is probably not a good idea, as their
1018      execution also takes some time).  */
1019   100,                                  /* number of parallel prefetches */
1020   2,                                    /* Branch cost */
1021   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1022   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1023   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1024   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1025   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1026   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1027
1028   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1029   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1030   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1031   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1032   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
1033   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
1034   /* 11-16 (presumably the latency range; the cost below uses 16).  */
1035   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
1036   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
1037   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1038   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
1039   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1040   amdfam10_memcpy,                      /* memcpy strategy table.  */
1041   amdfam10_memset,                      /* memset strategy table.  */
1042   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1043   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1044   "32:25:8",                            /* Loop alignment.  */
1045   "32:8:8",                             /* Jump alignment.  */
1046   "0:0:8",                              /* Label alignment.  */
1047   "32",                                 /* Func alignment.  */
1048 };
1049
1050 /*  BDVER1 has an optimized REP instruction for medium-sized blocks, but for
1051     very small blocks it is better to use a loop.  For large blocks, a libcall
1052     can do non-temporal accesses and beat inline code considerably.  */
1053 static stringop_algs bdver1_memcpy[2] = {  /* memcpy strategies for BDVER1 */
1054   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1055              {-1, rep_prefix_4_byte, false}}},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
1056   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1057              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
1058 static stringop_algs bdver1_memset[2] = {  /* memset strategies for BDVER1; same layout as bdver1_memcpy */
1059   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1060              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
1061   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1062              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
1063
1064 const struct processor_costs bdver1_cost = {  /* costs for tuning for BDVER1 */
1065   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1066   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1067   COSTS_N_INSNS (1),                    /* variable shift costs */
1068   COSTS_N_INSNS (1),                    /* constant shift costs */
1069   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1070    COSTS_N_INSNS (4),                   /*                               HI */
1071    COSTS_N_INSNS (4),                   /*                               SI */
1072    COSTS_N_INSNS (6),                   /*                               DI */
1073    COSTS_N_INSNS (6)},                  /*                            other */
1074   0,                                    /* cost of multiply per each bit set */
1075   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1076    COSTS_N_INSNS (35),                  /*                          HI */
1077    COSTS_N_INSNS (51),                  /*                          SI */
1078    COSTS_N_INSNS (83),                  /*                          DI */
1079    COSTS_N_INSNS (83)},                 /*                          other */
1080   COSTS_N_INSNS (1),                    /* cost of movsx */
1081   COSTS_N_INSNS (1),                    /* cost of movzx */
1082   8,                                    /* "large" insn */
1083   9,                                    /* MOVE_RATIO */
1084
1085   /* All move costs are relative to an integer->integer move, which is
1086      counted as 2; the values are therefore latency*2.  */
1087   8,                                 /* cost for loading QImode using movzbl */
1088   {8, 8, 8},                            /* cost of loading integer registers
1089                                            in QImode, HImode and SImode.
1090                                            Relative to reg-reg move (2).  */
1091   {8, 8, 8},                            /* cost of storing integer registers */
1092   4,                                    /* cost of reg,reg fld/fst */
1093   {12, 12, 28},                         /* cost of loading fp registers
1094                                            in SFmode, DFmode and XFmode */
1095   {10, 10, 18},                         /* cost of storing fp registers
1096                                            in SFmode, DFmode and XFmode */
1097   4,                                    /* cost of moving MMX register */
1098   {12, 12},                             /* cost of loading MMX registers
1099                                            in SImode and DImode */
1100   {10, 10},                             /* cost of storing MMX registers
1101                                            in SImode and DImode */
1102   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1103   {12, 12, 10, 20, 30},                 /* cost of loading SSE registers
1104                                            in 32,64,128,256 and 512-bit */
1105   {12, 12, 10, 20, 30},                 /* cost of unaligned loads.  */
1106   {10, 10, 10, 20, 30},                 /* cost of storing SSE registers
1107                                            in 32,64,128,256 and 512-bit */
1108   {10, 10, 10, 20, 30},                 /* cost of unaligned stores.  */
1109   16, 20,                               /* SSE->integer and integer->SSE moves */
1110   12, 12,                               /* Gather load static, per_elt.  */
1111   10, 10,                               /* Gather store static, per_elt.  */
1112   16,                                   /* size of l1 cache (presumably in KB -- TODO confirm).  */
1113   2048,                                 /* size of l2 cache (presumably in KB -- TODO confirm).  */
1114   64,                                   /* size of prefetch block */
1115   /* New AMD processors never drop prefetches; if they cannot be performed
1116      immediately, they are queued.  We set the number of simultaneous
1117      prefetches to a large constant to reflect this (leaving the number of
1118      prefetches completely unlimited is probably not a good idea, as their
1119      execution also takes some time).  */
1120   100,                                  /* number of parallel prefetches */
1121   2,                                    /* Branch cost */
1122   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1123   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1124   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1125   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1126   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1127   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1128
1129   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1130   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1131   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1132   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1133   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1134   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1135   /* 9-24 (presumably the latency range; the cost below uses 24).  */
1136   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1137   /* 9-27 (presumably the latency range; the cost below uses 27).  */
1138   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1139   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1140   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1141   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1142   bdver1_memcpy,                        /* memcpy strategy table.  */
1143   bdver1_memset,                        /* memset strategy table.  */
1144   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1145   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1146   "16:11:8",                            /* Loop alignment.  */
1147   "16:8:8",                             /* Jump alignment.  */
1148   "0:0:8",                              /* Label alignment.  */
1149   "11",                                 /* Func alignment.  */
1150 };
1151
1152 /*  BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1153     very small blocks it is better to use a loop.  For large blocks, a libcall
1154     can do non-temporal accesses and beat inline code considerably.  */
1155
1156 static stringop_algs bdver2_memcpy[2] = {  /* memcpy strategies for BDVER2 */
1157   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1158              {-1, rep_prefix_4_byte, false}}},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
1159   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1160              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
1161 static stringop_algs bdver2_memset[2] = {  /* memset strategies for BDVER2; same layout as bdver2_memcpy */
1162   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1163              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
1164   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1165              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
1166
1167 const struct processor_costs bdver2_cost = {  /* costs for tuning for BDVER2 */
1168   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1169   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1170   COSTS_N_INSNS (1),                    /* variable shift costs */
1171   COSTS_N_INSNS (1),                    /* constant shift costs */
1172   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1173    COSTS_N_INSNS (4),                   /*                               HI */
1174    COSTS_N_INSNS (4),                   /*                               SI */
1175    COSTS_N_INSNS (6),                   /*                               DI */
1176    COSTS_N_INSNS (6)},                  /*                            other */
1177   0,                                    /* cost of multiply per each bit set */
1178   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1179    COSTS_N_INSNS (35),                  /*                          HI */
1180    COSTS_N_INSNS (51),                  /*                          SI */
1181    COSTS_N_INSNS (83),                  /*                          DI */
1182    COSTS_N_INSNS (83)},                 /*                          other */
1183   COSTS_N_INSNS (1),                    /* cost of movsx */
1184   COSTS_N_INSNS (1),                    /* cost of movzx */
1185   8,                                    /* "large" insn */
1186   9,                                    /* MOVE_RATIO */
1187
1188   /* All move costs are relative to an integer->integer move, which is
1189      counted as 2; the values are therefore latency*2.  */
1190   8,                                 /* cost for loading QImode using movzbl */
1191   {8, 8, 8},                            /* cost of loading integer registers
1192                                            in QImode, HImode and SImode.
1193                                            Relative to reg-reg move (2).  */
1194   {8, 8, 8},                            /* cost of storing integer registers */
1195   4,                                    /* cost of reg,reg fld/fst */
1196   {12, 12, 28},                         /* cost of loading fp registers
1197                                            in SFmode, DFmode and XFmode */
1198   {10, 10, 18},                         /* cost of storing fp registers
1199                                            in SFmode, DFmode and XFmode */
1200   4,                                    /* cost of moving MMX register */
1201   {12, 12},                             /* cost of loading MMX registers
1202                                            in SImode and DImode */
1203   {10, 10},                             /* cost of storing MMX registers
1204                                            in SImode and DImode */
1205   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1206   {12, 12, 10, 20, 30},                 /* cost of loading SSE registers
1207                                            in 32,64,128,256 and 512-bit */
1208   {12, 12, 10, 20, 30},                 /* cost of unaligned loads.  */
1209   {10, 10, 10, 20, 30},                 /* cost of storing SSE registers
1210                                            in 32,64,128,256 and 512-bit */
1211   {10, 10, 10, 20, 30},                 /* cost of unaligned stores.  */
1212   16, 20,                               /* SSE->integer and integer->SSE moves */
1213   12, 12,                               /* Gather load static, per_elt.  */
1214   10, 10,                               /* Gather store static, per_elt.  */
1215   16,                                   /* size of l1 cache (presumably in KB -- TODO confirm).  */
1216   2048,                                 /* size of l2 cache (presumably in KB -- TODO confirm).  */
1217   64,                                   /* size of prefetch block */
1218   /* New AMD processors never drop prefetches; if they cannot be performed
1219      immediately, they are queued.  We set the number of simultaneous
1220      prefetches to a large constant to reflect this (leaving the number of
1221      prefetches completely unlimited is probably not a good idea, as their
1222      execution also takes some time).  */
1223   100,                                  /* number of parallel prefetches */
1224   2,                                    /* Branch cost */
1225   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1226   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1227   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1228   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1229   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1230   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1231
1232   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1233   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1234   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1235   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1236   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1237   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1238   /* 9-24 (presumably the latency range; the cost below uses 24).  */
1239   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1240   /* 9-27 (presumably the latency range; the cost below uses 27).  */
1241   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1242   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1243   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1244   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1245   bdver2_memcpy,                        /* memcpy strategy table.  */
1246   bdver2_memset,                        /* memset strategy table.  */
1247   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1248   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1249   "16:11:8",                            /* Loop alignment.  */
1250   "16:8:8",                             /* Jump alignment.  */
1251   "0:0:8",                              /* Label alignment.  */
1252   "11",                                 /* Func alignment.  */
1253 };
1254
1255
1256 /*  BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1257     very small blocks it is better to use a loop.  For large blocks, a libcall
1258     can do non-temporal accesses and beat inline code considerably.  */
1259 static stringop_algs bdver3_memcpy[2] = {  /* memcpy strategies for BDVER3 */
1260   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1261              {-1, rep_prefix_4_byte, false}}},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
1262   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1263              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
1264 static stringop_algs bdver3_memset[2] = {  /* memset strategies for BDVER3; same layout as bdver3_memcpy */
1265   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1266              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
1267   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1268              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
1269 struct processor_costs bdver3_cost = {  /* costs for tuning for BDVER3 */
1270   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1271   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1272   COSTS_N_INSNS (1),                    /* variable shift costs */
1273   COSTS_N_INSNS (1),                    /* constant shift costs */
1274   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1275    COSTS_N_INSNS (4),                   /*                               HI */
1276    COSTS_N_INSNS (4),                   /*                               SI */
1277    COSTS_N_INSNS (6),                   /*                               DI */
1278    COSTS_N_INSNS (6)},                  /*                            other */
1279   0,                                    /* cost of multiply per each bit set */
1280   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1281    COSTS_N_INSNS (35),                  /*                          HI */
1282    COSTS_N_INSNS (51),                  /*                          SI */
1283    COSTS_N_INSNS (83),                  /*                          DI */
1284    COSTS_N_INSNS (83)},                 /*                          other */
1285   COSTS_N_INSNS (1),                    /* cost of movsx */
1286   COSTS_N_INSNS (1),                    /* cost of movzx */
1287   8,                                    /* "large" insn */
1288   9,                                    /* MOVE_RATIO */
1289
1290   /* All move costs are relative to an integer->integer move, which is
1291      counted as 2; the values are therefore latency*2.  */
1292   8,                                 /* cost for loading QImode using movzbl */
1293   {8, 8, 8},                            /* cost of loading integer registers
1294                                            in QImode, HImode and SImode.
1295                                            Relative to reg-reg move (2).  */
1296   {8, 8, 8},                            /* cost of storing integer registers */
1297   4,                                    /* cost of reg,reg fld/fst */
1298   {12, 12, 28},                         /* cost of loading fp registers
1299                                            in SFmode, DFmode and XFmode */
1300   {10, 10, 18},                         /* cost of storing fp registers
1301                                            in SFmode, DFmode and XFmode */
1302   4,                                    /* cost of moving MMX register */
1303   {12, 12},                             /* cost of loading MMX registers
1304                                            in SImode and DImode */
1305   {10, 10},                             /* cost of storing MMX registers
1306                                            in SImode and DImode */
1307   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1308   {12, 12, 10, 20, 30},                 /* cost of loading SSE registers
1309                                            in 32,64,128,256 and 512-bit */
1310   {12, 12, 10, 20, 30},                 /* cost of unaligned loads.  */
1311   {10, 10, 10, 20, 30},                 /* cost of storing SSE registers
1312                                            in 32,64,128,256 and 512-bit */
1313   {10, 10, 10, 20, 30},                 /* cost of unaligned stores.  */
1314   16, 20,                               /* SSE->integer and integer->SSE moves */
1315   12, 12,                               /* Gather load static, per_elt.  */
1316   10, 10,                               /* Gather store static, per_elt.  */
1317   16,                                   /* size of l1 cache (presumably in KB -- TODO confirm).  */
1318   2048,                                 /* size of l2 cache (presumably in KB -- TODO confirm).  */
1319   64,                                   /* size of prefetch block */
1320   /* New AMD processors never drop prefetches; if they cannot be performed
1321      immediately, they are queued.  We set the number of simultaneous
1322      prefetches to a large constant to reflect this (leaving the number of
1323      prefetches completely unlimited is probably not a good idea, as their
1324      execution also takes some time).  */
1325   100,                                  /* number of parallel prefetches */
1326   2,                                    /* Branch cost */
1327   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1328   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1329   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1330   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1331   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1332   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1333
1334   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1335   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1336   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1337   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1338   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1339   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1340   /* 9-24 (presumably the latency range; the cost below uses 24).  */
1341   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1342   /* 9-27 (presumably the latency range; the cost below uses 27).  */
1343   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1344   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1345   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1346   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1347   bdver3_memcpy,                        /* memcpy strategy table.  */
1348   bdver3_memset,                        /* memset strategy table.  */
1349   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1350   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1351   "16:11:8",                            /* Loop alignment.  */
1352   "16:8:8",                             /* Jump alignment.  */
1353   "0:0:8",                              /* Label alignment.  */
1354   "11",                                 /* Func alignment.  */
1355 };
1356
1357 /*  BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1358     very small blocks it is better to use a loop.  For large blocks, a libcall
1359     can do non-temporal accesses and beat inline code considerably.  */
1360 static stringop_algs bdver4_memcpy[2] = {  /* memcpy strategies for BDVER4 */
1361   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1362              {-1, rep_prefix_4_byte, false}}},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
1363   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1364              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
1365 static stringop_algs bdver4_memset[2] = {  /* memset strategies for BDVER4; same layout as bdver4_memcpy */
1366   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1367              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* [0]: presumably the 32-bit variant -- TODO confirm.  */
1368   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1369              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- TODO confirm.  */
1370 struct processor_costs bdver4_cost = {
       /* Tuning cost table for bdver4.  The initializers are positional; the
          comment next to each value names the quantity it sets.  Values
          wrapped in COSTS_N_INSNS are instruction costs relative to an add.  */
1371   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1372   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1373   COSTS_N_INSNS (1),                    /* variable shift costs */
1374   COSTS_N_INSNS (1),                    /* constant shift costs */
1375   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1376    COSTS_N_INSNS (4),                   /*                               HI */
1377    COSTS_N_INSNS (4),                   /*                               SI */
1378    COSTS_N_INSNS (6),                   /*                               DI */
1379    COSTS_N_INSNS (6)},                  /*                            other */
1380   0,                                    /* cost of multiply per each bit set */
1381   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1382    COSTS_N_INSNS (35),                  /*                          HI */
1383    COSTS_N_INSNS (51),                  /*                          SI */
1384    COSTS_N_INSNS (83),                  /*                          DI */
1385    COSTS_N_INSNS (83)},                 /*                          other */
1386   COSTS_N_INSNS (1),                    /* cost of movsx */
1387   COSTS_N_INSNS (1),                    /* cost of movzx */
1388   8,                                    /* "large" insn */
1389   9,                                    /* MOVE_RATIO */
1390
1391   /* All move costs are relative to integer->integer move times 2 and thus
1392      they are latency*2.  */
1393   8,                                 /* cost for loading QImode using movzbl */
1394   {8, 8, 8},                            /* cost of loading integer registers
1395                                            in QImode, HImode and SImode.
1396                                            Relative to reg-reg move (2).  */
1397   {8, 8, 8},                            /* cost of storing integer registers */
1398   4,                                    /* cost of reg,reg fld/fst */
1399   {12, 12, 28},                         /* cost of loading fp registers
1400                                            in SFmode, DFmode and XFmode */
1401   {10, 10, 18},                         /* cost of storing fp registers
1402                                            in SFmode, DFmode and XFmode */
1403   4,                                    /* cost of moving MMX register */
1404   {12, 12},                             /* cost of loading MMX registers
1405                                            in SImode and DImode */
1406   {10, 10},                             /* cost of storing MMX registers
1407                                            in SImode and DImode */
1408   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1409   {12, 12, 10, 20, 30},                 /* cost of loading SSE registers
1410                                            in 32,64,128,256 and 512-bit */
1411   {12, 12, 10, 20, 30},                 /* cost of unaligned loads.  */
1412   {10, 10, 10, 20, 30},                 /* cost of storing SSE registers
1413                                            in 32,64,128,256 and 512-bit */
1414   {10, 10, 10, 20, 30},                 /* cost of unaligned stores.  */
1415   16, 20,                               /* SSE->integer and integer->SSE moves */
1416   12, 12,                               /* Gather load static, per_elt.  */
1417   10, 10,                               /* Gather store static, per_elt.  */
1418   16,                                   /* size of l1 cache.  */
1419   2048,                                 /* size of l2 cache.  */
1420   64,                                   /* size of prefetch block */
1421   /* New AMD processors never drop prefetches; if they cannot be performed
1422      immediately, they are queued.  We set the number of simultaneous
1423      prefetches to a large constant to reflect this (leaving the number of
1424      prefetches completely unlimited is probably not a good idea, as their
1425      execution also takes some time).  */
1426   100,                                  /* number of parallel prefetches */
1427   2,                                    /* Branch cost */
1428   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1429   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1430   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1431   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1432   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1433   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1434
1435   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1436   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1437   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1438   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1439   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1440   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1441   /* Range 9-24 (presumably latency in cycles); the upper bound is used.  */
1442   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1443   /* Range 9-27 (presumably latency in cycles); the upper bound is used.  */
1444   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1445   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1446   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1447   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1448   bdver4_memcpy,
1449   bdver4_memset,
1450   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1451   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1452   "16:11:8",                            /* Loop alignment.  */
1453   "16:8:8",                             /* Jump alignment.  */
1454   "0:0:8",                              /* Label alignment.  */
1455   "11",                                 /* Func alignment.  */
1456 };
1457
1458
1459 /*  ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1460     very small blocks it is better to use a loop.  For large blocks, libcall
1461     can do non-temporal accesses and beat inline considerably.  */
1462 static stringop_algs znver1_memcpy[2] = {
       /* memcpy strategy table referenced by znver1_cost.  Within each element
          the entries carry ascending numeric bounds and -1 closes the list.
          NOTE(review): the bounds are presumably block sizes in bytes, the
          leading algorithm presumably the choice for unknown sizes, and
          elements 0/1 presumably the 32-bit/64-bit variants (element 1 uses
          rep_prefix_8_byte) -- confirm against the stringop_algs declaration.  */
1463   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1464              {-1, rep_prefix_4_byte, false}}},
1465   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1466              {-1, libcall, false}}}};
1467 static stringop_algs znver1_memset[2] = {
       /* memset strategy table referenced by znver1_cost.  Within each element
          the entries carry ascending numeric bounds and -1 closes the list;
          here both lists end in libcall.
          NOTE(review): the bounds are presumably block sizes in bytes and
          elements 0/1 presumably the 32-bit/64-bit variants (element 1 uses
          rep_prefix_8_byte) -- confirm against the stringop_algs declaration.  */
1468   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1469              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1470   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1471              {-1, libcall, false}}}};
1472 struct processor_costs znver1_cost = {
       /* Tuning cost table for znver1.  The initializers are positional; the
          comment next to each value names the quantity it sets.  Values
          wrapped in COSTS_N_INSNS are instruction costs relative to an add.  */
1473   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1474   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1475   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1476   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1477   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1478    COSTS_N_INSNS (3),                   /*                               HI.  */
1479    COSTS_N_INSNS (3),                   /*                               SI.  */
1480    COSTS_N_INSNS (3),                   /*                               DI.  */
1481    COSTS_N_INSNS (3)},                  /*                            other.  */
1482   0,                                    /* cost of multiply per each bit
1483                                             set.  */
1484    /* Depending on parameters, idiv can get faster on Ryzen.  This is an
1485       upper bound.  */
1486   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI.  */
1487    COSTS_N_INSNS (22),                  /*                          HI.  */
1488    COSTS_N_INSNS (30),                  /*                          SI.  */
1489    COSTS_N_INSNS (45),                  /*                          DI.  */
1490    COSTS_N_INSNS (45)},                 /*                          other.  */
1491   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1492   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1493   8,                                    /* "large" insn.  */
1494   9,                                    /* MOVE_RATIO.  */
1495
1496   /* All move costs are relative to integer->integer move times 2 and thus
1497      they are latency*2.  */
1498
1499   /* Reg-reg moves are done by renaming and thus they are even cheaper than
1500      1 cycle.  Because the reg-reg move cost is 2 and the following tables
1501      correspond to doubled latencies, we do not model this correctly.  It does
1502      not seem to make a practical difference to bump prices up even more.  */
1503   6,                                    /* cost for loading QImode using
1504                                            movzbl.  */
1505   {6, 6, 6},                            /* cost of loading integer registers
1506                                            in QImode, HImode and SImode.
1507                                            Relative to reg-reg move (2).  */
1508   {8, 8, 8},                            /* cost of storing integer
1509                                            registers.  */
1510   2,                                    /* cost of reg,reg fld/fst.  */
1511   {6, 6, 16},                           /* cost of loading fp registers
1512                                            in SFmode, DFmode and XFmode.  */
1513   {8, 8, 16},                           /* cost of storing fp registers
1514                                            in SFmode, DFmode and XFmode.  */
1515   2,                                    /* cost of moving MMX register.  */
1516   {6, 6},                               /* cost of loading MMX registers
1517                                            in SImode and DImode.  */
1518   {8, 8},                               /* cost of storing MMX registers
1519                                            in SImode and DImode.  */
1520   2, 3, 6,                              /* cost of moving XMM,YMM,ZMM register.  */
1521   {6, 6, 6, 10, 20},                    /* cost of loading SSE registers
1522                                            in 32,64,128,256 and 512-bit.  */
1523   {6, 6, 6, 10, 20},                    /* cost of unaligned loads.  */
1524   {8, 8, 8, 8, 16},                     /* cost of storing SSE registers
1525                                            in 32,64,128,256 and 512-bit.  */
1526   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
1527   6, 6,                                 /* SSE->integer and integer->SSE moves.  */
1528   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1529      throughput 12.  Approx 9 uops do not depend on vector size and every load
1530      is 7 uops.
       NOTE(review): the same mnemonic is quoted twice with different numbers;
       one of the two presumably refers to a different gather variant or
       vector width -- confirm which.  */
1531   18, 8,                                /* Gather load static, per_elt.  */
1532   18, 10,                               /* Gather store static, per_elt.  */
1533   32,                                   /* size of l1 cache.  */
1534   512,                                  /* size of l2 cache.  */
1535   64,                                   /* size of prefetch block.  */
1536   /* New AMD processors never drop prefetches; if they cannot be performed
1537      immediately, they are queued.  We set the number of simultaneous
1538      prefetches to a large constant to reflect this (leaving the number of
1539      prefetches completely unlimited is probably not a good idea, as their
1540      execution also takes some time).  */
1541   100,                                  /* number of parallel prefetches.  */
1542   3,                                    /* Branch cost.  */
1543   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1544   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1545   /* Latency of fdiv is 8-15; the upper bound is used.  */
1546   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1547   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1548   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1549   /* Latency of fsqrt is 4-10; the upper bound is used.  */
1550   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1551
1552   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1553   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1554   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1555   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1556   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1557   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1558   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1559   /* Range 9-13 (presumably latency in cycles); the upper bound is used.  */
1560   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1561   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1562   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1563   /* Zen can execute 4 integer operations per cycle.  FP operations take 3
1564      cycles and it can execute 2 integer additions and 2 multiplications, thus
1565      reassociation may make sense up to a width of 6.  SPEC2k6 benchmarks
1566      suggest that 4 works better than 6, probably due to register pressure.
1567
1568      Integer vector operations are taken by FP unit and execute 3 vector
1569      plus/minus operations per cycle but only one multiply.  This is adjusted
1570      in ix86_reassociation_width.  */
1571   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1572   znver1_memcpy,
1573   znver1_memset,
1574   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1575   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1576   "16",                                 /* Loop alignment.  */
1577   "16",                                 /* Jump alignment.  */
1578   "0:0:8",                              /* Label alignment.  */
1579   "16",                                 /* Func alignment.  */
1580 };
1581
1582 /* skylake_cost should produce code tuned for the Skylake family of CPUs.  */
1583 static stringop_algs skylake_memcpy[2] =   {
       /* memcpy strategy table referenced by skylake_cost.  Within each element
          the entries carry ascending numeric bounds and -1 closes the list
          with libcall.
          NOTE(review): the bounds are presumably block sizes in bytes and
          elements 0/1 presumably the 32-bit/64-bit variants; the meaning of
          the boolean third field (true only on the first entry) is not
          visible here -- confirm against the stringop_algs declaration.  */
1584   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1585   {libcall, {{16, loop, false}, {512, unrolled_loop, false},
1586              {-1, libcall, false}}}};
1587
1588 static stringop_algs skylake_memset[2] = {
       /* memset strategy table referenced by skylake_cost.  Within each element
          the entries carry ascending numeric bounds and -1 closes the list
          with libcall.
          NOTE(review): the bounds are presumably block sizes in bytes and
          elements 0/1 presumably the 32-bit/64-bit variants; the meaning of
          the boolean third field is not visible here -- confirm against the
          stringop_algs declaration.  */
1589   {libcall, {{6, loop_1_byte, true},
1590              {24, loop, true},
1591              {8192, rep_prefix_4_byte, true},
1592              {-1, libcall, false}}},
1593   {libcall, {{24, loop, true}, {512, unrolled_loop, false},
1594              {-1, libcall, false}}}};
1595
1596 static const
1597 struct processor_costs skylake_cost = {
       /* Tuning cost table for Skylake.  The initializers are positional; the
          comment next to each value names the quantity it sets.  Values
          wrapped in COSTS_N_INSNS are instruction costs relative to an add.  */
1598   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1599   COSTS_N_INSNS (1)+1,          /* cost of a lea instruction; one unit above
                                        an add, i.e. slightly more expensive */
1600   COSTS_N_INSNS (1),                    /* variable shift costs */
1601   COSTS_N_INSNS (1),                    /* constant shift costs */
1602   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1603    COSTS_N_INSNS (4),                   /*                               HI */
1604    COSTS_N_INSNS (3),                   /*                               SI */
1605    COSTS_N_INSNS (3),                   /*                               DI */
1606    COSTS_N_INSNS (3)},                  /*                            other */
1607   0,                                    /* cost of multiply per each bit set */
1608   /* Expanding div/mod currently doesn't consider parallelism.  So the cost
1609      model is not realistic.  We compensate by increasing the latencies a bit.  */
1610   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
1611    COSTS_N_INSNS (11),                  /*                          HI */
1612    COSTS_N_INSNS (14),                  /*                          SI */
1613    COSTS_N_INSNS (76),                  /*                          DI */
1614    COSTS_N_INSNS (76)},                 /*                          other */
1615   COSTS_N_INSNS (1),                    /* cost of movsx */
1616   COSTS_N_INSNS (0),                    /* cost of movzx */
1617   8,                                    /* "large" insn */
1618   17,                                   /* MOVE_RATIO */
1619
1620   6,                                 /* cost for loading QImode using movzbl */
1621   {4, 4, 4},                            /* cost of loading integer registers
1622                                            in QImode, HImode and SImode.
1623                                            Relative to reg-reg move (2).  */
1624   {6, 6, 3},                            /* cost of storing integer registers */
       /* NOTE(review): the SImode store cost (3) is below the QImode/HImode
          store costs (6) and below the load costs (4) -- confirm this is
          intended.  */
1625   2,                                    /* cost of reg,reg fld/fst */
1626   {6, 6, 8},                            /* cost of loading fp registers
1627                                            in SFmode, DFmode and XFmode */
1628   {6, 6, 10},                           /* cost of storing fp registers
1629                                            in SFmode, DFmode and XFmode */
1630   2,                                    /* cost of moving MMX register */
1631   {6, 6},                               /* cost of loading MMX registers
1632                                            in SImode and DImode */
1633   {6, 6},                               /* cost of storing MMX registers
1634                                            in SImode and DImode */
1635   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
1636   {6, 6, 6, 10, 20},                    /* cost of loading SSE registers
1637                                            in 32,64,128,256 and 512-bit */
1638   {6, 6, 6, 10, 20},                    /* cost of unaligned loads.  */
1639   {8, 8, 8, 12, 24},                    /* cost of storing SSE registers
1640                                            in 32,64,128,256 and 512-bit */
1641   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
       /* NOTE(review): the 256-bit and 512-bit unaligned store costs (8, 16)
          are lower than the corresponding aligned store costs (12, 24) --
          confirm this is intended.  */
1642   2, 2,                                 /* SSE->integer and integer->SSE moves */
1643   20, 8,                                /* Gather load static, per_elt.  */
1644   22, 10,                               /* Gather store static, per_elt.  */
1645   64,                                   /* size of l1 cache.  */
1646   512,                                  /* size of l2 cache.  */
1647   64,                                   /* size of prefetch block */
1648   6,                                    /* number of parallel prefetches */
1649   3,                                    /* Branch cost */
1650   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
1651   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1652   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
1653   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1654   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1655   COSTS_N_INSNS (20),                   /* cost of FSQRT instruction.  */
1656
1657   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1658   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1659   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1660   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1661   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
1662   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
1663   COSTS_N_INSNS (11),                   /* cost of DIVSS instruction.  */
1664   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
1665   COSTS_N_INSNS (12),                   /* cost of SQRTSS instruction.  */
1666   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
1667   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
1668   skylake_memcpy,
1669   skylake_memset,
1670   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1671   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1672   "16:11:8",                            /* Loop alignment.  */
1673   "16:11:8",                            /* Jump alignment.  */
1674   "0:0:8",                              /* Label alignment.  */
1675   "16",                                 /* Func alignment.  */
1676 };
1677   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1678      very small blocks it is better to use a loop.  For large blocks, libcall
1679      can do non-temporal accesses and beat inline considerably.  */
1680 static stringop_algs btver1_memcpy[2] = {
       /* memcpy strategy table referenced by btver1_cost.  Within each element
          the entries carry ascending numeric bounds and -1 closes the list.
          NOTE(review): the bounds are presumably block sizes in bytes, the
          leading algorithm presumably the choice for unknown sizes, and
          elements 0/1 presumably the 32-bit/64-bit variants (element 1 uses
          rep_prefix_8_byte) -- confirm against the stringop_algs declaration.  */
1681   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1682              {-1, rep_prefix_4_byte, false}}},
1683   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1684              {-1, libcall, false}}}};
1685 static stringop_algs btver1_memset[2] = {
       /* memset strategy table referenced by btver1_cost.  Within each element
          the entries carry ascending numeric bounds and -1 closes the list;
          here both lists end in libcall.
          NOTE(review): the bounds are presumably block sizes in bytes and
          elements 0/1 presumably the 32-bit/64-bit variants (element 1 uses
          rep_prefix_8_byte) -- confirm against the stringop_algs declaration.  */
1686   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1687              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1688   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1689              {-1, libcall, false}}}};
1690 const struct processor_costs btver1_cost = {
       /* Tuning cost table for btver1.  The initializers are positional; the
          comment next to each value names the quantity it sets.  Values
          wrapped in COSTS_N_INSNS are instruction costs relative to an add.  */
1691   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1692   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1693   COSTS_N_INSNS (1),                    /* variable shift costs */
1694   COSTS_N_INSNS (1),                    /* constant shift costs */
1695   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1696    COSTS_N_INSNS (4),                   /*                               HI */
1697    COSTS_N_INSNS (3),                   /*                               SI */
1698    COSTS_N_INSNS (4),                   /*                               DI */
1699    COSTS_N_INSNS (5)},                  /*                            other */
1700   0,                                    /* cost of multiply per each bit set */
1701   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1702    COSTS_N_INSNS (35),                  /*                          HI */
1703    COSTS_N_INSNS (51),                  /*                          SI */
1704    COSTS_N_INSNS (83),                  /*                          DI */
1705    COSTS_N_INSNS (83)},                 /*                          other */
1706   COSTS_N_INSNS (1),                    /* cost of movsx */
1707   COSTS_N_INSNS (1),                    /* cost of movzx */
1708   8,                                    /* "large" insn */
1709   9,                                    /* MOVE_RATIO */
1710
1711   /* All move costs are relative to integer->integer move times 2 and thus
1712      they are latency*2.  */
1713   8,                                 /* cost for loading QImode using movzbl */
1714   {6, 8, 6},                            /* cost of loading integer registers
1715                                            in QImode, HImode and SImode.
1716                                            Relative to reg-reg move (2).  */
1717   {6, 8, 6},                            /* cost of storing integer registers */
1718   4,                                    /* cost of reg,reg fld/fst */
1719   {12, 12, 28},                         /* cost of loading fp registers
1720                                            in SFmode, DFmode and XFmode */
1721   {12, 12, 38},                         /* cost of storing fp registers
1722                                            in SFmode, DFmode and XFmode */
1723   4,                                    /* cost of moving MMX register */
1724   {10, 10},                             /* cost of loading MMX registers
1725                                            in SImode and DImode */
1726   {12, 12},                             /* cost of storing MMX registers
1727                                            in SImode and DImode */
1728   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1729   {10, 10, 12, 24, 48},                 /* cost of loading SSE registers
1730                                            in 32,64,128,256 and 512-bit */
1731   {10, 10, 12, 24, 48},                 /* cost of unaligned loads.  */
1732   {10, 10, 12, 24, 48},                 /* cost of storing SSE registers
1733                                            in 32,64,128,256 and 512-bit */
1734   {10, 10, 12, 24, 48},                 /* cost of unaligned stores.  */
1735   14, 14,                               /* SSE->integer and integer->SSE moves */
1736   10, 10,                               /* Gather load static, per_elt.  */
1737   10, 10,                               /* Gather store static, per_elt.  */
1738   32,                                   /* size of l1 cache.  */
1739   512,                                  /* size of l2 cache.  */
1740   64,                                   /* size of prefetch block */
1741   100,                                  /* number of parallel prefetches */
1742   2,                                    /* Branch cost */
1743   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1744   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1745   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1746   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1747   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1748   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1749
1750   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1751   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1752   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
1753   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1754   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1755   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1756   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
1757   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
1758   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
1759   COSTS_N_INSNS (48),                   /* cost of SQRTSD instruction.  */
1760   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1761   btver1_memcpy,
1762   btver1_memset,
1763   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1764   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1765   "16:11:8",                            /* Loop alignment.  */
1766   "16:8:8",                             /* Jump alignment.  */
1767   "0:0:8",                              /* Label alignment.  */
1768   "11",                                 /* Func alignment.  */
1769 };
1770
1771 static stringop_algs btver2_memcpy[2] = {
       /* memcpy strategy table referenced by btver2_cost.  Within each element
          the entries carry ascending numeric bounds and -1 closes the list.
          NOTE(review): the bounds are presumably block sizes in bytes, the
          leading algorithm presumably the choice for unknown sizes, and
          elements 0/1 presumably the 32-bit/64-bit variants (element 1 uses
          rep_prefix_8_byte) -- confirm against the stringop_algs declaration.  */
1772   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1773              {-1, rep_prefix_4_byte, false}}},
1774   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1775              {-1, libcall, false}}}};
1776 static stringop_algs btver2_memset[2] = {
       /* memset strategy table referenced by btver2_cost.  Within each element
          the entries carry ascending numeric bounds and -1 closes the list;
          here both lists end in libcall.
          NOTE(review): the bounds are presumably block sizes in bytes and
          elements 0/1 presumably the 32-bit/64-bit variants (element 1 uses
          rep_prefix_8_byte) -- confirm against the stringop_algs declaration.  */
1777   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1778              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1779   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1780              {-1, libcall, false}}}};
1781 const struct processor_costs btver2_cost = {
       /* Tuning cost table for btver2.  The initializers are positional; the
          comment next to each value names the quantity it sets.  Values
          wrapped in COSTS_N_INSNS are instruction costs relative to an add.  */
1782   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1783   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1784   COSTS_N_INSNS (1),                    /* variable shift costs */
1785   COSTS_N_INSNS (1),                    /* constant shift costs */
1786   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1787    COSTS_N_INSNS (4),                   /*                               HI */
1788    COSTS_N_INSNS (3),                   /*                               SI */
1789    COSTS_N_INSNS (4),                   /*                               DI */
1790    COSTS_N_INSNS (5)},                  /*                            other */
1791   0,                                    /* cost of multiply per each bit set */
1792   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1793    COSTS_N_INSNS (35),                  /*                          HI */
1794    COSTS_N_INSNS (51),                  /*                          SI */
1795    COSTS_N_INSNS (83),                  /*                          DI */
1796    COSTS_N_INSNS (83)},                 /*                          other */
1797   COSTS_N_INSNS (1),                    /* cost of movsx */
1798   COSTS_N_INSNS (1),                    /* cost of movzx */
1799   8,                                    /* "large" insn */
1800   9,                                    /* MOVE_RATIO */
1801
1802   /* All move costs are relative to integer->integer move times 2 and thus
1803      they are latency*2.  */
1804   8,                                 /* cost for loading QImode using movzbl */
1805   {8, 8, 6},                            /* cost of loading integer registers
1806                                            in QImode, HImode and SImode.
1807                                            Relative to reg-reg move (2).  */
1808   {8, 8, 6},                            /* cost of storing integer registers */
1809   4,                                    /* cost of reg,reg fld/fst */
1810   {12, 12, 28},                         /* cost of loading fp registers
1811                                            in SFmode, DFmode and XFmode */
1812   {12, 12, 38},                         /* cost of storing fp registers
1813                                            in SFmode, DFmode and XFmode */
1814   4,                                    /* cost of moving MMX register */
1815   {10, 10},                             /* cost of loading MMX registers
1816                                            in SImode and DImode */
1817   {12, 12},                             /* cost of storing MMX registers
1818                                            in SImode and DImode */
1819   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1820   {10, 10, 12, 24, 48},                 /* cost of loading SSE registers
1821                                            in 32,64,128,256 and 512-bit */
1822   {10, 10, 12, 24, 48},                 /* cost of unaligned loads.  */
1823   {10, 10, 12, 24, 48},                 /* cost of storing SSE registers
1824                                            in 32,64,128,256 and 512-bit */
1825   {10, 10, 12, 24, 48},                 /* cost of unaligned stores.  */
1826   14, 14,                               /* SSE->integer and integer->SSE moves */
1827   10, 10,                               /* Gather load static, per_elt.  */
1828   10, 10,                               /* Gather store static, per_elt.  */
1829   32,                                   /* size of l1 cache.  */
1830   2048,                                 /* size of l2 cache.  */
1831   64,                                   /* size of prefetch block */
1832   100,                                  /* number of parallel prefetches */
1833   2,                                    /* Branch cost */
1834   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1835   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1836   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1837   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1838   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1839   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1840
1841   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1842   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1843   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
1844   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1845   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1846   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1847   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
1848   COSTS_N_INSNS (19),                   /* cost of DIVSD instruction.  */
1849   COSTS_N_INSNS (16),                   /* cost of SQRTSS instruction.  */
1850   COSTS_N_INSNS (21),                   /* cost of SQRTSD instruction.  */
1851   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1852   btver2_memcpy,
1853   btver2_memset,
1854   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1855   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1856   "16:11:8",                            /* Loop alignment.  */
1857   "16:8:8",                             /* Jump alignment.  */
1858   "0:0:8",                              /* Label alignment.  */
1859   "11",                                 /* Func alignment.  */
1860 };
1861
/* memcpy strategy table referenced by pentium4_cost (two entries).
   The second entry is the DUMMY_STRINGOP_ALGS placeholder, which expands to
   an all-libcall table (see its #define near the top of the file).
   NOTE(review): each inner triple looks like {max size, algorithm, noalign}
   with -1 as the catch-all bound, and the two entries presumably cover
   32-bit and 64-bit code -- confirm against the stringop_algs declaration.  */
1862 static stringop_algs pentium4_memcpy[2] = {
1863   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1864   DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentium4_cost (two entries).
   The second entry is the DUMMY_STRINGOP_ALGS placeholder, which expands to
   an all-libcall table (see its #define near the top of the file).
   NOTE(review): each inner triple looks like {max size, algorithm, noalign}
   with -1 as the catch-all bound -- confirm against the stringop_algs
   declaration.  */
1865 static stringop_algs pentium4_memset[2] = {
1866   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1867              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1868   DUMMY_STRINGOP_ALGS};
1869
/* Cost table named for pentium4.  The initializer is positional: every value
   must stay in the order described by its trailing comment.  Per the note at
   the top of the file, COSTS_N_INSNS (N) is assumed to be (N)*4.
   NOTE(review): the four alignment entries at the end are NULL here;
   presumably that means no tuning-specific alignment -- confirm.  */
1870 static const
1871 struct processor_costs pentium4_cost = {
1872   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1873   COSTS_N_INSNS (3),                    /* cost of a lea instruction */
1874   COSTS_N_INSNS (4),                    /* variable shift costs */
1875   COSTS_N_INSNS (4),                    /* constant shift costs */
1876   {COSTS_N_INSNS (15),                  /* cost of starting multiply for QI */
1877    COSTS_N_INSNS (15),                  /*                               HI */
1878    COSTS_N_INSNS (15),                  /*                               SI */
1879    COSTS_N_INSNS (15),                  /*                               DI */
1880    COSTS_N_INSNS (15)},                 /*                            other */
1881   0,                                    /* cost of multiply per each bit set */
1882   {COSTS_N_INSNS (56),                  /* cost of a divide/mod for QI */
1883    COSTS_N_INSNS (56),                  /*                          HI */
1884    COSTS_N_INSNS (56),                  /*                          SI */
1885    COSTS_N_INSNS (56),                  /*                          DI */
1886    COSTS_N_INSNS (56)},                 /*                          other */
1887   COSTS_N_INSNS (1),                    /* cost of movsx */
1888   COSTS_N_INSNS (1),                    /* cost of movzx */
1889   16,                                   /* "large" insn */
1890   6,                                    /* MOVE_RATIO */
1891
1892   /* All move costs are relative to integer->integer move times 2 and thus
1893      they are latency*2. */
1894   5,                                 /* cost for loading QImode using movzbl */
1895   {4, 5, 4},                            /* cost of loading integer registers
1896                                            in QImode, HImode and SImode.
1897                                            Relative to reg-reg move (2).  */
1898   {2, 3, 2},                            /* cost of storing integer registers */
1899   12,                                   /* cost of reg,reg fld/fst */
1900   {14, 14, 14},                         /* cost of loading fp registers
1901                                            in SFmode, DFmode and XFmode */
1902   {14, 14, 14},                         /* cost of storing fp registers
1903                                            in SFmode, DFmode and XFmode */
1904   12,                                   /* cost of moving MMX register */
1905   {16, 16},                             /* cost of loading MMX registers
1906                                            in SImode and DImode */
1907   {16, 16},                             /* cost of storing MMX registers
1908                                            in SImode and DImode */
1909   12, 24, 48,                           /* cost of moving XMM,YMM,ZMM register */
1910   {16, 16, 16, 32, 64},                 /* cost of loading SSE registers
1911                                            in 32,64,128,256 and 512-bit */
1912   {32, 32, 32, 64, 128},                /* cost of unaligned loads.  */
1913   {16, 16, 16, 32, 64},                 /* cost of storing SSE registers
1914                                            in 32,64,128,256 and 512-bit */
1915   {32, 32, 32, 64, 128},                /* cost of unaligned stores.  */
1916   20, 12,                               /* SSE->integer and integer->SSE moves */
1917   16, 16,                               /* Gather load static, per_elt.  */
1918   16, 16,                               /* Gather store static, per_elt.  */
1919   8,                                    /* size of l1 cache.  */
1920   256,                                  /* size of l2 cache.  */
1921   64,                                   /* size of prefetch block */
1922   6,                                    /* number of parallel prefetches */
1923   2,                                    /* Branch cost */
1924   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1925   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
1926   COSTS_N_INSNS (43),                   /* cost of FDIV instruction.  */
1927   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1928   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1929   COSTS_N_INSNS (43),                   /* cost of FSQRT instruction.  */
1930
1931   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1932   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1933   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1934   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1935   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1936   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1937   COSTS_N_INSNS (23),                   /* cost of DIVSS instruction.  */
1938   COSTS_N_INSNS (38),                   /* cost of DIVSD instruction.  */
1939   COSTS_N_INSNS (23),                   /* cost of SQRTSS instruction.  */
1940   COSTS_N_INSNS (38),                   /* cost of SQRTSD instruction.  */
1941   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1942   pentium4_memcpy,
1943   pentium4_memset,
1944   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1945   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1946   NULL,                                 /* Loop alignment.  */
1947   NULL,                                 /* Jump alignment.  */
1948   NULL,                                 /* Label alignment.  */
1949   NULL,                                 /* Func alignment.  */
1950 };
1951
/* memcpy strategy table referenced by nocona_cost (two entries).
   NOTE(review): each inner triple looks like {max size, algorithm, noalign}
   with -1 as the catch-all bound; the second entry uses rep_prefix_8_byte,
   so the two entries presumably cover 32-bit and 64-bit code -- confirm
   against the stringop_algs declaration.  */
1952 static stringop_algs nocona_memcpy[2] = {
1953   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1954   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1955              {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1956
/* memset strategy table referenced by nocona_cost (two entries).
   NOTE(review): each inner triple looks like {max size, algorithm, noalign}
   with -1 as the catch-all bound; the second entry uses rep_prefix_8_byte,
   so the two entries presumably cover 32-bit and 64-bit code -- confirm
   against the stringop_algs declaration.  */
1957 static stringop_algs nocona_memset[2] = {
1958   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1959              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1960   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1961              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1962
/* Cost table named for nocona.  The initializer is positional: every value
   must stay in the order described by its trailing comment.  Per the note at
   the top of the file, COSTS_N_INSNS (N) is assumed to be (N)*4.
   NOTE(review): the four alignment entries at the end are NULL here;
   presumably that means no tuning-specific alignment -- confirm.  */
1963 static const
1964 struct processor_costs nocona_cost = {
1965   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1966   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1967   COSTS_N_INSNS (1),                    /* variable shift costs */
1968   COSTS_N_INSNS (1),                    /* constant shift costs */
1969   {COSTS_N_INSNS (10),                  /* cost of starting multiply for QI */
1970    COSTS_N_INSNS (10),                  /*                               HI */
1971    COSTS_N_INSNS (10),                  /*                               SI */
1972    COSTS_N_INSNS (10),                  /*                               DI */
1973    COSTS_N_INSNS (10)},                 /*                            other */
1974   0,                                    /* cost of multiply per each bit set */
1975   {COSTS_N_INSNS (66),                  /* cost of a divide/mod for QI */
1976    COSTS_N_INSNS (66),                  /*                          HI */
1977    COSTS_N_INSNS (66),                  /*                          SI */
1978    COSTS_N_INSNS (66),                  /*                          DI */
1979    COSTS_N_INSNS (66)},                 /*                          other */
1980   COSTS_N_INSNS (1),                    /* cost of movsx */
1981   COSTS_N_INSNS (1),                    /* cost of movzx */
1982   16,                                   /* "large" insn */
1983   17,                                   /* MOVE_RATIO */
1984
1985   /* All move costs are relative to integer->integer move times 2 and thus
1986      they are latency*2. */
1987   4,                                 /* cost for loading QImode using movzbl */
1988   {4, 4, 4},                            /* cost of loading integer registers
1989                                            in QImode, HImode and SImode.
1990                                            Relative to reg-reg move (2).  */
1991   {4, 4, 4},                            /* cost of storing integer registers */
1992   12,                                   /* cost of reg,reg fld/fst */
1993   {14, 14, 14},                         /* cost of loading fp registers
1994                                            in SFmode, DFmode and XFmode */
1995   {14, 14, 14},                         /* cost of storing fp registers
1996                                            in SFmode, DFmode and XFmode */
1997   14,                                   /* cost of moving MMX register */
1998   {12, 12},                             /* cost of loading MMX registers
1999                                            in SImode and DImode */
2000   {12, 12},                             /* cost of storing MMX registers
2001                                            in SImode and DImode */
2002   6, 12, 24,                            /* cost of moving XMM,YMM,ZMM register */
2003   {12, 12, 12, 24, 48},                 /* cost of loading SSE registers
2004                                            in 32,64,128,256 and 512-bit */
2005   {24, 24, 24, 48, 96},                 /* cost of unaligned loads.  */
2006   {12, 12, 12, 24, 48},                 /* cost of storing SSE registers
2007                                            in 32,64,128,256 and 512-bit */
2008   {24, 24, 24, 48, 96},                 /* cost of unaligned stores.  */
2009   20, 12,                               /* SSE->integer and integer->SSE moves */
2010   12, 12,                               /* Gather load static, per_elt.  */
2011   12, 12,                               /* Gather store static, per_elt.  */
2012   8,                                    /* size of l1 cache.  */
2013   1024,                                 /* size of l2 cache.  */
2014   64,                                   /* size of prefetch block */
2015   8,                                    /* number of parallel prefetches */
2016   1,                                    /* Branch cost */
2017   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
2018   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2019   COSTS_N_INSNS (40),                   /* cost of FDIV instruction.  */
2020   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
2021   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
2022   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
2023
2024   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
2025   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2026   COSTS_N_INSNS (7),                    /* cost of MULSS instruction.  */
2027   COSTS_N_INSNS (7),                    /* cost of MULSD instruction.  */
2028   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
2029   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
2030   COSTS_N_INSNS (32),                   /* cost of DIVSS instruction.  */
2031   COSTS_N_INSNS (40),                   /* cost of DIVSD instruction.  */
2032   COSTS_N_INSNS (32),                   /* cost of SQRTSS instruction.  */
2033   COSTS_N_INSNS (41),                   /* cost of SQRTSD instruction.  */
2034   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2035   nocona_memcpy,
2036   nocona_memset,
2037   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2038   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2039   NULL,                                 /* Loop alignment.  */
2040   NULL,                                 /* Jump alignment.  */
2041   NULL,                                 /* Label alignment.  */
2042   NULL,                                 /* Func alignment.  */
2043 };
2044
/* memcpy strategy table referenced by atom_cost (two entries).
   NOTE(review): each inner triple looks like {max size, algorithm, noalign}
   with -1 as the catch-all bound; the second entry uses rep_prefix_8_byte,
   so the two entries presumably cover 32-bit and 64-bit code -- confirm
   against the stringop_algs declaration.  */
2045 static stringop_algs atom_memcpy[2] = {
2046   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2047   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2048              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table referenced by atom_cost (two entries).
   NOTE(review): each inner triple looks like {max size, algorithm, noalign}
   with -1 as the catch-all bound; the second entry uses rep_prefix_8_byte,
   so the two entries presumably cover 32-bit and 64-bit code -- confirm
   against the stringop_algs declaration.  */
2049 static stringop_algs atom_memset[2] = {
2050   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2051              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2052   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2053              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table named for atom.  The initializer is positional: every value
   must stay in the order described by its trailing comment.  Per the note at
   the top of the file, COSTS_N_INSNS (N) is assumed to be (N)*4, so the lea
   cost of COSTS_N_INSNS (1) + 1 sits a fraction above the cost of one add.  */
2054 static const
2055 struct processor_costs atom_cost = {
2056   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2057   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2058   COSTS_N_INSNS (1),                    /* variable shift costs */
2059   COSTS_N_INSNS (1),                    /* constant shift costs */
2060   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2061    COSTS_N_INSNS (4),                   /*                               HI */
2062    COSTS_N_INSNS (3),                   /*                               SI */
2063    COSTS_N_INSNS (4),                   /*                               DI */
2064    COSTS_N_INSNS (2)},                  /*                            other */
2065   0,                                    /* cost of multiply per each bit set */
2066   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2067    COSTS_N_INSNS (26),                  /*                          HI */
2068    COSTS_N_INSNS (42),                  /*                          SI */
2069    COSTS_N_INSNS (74),                  /*                          DI */
2070    COSTS_N_INSNS (74)},                 /*                          other */
2071   COSTS_N_INSNS (1),                    /* cost of movsx */
2072   COSTS_N_INSNS (1),                    /* cost of movzx */
2073   8,                                    /* "large" insn */
2074   17,                                   /* MOVE_RATIO */
2075
2076   /* All move costs are relative to integer->integer move times 2 and thus
2077      they are latency*2. */
2078   6,                                    /* cost for loading QImode using movzbl */
2079   {6, 6, 6},                            /* cost of loading integer registers
2080                                            in QImode, HImode and SImode.
2081                                            Relative to reg-reg move (2).  */
2082   {6, 6, 6},                            /* cost of storing integer registers */
2083   4,                                    /* cost of reg,reg fld/fst */
2084   {6, 6, 18},                           /* cost of loading fp registers
2085                                            in SFmode, DFmode and XFmode */
2086   {14, 14, 24},                         /* cost of storing fp registers
2087                                            in SFmode, DFmode and XFmode */
2088   2,                                    /* cost of moving MMX register */
2089   {8, 8},                               /* cost of loading MMX registers
2090                                            in SImode and DImode */
2091   {10, 10},                             /* cost of storing MMX registers
2092                                            in SImode and DImode */
2093   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2094   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
2095                                            in 32,64,128,256 and 512-bit */
2096   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
2097   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
2098                                            in 32,64,128,256 and 512-bit */
2099   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
2100   8, 6,                                 /* SSE->integer and integer->SSE moves */
2101   8, 8,                                 /* Gather load static, per_elt.  */
2102   8, 8,                                 /* Gather store static, per_elt.  */
2103   32,                                   /* size of l1 cache.  */
2104   256,                                  /* size of l2 cache.  */
2105   64,                                   /* size of prefetch block */
2106   6,                                    /* number of parallel prefetches */
2107   3,                                    /* Branch cost */
2108   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2109   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2110   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2111   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2112   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2113   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2114
2115   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2116   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2117   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2118   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2119   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2120   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2121   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
2122   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
2123   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
2124   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
2125   2, 2, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2126   atom_memcpy,
2127   atom_memset,
2128   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2129   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2130   "16",                                 /* Loop alignment.  */
2131   "16:8:8",                             /* Jump alignment.  */
2132   "0:0:8",                              /* Label alignment.  */
2133   "16",                                 /* Func alignment.  */
2134 };
2135
/* memcpy strategy table referenced by slm_cost (two entries).
   NOTE(review): each inner triple looks like {max size, algorithm, noalign}
   with -1 as the catch-all bound; the second entry uses rep_prefix_8_byte,
   so the two entries presumably cover 32-bit and 64-bit code -- confirm
   against the stringop_algs declaration.  */
2136 static stringop_algs slm_memcpy[2] = {
2137   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2138   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2139              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table referenced by slm_cost (two entries).
   NOTE(review): each inner triple looks like {max size, algorithm, noalign}
   with -1 as the catch-all bound; the second entry uses rep_prefix_8_byte,
   so the two entries presumably cover 32-bit and 64-bit code -- confirm
   against the stringop_algs declaration.  */
2140 static stringop_algs slm_memset[2] = {
2141   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2142              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2143   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2144              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table named for slm.  The initializer is positional: every value
   must stay in the order described by its trailing comment.  Per the note at
   the top of the file, COSTS_N_INSNS (N) is assumed to be (N)*4, so the lea
   cost of COSTS_N_INSNS (1) + 1 sits a fraction above the cost of one add.  */
2145 static const
2146 struct processor_costs slm_cost = {
2147   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2148   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2149   COSTS_N_INSNS (1),                    /* variable shift costs */
2150   COSTS_N_INSNS (1),                    /* constant shift costs */
2151   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2152    COSTS_N_INSNS (3),                   /*                               HI */
2153    COSTS_N_INSNS (3),                   /*                               SI */
2154    COSTS_N_INSNS (4),                   /*                               DI */
2155    COSTS_N_INSNS (2)},                  /*                            other */
2156   0,                                    /* cost of multiply per each bit set */
2157   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2158    COSTS_N_INSNS (26),                  /*                          HI */
2159    COSTS_N_INSNS (42),                  /*                          SI */
2160    COSTS_N_INSNS (74),                  /*                          DI */
2161    COSTS_N_INSNS (74)},                 /*                          other */
2162   COSTS_N_INSNS (1),                    /* cost of movsx */
2163   COSTS_N_INSNS (1),                    /* cost of movzx */
2164   8,                                    /* "large" insn */
2165   17,                                   /* MOVE_RATIO */
2166
2167   /* All move costs are relative to integer->integer move times 2 and thus
2168      they are latency*2. */
2169   8,                                    /* cost for loading QImode using movzbl */
2170   {8, 8, 8},                            /* cost of loading integer registers
2171                                            in QImode, HImode and SImode.
2172                                            Relative to reg-reg move (2).  */
2173   {6, 6, 6},                            /* cost of storing integer registers */
2174   2,                                    /* cost of reg,reg fld/fst */
2175   {8, 8, 18},                           /* cost of loading fp registers
2176                                            in SFmode, DFmode and XFmode */
2177   {6, 6, 18},                           /* cost of storing fp registers
2178                                            in SFmode, DFmode and XFmode */
2179   2,                                    /* cost of moving MMX register */
2180   {8, 8},                               /* cost of loading MMX registers
2181                                            in SImode and DImode */
2182   {6, 6},                               /* cost of storing MMX registers
2183                                            in SImode and DImode */
2184   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2185   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
2186                                            in 32,64,128,256 and 512-bit */
2187   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
2188   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
2189                                            in 32,64,128,256 and 512-bit */
2190   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
2191   8, 6,                                 /* SSE->integer and integer->SSE moves */
2192   8, 8,                                 /* Gather load static, per_elt.  */
2193   8, 8,                                 /* Gather store static, per_elt.  */
2194   32,                                   /* size of l1 cache.  */
2195   256,                                  /* size of l2 cache.  */
2196   64,                                   /* size of prefetch block */
2197   6,                                    /* number of parallel prefetches */
2198   3,                                    /* Branch cost */
2199   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2200   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2201   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2202   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2203   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2204   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2205
2206   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2207   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2208   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2209   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2210   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2211   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2212   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
2213   COSTS_N_INSNS (69),                   /* cost of DIVSD instruction.  */
2214   COSTS_N_INSNS (20),                   /* cost of SQRTSS instruction.  */
2215   COSTS_N_INSNS (35),                   /* cost of SQRTSD instruction.  */
2216   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2217   slm_memcpy,
2218   slm_memset,
2219   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2220   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2221   "16",                                 /* Loop alignment.  */
2222   "16:8:8",                             /* Jump alignment.  */
2223   "0:0:8",                              /* Label alignment.  */
2224   "16",                                 /* Func alignment.  */
2225 };
2226
/* memcpy strategy table referenced by intel_cost (two entries).
   NOTE(review): each inner triple looks like {max size, algorithm, noalign}
   with -1 as the catch-all bound; the second entry uses rep_prefix_8_byte,
   so the two entries presumably cover 32-bit and 64-bit code -- confirm
   against the stringop_algs declaration.  */
2227 static stringop_algs intel_memcpy[2] = {
2228   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2229   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2230              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table referenced by intel_cost (two entries).
   NOTE(review): each inner triple looks like {max size, algorithm, noalign}
   with -1 as the catch-all bound; the second entry uses rep_prefix_8_byte,
   so the two entries presumably cover 32-bit and 64-bit code -- confirm
   against the stringop_algs declaration.  */
2231 static stringop_algs intel_memset[2] = {
2232   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2233              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2234   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2235              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table named for intel.  The initializer is positional: every value
   must stay in the order described by its trailing comment.  Per the note at
   the top of the file, COSTS_N_INSNS (N) is assumed to be (N)*4, so the lea
   cost of COSTS_N_INSNS (1) + 1 sits a fraction above the cost of one add.  */
2236 static const
2237 struct processor_costs intel_cost = {
2238   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2239   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2240   COSTS_N_INSNS (1),                    /* variable shift costs */
2241   COSTS_N_INSNS (1),                    /* constant shift costs */
2242   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2243    COSTS_N_INSNS (3),                   /*                               HI */
2244    COSTS_N_INSNS (3),                   /*                               SI */
2245    COSTS_N_INSNS (4),                   /*                               DI */
2246    COSTS_N_INSNS (2)},                  /*                            other */
2247   0,                                    /* cost of multiply per each bit set */
2248   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2249    COSTS_N_INSNS (26),                  /*                          HI */
2250    COSTS_N_INSNS (42),                  /*                          SI */
2251    COSTS_N_INSNS (74),                  /*                          DI */
2252    COSTS_N_INSNS (74)},                 /*                          other */
2253   COSTS_N_INSNS (1),                    /* cost of movsx */
2254   COSTS_N_INSNS (1),                    /* cost of movzx */
2255   8,                                    /* "large" insn */
2256   17,                                   /* MOVE_RATIO */
2257
2258   /* All move costs are relative to integer->integer move times 2 and thus
2259      they are latency*2. */
2260   6,                                 /* cost for loading QImode using movzbl */
2261   {4, 4, 4},                            /* cost of loading integer registers
2262                                            in QImode, HImode and SImode.
2263                                            Relative to reg-reg move (2).  */
2264   {6, 6, 6},                            /* cost of storing integer registers */
2265   2,                                    /* cost of reg,reg fld/fst */
2266   {6, 6, 8},                            /* cost of loading fp registers
2267                                            in SFmode, DFmode and XFmode */
2268   {6, 6, 10},                           /* cost of storing fp registers
2269                                            in SFmode, DFmode and XFmode */
2270   2,                                    /* cost of moving MMX register */
2271   {6, 6},                               /* cost of loading MMX registers
2272                                            in SImode and DImode */
2273   {6, 6},                               /* cost of storing MMX registers
2274                                            in SImode and DImode */
2275   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM register */
2276   {6, 6, 6, 6, 6},                      /* cost of loading SSE registers
2277                                            in 32,64,128,256 and 512-bit */
2278   {10, 10, 10, 10, 10},                 /* cost of unaligned loads.  */
2279   {6, 6, 6, 6, 6},                      /* cost of storing SSE registers
2280                                            in 32,64,128,256 and 512-bit */
2281   {10, 10, 10, 10, 10},                 /* cost of unaligned stores.  */
2282   4, 4,                                 /* SSE->integer and integer->SSE moves */
2283   6, 6,                                 /* Gather load static, per_elt.  */
2284   6, 6,                                 /* Gather store static, per_elt.  */
2285   32,                                   /* size of l1 cache.  */
2286   256,                                  /* size of l2 cache.  */
2287   64,                                   /* size of prefetch block */
2288   6,                                    /* number of parallel prefetches */
2289   3,                                    /* Branch cost */
2290   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2291   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2292   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2293   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2294   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2295   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2296
2297   COSTS_N_INSNS (8),                    /* cost of cheap SSE instruction.  */
2298   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2299   COSTS_N_INSNS (8),                    /* cost of MULSS instruction.  */
2300   COSTS_N_INSNS (8),                    /* cost of MULSD instruction.  */
2301   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2302   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2303   COSTS_N_INSNS (20),                   /* cost of DIVSS instruction.  */
2304   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
2305   COSTS_N_INSNS (40),                   /* cost of SQRTSS instruction.  */
2306   COSTS_N_INSNS (40),                   /* cost of SQRTSD instruction.  */
2307   1, 4, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2308   intel_memcpy,
2309   intel_memset,
2310   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2311   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2312   "16",                                 /* Loop alignment.  */
2313   "16:8:8",                             /* Jump alignment.  */
2314   "0:0:8",                              /* Label alignment.  */
2315   "16",                                 /* Func alignment.  */
2316 };
2317
2318 /* Generic should produce code tuned for Core-i7 (and newer chips)
2319    and btver1 (and newer chips).  */
2320
2321 static stringop_algs generic_memcpy[2] = {  /* Strategy table referenced by generic_cost.  */
2322   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},  /* [0]: rep_prefix_4_byte in the middle range; presumably the 32-bit variant -- confirm.  */
2323              {-1, libcall, false}}},  /* -1 presumably means "no upper bound" -- confirm.  */
2324   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},  /* [1]: same shape as [0] but rep_prefix_8_byte; presumably the 64-bit variant -- confirm.  */
2325              {-1, libcall, false}}}};
2326 static stringop_algs generic_memset[2] = {  /* Strategy table referenced by generic_cost; same values as generic_memcpy.  */
2327   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},  /* [0]: rep_prefix_4_byte in the middle range; presumably the 32-bit variant -- confirm.  */
2328              {-1, libcall, false}}},  /* -1 presumably means "no upper bound" -- confirm.  */
2329   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},  /* [1]: same shape as [0] but rep_prefix_8_byte; presumably the 64-bit variant -- confirm.  */
2330              {-1, libcall, false}}}};
2331 static const
2332 struct processor_costs generic_cost = {/* costs for generic tuning; the initializers are positional */
2333   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2334   /* Setting cost to 2 makes our current implementation of synth_mult result in
2335      use of unnecessary temporary registers causing regression on several
2336      SPECfp benchmarks.  */
2337   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2338   COSTS_N_INSNS (1),                    /* variable shift costs */
2339   COSTS_N_INSNS (1),                    /* constant shift costs */
2340   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2341    COSTS_N_INSNS (4),                   /*                               HI */
2342    COSTS_N_INSNS (3),                   /*                               SI */
2343    COSTS_N_INSNS (4),                   /*                               DI */
2344    COSTS_N_INSNS (4)},                  /*                            other */
2345   0,                                    /* cost of multiply per each bit set */
2346   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI */
2347    COSTS_N_INSNS (22),                  /*                          HI */
2348    COSTS_N_INSNS (30),                  /*                          SI */
2349    COSTS_N_INSNS (74),                  /*                          DI */
2350    COSTS_N_INSNS (74)},                 /*                          other */
2351   COSTS_N_INSNS (1),                    /* cost of movsx */
2352   COSTS_N_INSNS (1),                    /* cost of movzx */
2353   8,                                    /* "large" insn */
2354   17,                                   /* MOVE_RATIO */
2355
2356   /* All move costs are relative to integer->integer move times 2 and thus
2357      they are latency*2. */
2358   6,                                 /* cost for loading QImode using movzbl */
2359   {6, 6, 6},                            /* cost of loading integer registers
2360                                            in QImode, HImode and SImode.
2361                                            Relative to reg-reg move (2).  */
2362   {6, 6, 6},                            /* cost of storing integer registers */
2363   4,                                    /* cost of reg,reg fld/fst */
2364   {6, 6, 12},                           /* cost of loading fp registers
2365                                            in SFmode, DFmode and XFmode */
2366   {6, 6, 12},                           /* cost of storing fp registers
2367                                            in SFmode, DFmode and XFmode */
2368   2,                                    /* cost of moving MMX register */
2369   {6, 6},                               /* cost of loading MMX registers
2370                                            in SImode and DImode */
2371   {6, 6},                               /* cost of storing MMX registers
2372                                            in SImode and DImode */
2373   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
2374   {6, 6, 6, 10, 15},                    /* cost of loading SSE registers
2375                                            in 32,64,128,256 and 512-bit */
2376   {6, 6, 6, 10, 15},                    /* cost of unaligned loads.  */
2377   {6, 6, 6, 10, 15},                    /* cost of storing SSE registers
2378                                            in 32,64,128,256 and 512-bit */
2379   {6, 6, 6, 10, 15},                    /* cost of unaligned stores.  */
2380   6, 6,                                 /* SSE->integer and integer->SSE moves */
2381   18, 6,                                /* Gather load static, per_elt.  */
2382   18, 6,                                /* Gather store static, per_elt.  */
2383   32,                                   /* size of l1 cache.  */
2384   512,                                  /* size of l2 cache.  */
2385   64,                                   /* size of prefetch block */
2386   6,                                    /* number of parallel prefetches */
2387   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2388      value is increased to the perhaps more appropriate value of 5.  */
2389   3,                                    /* Branch cost */
2390   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2391   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
2392   COSTS_N_INSNS (17),                   /* cost of FDIV instruction.  */
2393   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2394   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2395   COSTS_N_INSNS (14),                   /* cost of FSQRT instruction.  */
2396
2397   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2398   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2399   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2400   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2401   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2402   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2403   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
2404   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
2405   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
2406   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
2407   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
2408   generic_memcpy,                       /* stringop_algs table generic_memcpy.  */
2409   generic_memset,                       /* stringop_algs table generic_memset.  */
2410   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
2411   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
2412   "16:11:8",                            /* Loop alignment.  */
2413   "16:11:8",                            /* Jump alignment.  */
2414   "0:0:8",                              /* Label alignment.  */
2415   "16",                                 /* Func alignment.  */
2416 };
2417
2418 /* core_cost should produce code tuned for the Core family of CPUs.  */
2419 static stringop_algs core_memcpy[2] = {  /* Strategy table referenced by core_cost.  */
2420   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},  /* [0]: uses rep_prefix_4_byte; presumably the 32-bit variant -- confirm.  */
2421   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},  /* [1]: uses rep_prefix_8_byte; presumably the 64-bit variant -- confirm.  */
2422              {-1, libcall, false}}}};  /* -1 presumably means "no upper bound" -- confirm.  */
2423 static stringop_algs core_memset[2] = {  /* Strategy table referenced by core_cost.  */
2424   {libcall, {{6, loop_1_byte, true},  /* [0]: uses rep_prefix_4_byte below; presumably the 32-bit variant -- confirm.  */
2425              {24, loop, true},
2426              {8192, rep_prefix_4_byte, true},
2427              {-1, libcall, false}}},  /* -1 presumably means "no upper bound" -- confirm.  */
2428   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},  /* [1]: uses rep_prefix_8_byte; presumably the 64-bit variant -- confirm.  */
2429              {-1, libcall, false}}}};
2430
2431 static const
2432 struct processor_costs core_cost = {/* costs for tuning for the Core family of CPUs; the initializers are positional */
2433   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2434   /* On all chips taken into consideration lea takes 2 cycles or more.  With
2435      this cost however our current implementation of synth_mult results in
2436      use of unnecessary temporary registers causing regression on several
2437      SPECfp benchmarks.  */
2438   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2439   COSTS_N_INSNS (1),                    /* variable shift costs */
2440   COSTS_N_INSNS (1),                    /* constant shift costs */
2441   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2442    COSTS_N_INSNS (4),                   /*                               HI */
2443    COSTS_N_INSNS (3),                   /*                               SI */
2444    /* Here we tune for Sandybridge or newer.  */
2445    COSTS_N_INSNS (3),                   /*                               DI */
2446    COSTS_N_INSNS (3)},                  /*                            other */
2447   0,                                    /* cost of multiply per each bit set */
2448   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2449      model is not realistic. We compensate by increasing the latencies a bit.  */
2450   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
2451    COSTS_N_INSNS (11),                  /*                          HI */
2452    COSTS_N_INSNS (14),                  /*                          SI */
2453    COSTS_N_INSNS (81),                  /*                          DI */
2454    COSTS_N_INSNS (81)},                 /*                          other */
2455   COSTS_N_INSNS (1),                    /* cost of movsx */
2456   COSTS_N_INSNS (1),                    /* cost of movzx */
2457   8,                                    /* "large" insn */
2458   17,                                   /* MOVE_RATIO */
2459
2460   /* All move costs are relative to integer->integer move times 2 and thus
2461      they are latency*2. */
2462   6,                                 /* cost for loading QImode using movzbl */
2463   {4, 4, 4},                            /* cost of loading integer registers
2464                                            in QImode, HImode and SImode.
2465                                            Relative to reg-reg move (2).  */
2466   {6, 6, 6},                            /* cost of storing integer registers */
2467   2,                                    /* cost of reg,reg fld/fst */
2468   {6, 6, 8},                            /* cost of loading fp registers
2469                                            in SFmode, DFmode and XFmode */
2470   {6, 6, 10},                           /* cost of storing fp registers
2471                                            in SFmode, DFmode and XFmode */
2472   2,                                    /* cost of moving MMX register */
2473   {6, 6},                               /* cost of loading MMX registers
2474                                            in SImode and DImode */
2475   {6, 6},                               /* cost of storing MMX registers
2476                                            in SImode and DImode */
2477   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2478   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
2479                                            in 32,64,128,256 and 512-bit */
2480   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
2481   {6, 6, 6, 6, 12},                     /* cost of storing SSE registers
2482                                            in 32,64,128,256 and 512-bit */
2483   {6, 6, 6, 6, 12},                     /* cost of unaligned stores.  */
2484   2, 2,                                 /* SSE->integer and integer->SSE moves */
2485   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
2486      rec. throughput 6.  NOTE(review): the same mnemonic is named twice; the
2487      second is presumably VGATHERDPS -- confirm.  So 5 uops statically and one uop per load.  */
2488   10, 6,                                /* Gather load static, per_elt.  */
2489   10, 6,                                /* Gather store static, per_elt.  */
2490   64,                                   /* size of l1 cache.  */
2491   512,                                  /* size of l2 cache.  */
2492   64,                                   /* size of prefetch block */
2493   6,                                    /* number of parallel prefetches */
2494   /* FIXME perhaps more appropriate value is 5.  */
2495   3,                                    /* Branch cost */
2496   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2497   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
2498   /* 10-24 (presumably the FDIV latency range; the upper bound is used -- confirm) */
2499   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
2500   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2501   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2502   COSTS_N_INSNS (23),                   /* cost of FSQRT instruction.  */
2503
2504   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2505   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2506   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2507   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2508   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2509   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2510   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
2511   COSTS_N_INSNS (32),                   /* cost of DIVSD instruction.  */
2512   COSTS_N_INSNS (30),                   /* cost of SQRTSS instruction.  */
2513   COSTS_N_INSNS (58),                   /* cost of SQRTSD instruction.  */
2514   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2515   core_memcpy,                          /* stringop_algs table core_memcpy.  */
2516   core_memset,                          /* stringop_algs table core_memset.  */
2517   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2518   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2519   "16:11:8",                            /* Loop alignment.  */
2520   "16:11:8",                            /* Jump alignment.  */
2521   "0:0:8",                              /* Label alignment.  */
2522   "16",                                 /* Func alignment.  */
2523 };
2524