gcc/config/i386/x86-tune-costs.h

   1
   2 /* Processor costs (relative to an add) */
   3 /* Size cost unit: COSTS_N_BYTES (N) expands to (N) * 2.  We assume
        COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes, so under
        that assumption COSTS_N_BYTES (2) equals COSTS_N_INSNS (1).  */
   4 #define COSTS_N_BYTES(N) ((N) * 2)
   5
   /* Placeholder stringop_algs initializer that names libcall in both of its
      algorithm slots.  The per-CPU strategy arrays visible in this file use
      it as their second element.  */
   6 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
   7
   /* memcpy strategy table referenced by ix86_size_cost.  Both elements name
      rep_prefix_1_byte everywhere.
      NOTE(review): each inner {-1, alg, false} triple is presumably
      {max block size, algorithm, flag} with -1 as the catch-all size, and the
      two array elements presumably cover two target variants -- confirm
      against the stringop_algs declaration.  */
   8 static stringop_algs ix86_size_memcpy[2] = {
   9   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  10   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
  /* memset strategy table referenced by ix86_size_cost.  Its contents are
     the same as the size-tuning memcpy table: both elements name
     rep_prefix_1_byte everywhere.
     NOTE(review): the meaning of the -1 / false fields and of the two array
     elements is not visible here -- confirm against the stringop_algs
     declaration.  */
  11 static stringop_algs ix86_size_memset[2] = {
  12   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  13   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
  14
  /* Cost table used when tuning for code size.  The initializers are
     positional, so their order has to follow the field order of struct
     processor_costs (declared outside this file chunk); the trailing comment
     on each entry names the field it fills.  Instruction costs are written
     in COSTS_N_BYTES units instead of COSTS_N_INSNS, and the cache and
     prefetch entries are all zero.  Unlike the per-CPU tables in this file,
     this one is not declared static.  */
  15 const
  16 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
  17   COSTS_N_BYTES (2),                    /* cost of an add instruction */
  18   COSTS_N_BYTES (3),                    /* cost of a lea instruction */
  19   COSTS_N_BYTES (2),                    /* variable shift costs */
  20   COSTS_N_BYTES (3),                    /* constant shift costs */
  21   {COSTS_N_BYTES (3),                   /* cost of starting multiply for QI */
  22    COSTS_N_BYTES (3),                   /*                               HI */
  23    COSTS_N_BYTES (3),                   /*                               SI */
  24    COSTS_N_BYTES (3),                   /*                               DI */
  25    COSTS_N_BYTES (5)},                  /*                            other */
  26   0,                                    /* cost of multiply per each bit set */
  27   {COSTS_N_BYTES (3),                   /* cost of a divide/mod for QI */
  28    COSTS_N_BYTES (3),                   /*                          HI */
  29    COSTS_N_BYTES (3),                   /*                          SI */
  30    COSTS_N_BYTES (3),                   /*                          DI */
  31    COSTS_N_BYTES (5)},                  /*                          other */
  32   COSTS_N_BYTES (3),                    /* cost of movsx */
  33   COSTS_N_BYTES (3),                    /* cost of movzx */
  34   0,                                    /* "large" insn */
  35   2,                                    /* MOVE_RATIO */
  36   2,                                 /* cost for loading QImode using movzbl */
  37   {2, 2, 2},                            /* cost of loading integer registers
  38                                            in QImode, HImode and SImode.
  39                                            Relative to reg-reg move (2).  */
  40   {2, 2, 2},                            /* cost of storing integer registers */
  41   2,                                    /* cost of reg,reg fld/fst */
  42   {2, 2, 2},                            /* cost of loading fp registers
  43                                            in SFmode, DFmode and XFmode */
  44   {2, 2, 2},                            /* cost of storing fp registers
  45                                            in SFmode, DFmode and XFmode */
  46   3,                                    /* cost of moving MMX register */
  47   {3, 3},                               /* cost of loading MMX registers
  48                                            in SImode and DImode */
  49   {3, 3},                               /* cost of storing MMX registers
  50                                            in SImode and DImode */
  51   3,                                    /* cost of moving SSE register */
  52   {3, 3, 3},                            /* cost of loading SSE registers
  53                                            in SImode, DImode and TImode */
  54   {3, 3, 3},                            /* cost of storing SSE registers
  55                                            in SImode, DImode and TImode */
  56   3,                                    /* MMX or SSE register to integer */
  57   0,                                    /* size of l1 cache  */
  58   0,                                    /* size of l2 cache  */
  59   0,                                    /* size of prefetch block */
  60   0,                                    /* number of parallel prefetches */
  61   2,                                    /* Branch cost */
  62   COSTS_N_BYTES (2),                    /* cost of FADD and FSUB insns.  */
  63   COSTS_N_BYTES (2),                    /* cost of FMUL instruction.  */
  64   COSTS_N_BYTES (2),                    /* cost of FDIV instruction.  */
  65   COSTS_N_BYTES (2),                    /* cost of FABS instruction.  */
  66   COSTS_N_BYTES (2),                    /* cost of FCHS instruction.  */
  67   COSTS_N_BYTES (2),                    /* cost of FSQRT instruction.  */
  68   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
  69   ix86_size_memcpy,                     /* memcpy strategy table.  */
  70   ix86_size_memset,                     /* memset strategy table.  */
  71   1,                                    /* scalar_stmt_cost.  */
  72   1,                                    /* scalar_load_cost.  */
  73   1,                                    /* scalar_store_cost.  */
  74   1,                                    /* vec_stmt_cost.  */
  75   1,                                    /* vec_to_scalar_cost.  */
  76   1,                                    /* scalar_to_vec_cost.  */
  77   1,                                    /* vec_align_load_cost.  */
  78   1,                                    /* vec_unalign_load_cost.  */
  79   1,                                    /* vec_store_cost.  */
  80   1,                                    /* cond_taken_branch_cost.  */
  81   1,                                    /* cond_not_taken_branch_cost.  */
  82 };
  83
  84 /* Processor costs (relative to an add) */
  /* memcpy strategy table referenced by i386_cost.  The first element names
     rep_prefix_1_byte everywhere; the second is the DUMMY_STRINGOP_ALGS
     placeholder.
     NOTE(review): the two elements presumably cover two target variants and
     -1 is presumably the catch-all size -- confirm against the
     stringop_algs declaration.  */
  85 static stringop_algs i386_memcpy[2] = {
  86   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  87   DUMMY_STRINGOP_ALGS};
  /* memset strategy table referenced by i386_cost.  Same contents as the
     386 memcpy table: rep_prefix_1_byte in the first element and the
     DUMMY_STRINGOP_ALGS placeholder in the second.  */
  88 static stringop_algs i386_memset[2] = {
  89   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  90   DUMMY_STRINGOP_ALGS};
  91
  /* Cost table for the 386.  The initializers are positional, so their order
     has to follow the field order of struct processor_costs (declared
     outside this file chunk); the trailing comment on each entry names the
     field it fills.  Instruction costs are in COSTS_N_INSNS units and the
     cache and prefetch entries are all zero.  */
  92 static const
  93 struct processor_costs i386_cost = {    /* 386 specific costs */
  94   COSTS_N_INSNS (1),                    /* cost of an add instruction */
  95   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
  96   COSTS_N_INSNS (3),                    /* variable shift costs */
  97   COSTS_N_INSNS (2),                    /* constant shift costs */
  98   {COSTS_N_INSNS (6),                   /* cost of starting multiply for QI */
  99    COSTS_N_INSNS (6),                   /*                               HI */
 100    COSTS_N_INSNS (6),                   /*                               SI */
 101    COSTS_N_INSNS (6),                   /*                               DI */
 102    COSTS_N_INSNS (6)},                  /*                            other */
 103   COSTS_N_INSNS (1),                    /* cost of multiply per each bit set */
 104   {COSTS_N_INSNS (23),                  /* cost of a divide/mod for QI */
 105    COSTS_N_INSNS (23),                  /*                          HI */
 106    COSTS_N_INSNS (23),                  /*                          SI */
 107    COSTS_N_INSNS (23),                  /*                          DI */
 108    COSTS_N_INSNS (23)},                 /*                          other */
 109   COSTS_N_INSNS (3),                    /* cost of movsx */
 110   COSTS_N_INSNS (2),                    /* cost of movzx */
 111   15,                                   /* "large" insn */
 112   3,                                    /* MOVE_RATIO */
 113   4,                                 /* cost for loading QImode using movzbl */
 114   {2, 4, 2},                            /* cost of loading integer registers
 115                                            in QImode, HImode and SImode.
 116                                            Relative to reg-reg move (2).  */
 117   {2, 4, 2},                            /* cost of storing integer registers */
 118   2,                                    /* cost of reg,reg fld/fst */
 119   {8, 8, 8},                            /* cost of loading fp registers
 120                                            in SFmode, DFmode and XFmode */
 121   {8, 8, 8},                            /* cost of storing fp registers
 122                                            in SFmode, DFmode and XFmode */
 123   2,                                    /* cost of moving MMX register */
 124   {4, 8},                               /* cost of loading MMX registers
 125                                            in SImode and DImode */
 126   {4, 8},                               /* cost of storing MMX registers
 127                                            in SImode and DImode */
 128   2,                                    /* cost of moving SSE register */
 129   {4, 8, 16},                           /* cost of loading SSE registers
 130                                            in SImode, DImode and TImode */
 131   {4, 8, 16},                           /* cost of storing SSE registers
 132                                            in SImode, DImode and TImode */
 133   3,                                    /* MMX or SSE register to integer */
 134   0,                                    /* size of l1 cache  */
 135   0,                                    /* size of l2 cache  */
 136   0,                                    /* size of prefetch block */
 137   0,                                    /* number of parallel prefetches */
 138   1,                                    /* Branch cost */
 139   COSTS_N_INSNS (23),                   /* cost of FADD and FSUB insns.  */
 140   COSTS_N_INSNS (27),                   /* cost of FMUL instruction.  */
 141   COSTS_N_INSNS (88),                   /* cost of FDIV instruction.  */
 142   COSTS_N_INSNS (22),                   /* cost of FABS instruction.  */
 143   COSTS_N_INSNS (24),                   /* cost of FCHS instruction.  */
 144   COSTS_N_INSNS (122),                  /* cost of FSQRT instruction.  */
 145   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 146   i386_memcpy,                          /* memcpy strategy table.  */
 147   i386_memset,                          /* memset strategy table.  */
 148   1,                                    /* scalar_stmt_cost.  */
 149   1,                                    /* scalar_load_cost.  */
 150   1,                                    /* scalar_store_cost.  */
 151   1,                                    /* vec_stmt_cost.  */
 152   1,                                    /* vec_to_scalar_cost.  */
 153   1,                                    /* scalar_to_vec_cost.  */
 154   1,                                    /* vec_align_load_cost.  */
 155   2,                                    /* vec_unalign_load_cost.  */
 156   1,                                    /* vec_store_cost.  */
 157   3,                                    /* cond_taken_branch_cost.  */
 158   1,                                    /* cond_not_taken_branch_cost.  */
 159 };
 160
 /* memcpy strategy table referenced by i486_cost.  The first element names
    rep_prefix_4_byte everywhere; the second is the DUMMY_STRINGOP_ALGS
    placeholder.
    NOTE(review): the two elements presumably cover two target variants and
    -1 is presumably the catch-all size -- confirm against the
    stringop_algs declaration.  */
 161 static stringop_algs i486_memcpy[2] = {
 162   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 163   DUMMY_STRINGOP_ALGS};
 /* memset strategy table referenced by i486_cost.  Same contents as the
    486 memcpy table: rep_prefix_4_byte in the first element and the
    DUMMY_STRINGOP_ALGS placeholder in the second.  */
 164 static stringop_algs i486_memset[2] = {
 165   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 166   DUMMY_STRINGOP_ALGS};
 167
 /* Cost table for the 486.  The initializers are positional, so their order
    has to follow the field order of struct processor_costs (declared
    outside this file chunk); the trailing comment on each entry names the
    field it fills.  Instruction costs are in COSTS_N_INSNS units; the
    prefetch entries are zero.  */
 168 static const
 169 struct processor_costs i486_cost = {    /* 486 specific costs */
 170   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 171   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 172   COSTS_N_INSNS (3),                    /* variable shift costs */
 173   COSTS_N_INSNS (2),                    /* constant shift costs */
 174   {COSTS_N_INSNS (12),                  /* cost of starting multiply for QI */
 175    COSTS_N_INSNS (12),                  /*                               HI */
 176    COSTS_N_INSNS (12),                  /*                               SI */
 177    COSTS_N_INSNS (12),                  /*                               DI */
 178    COSTS_N_INSNS (12)},                 /*                            other */
 179   1,                                    /* cost of multiply per each bit set.
                                               NOTE(review): this is a bare 1,
                                               whereas i386_cost writes
                                               COSTS_N_INSNS (1) for the same
                                               entry -- confirm it is intended.  */
 180   {COSTS_N_INSNS (40),                  /* cost of a divide/mod for QI */
 181    COSTS_N_INSNS (40),                  /*                          HI */
 182    COSTS_N_INSNS (40),                  /*                          SI */
 183    COSTS_N_INSNS (40),                  /*                          DI */
 184    COSTS_N_INSNS (40)},                 /*                          other */
 185   COSTS_N_INSNS (3),                    /* cost of movsx */
 186   COSTS_N_INSNS (2),                    /* cost of movzx */
 187   15,                                   /* "large" insn */
 188   3,                                    /* MOVE_RATIO */
 189   4,                                 /* cost for loading QImode using movzbl */
 190   {2, 4, 2},                            /* cost of loading integer registers
 191                                            in QImode, HImode and SImode.
 192                                            Relative to reg-reg move (2).  */
 193   {2, 4, 2},                            /* cost of storing integer registers */
 194   2,                                    /* cost of reg,reg fld/fst */
 195   {8, 8, 8},                            /* cost of loading fp registers
 196                                            in SFmode, DFmode and XFmode */
 197   {8, 8, 8},                            /* cost of storing fp registers
 198                                            in SFmode, DFmode and XFmode */
 199   2,                                    /* cost of moving MMX register */
 200   {4, 8},                               /* cost of loading MMX registers
 201                                            in SImode and DImode */
 202   {4, 8},                               /* cost of storing MMX registers
 203                                            in SImode and DImode */
 204   2,                                    /* cost of moving SSE register */
 205   {4, 8, 16},                           /* cost of loading SSE registers
 206                                            in SImode, DImode and TImode */
 207   {4, 8, 16},                           /* cost of storing SSE registers
 208                                            in SImode, DImode and TImode */
 209   3,                                    /* MMX or SSE register to integer */
 210   4,                                    /* size of l1 cache.  486 has 8kB cache
 211                                            shared for code and data, so 4kB is
 212                                            not really precise.  */
 213   4,                                    /* size of l2 cache  */
 214   0,                                    /* size of prefetch block */
 215   0,                                    /* number of parallel prefetches */
 216   1,                                    /* Branch cost */
 217   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
 218   COSTS_N_INSNS (16),                   /* cost of FMUL instruction.  */
 219   COSTS_N_INSNS (73),                   /* cost of FDIV instruction.  */
 220   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
 221   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
 222   COSTS_N_INSNS (83),                   /* cost of FSQRT instruction.  */
 223   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 224   i486_memcpy,                          /* memcpy strategy table.  */
 225   i486_memset,                          /* memset strategy table.  */
 226   1,                                    /* scalar_stmt_cost.  */
 227   1,                                    /* scalar_load_cost.  */
 228   1,                                    /* scalar_store_cost.  */
 229   1,                                    /* vec_stmt_cost.  */
 230   1,                                    /* vec_to_scalar_cost.  */
 231   1,                                    /* scalar_to_vec_cost.  */
 232   1,                                    /* vec_align_load_cost.  */
 233   2,                                    /* vec_unalign_load_cost.  */
 234   1,                                    /* vec_store_cost.  */
 235   3,                                    /* cond_taken_branch_cost.  */
 236   1,                                    /* cond_not_taken_branch_cost.  */
 237 };
 238
 /* memcpy strategy table referenced by pentium_cost and lakemont_cost.  The
    first element lists rep_prefix_4_byte with 256 and libcall with -1; the
    second is the DUMMY_STRINGOP_ALGS placeholder.
    NOTE(review): the leading numbers are presumably upper bounds on the
    block size, with -1 as the catch-all -- confirm against the
    stringop_algs declaration.  */
 239 static stringop_algs pentium_memcpy[2] = {
 240   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 241   DUMMY_STRINGOP_ALGS};
 /* memset strategy table referenced by pentium_cost and lakemont_cost.  The
    first element pairs libcall with a single {-1, rep_prefix_4_byte, false}
    entry; the second is the DUMMY_STRINGOP_ALGS placeholder.
    NOTE(review): -1 is presumably the catch-all block size -- confirm
    against the stringop_algs declaration.  */
 242 static stringop_algs pentium_memset[2] = {
 243   {libcall, {{-1, rep_prefix_4_byte, false}}},
 244   DUMMY_STRINGOP_ALGS};
 245
 /* Cost table for the Pentium.  The initializers are positional, so their
    order has to follow the field order of struct processor_costs (declared
    outside this file chunk); the trailing comment on each entry names the
    field it fills.  Instruction costs are in COSTS_N_INSNS units; the
    prefetch entries are zero.  */
 246 static const
 247 struct processor_costs pentium_cost = {
 248   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 249   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 250   COSTS_N_INSNS (4),                    /* variable shift costs */
 251   COSTS_N_INSNS (1),                    /* constant shift costs */
 252   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 253    COSTS_N_INSNS (11),                  /*                               HI */
 254    COSTS_N_INSNS (11),                  /*                               SI */
 255    COSTS_N_INSNS (11),                  /*                               DI */
 256    COSTS_N_INSNS (11)},                 /*                            other */
 257   0,                                    /* cost of multiply per each bit set */
 258   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 259    COSTS_N_INSNS (25),                  /*                          HI */
 260    COSTS_N_INSNS (25),                  /*                          SI */
 261    COSTS_N_INSNS (25),                  /*                          DI */
 262    COSTS_N_INSNS (25)},                 /*                          other */
 263   COSTS_N_INSNS (3),                    /* cost of movsx */
 264   COSTS_N_INSNS (2),                    /* cost of movzx */
 265   8,                                    /* "large" insn */
 266   6,                                    /* MOVE_RATIO */
 267   6,                                 /* cost for loading QImode using movzbl */
 268   {2, 4, 2},                            /* cost of loading integer registers
 269                                            in QImode, HImode and SImode.
 270                                            Relative to reg-reg move (2).  */
 271   {2, 4, 2},                            /* cost of storing integer registers */
 272   2,                                    /* cost of reg,reg fld/fst */
 273   {2, 2, 6},                            /* cost of loading fp registers
 274                                            in SFmode, DFmode and XFmode */
 275   {4, 4, 6},                            /* cost of storing fp registers
 276                                            in SFmode, DFmode and XFmode */
 277   8,                                    /* cost of moving MMX register */
 278   {8, 8},                               /* cost of loading MMX registers
 279                                            in SImode and DImode */
 280   {8, 8},                               /* cost of storing MMX registers
 281                                            in SImode and DImode */
 282   2,                                    /* cost of moving SSE register */
 283   {4, 8, 16},                           /* cost of loading SSE registers
 284                                            in SImode, DImode and TImode */
 285   {4, 8, 16},                           /* cost of storing SSE registers
 286                                            in SImode, DImode and TImode */
 287   3,                                    /* MMX or SSE register to integer */
 288   8,                                    /* size of l1 cache.  */
 289   8,                                    /* size of l2 cache  */
 290   0,                                    /* size of prefetch block */
 291   0,                                    /* number of parallel prefetches */
 292   2,                                    /* Branch cost */
 293   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 294   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 295   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 296   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 297   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 298   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 299   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 300   pentium_memcpy,                       /* memcpy strategy table.  */
 301   pentium_memset,                       /* memset strategy table.  */
 302   1,                                    /* scalar_stmt_cost.  */
 303   1,                                    /* scalar_load_cost.  */
 304   1,                                    /* scalar_store_cost.  */
 305   1,                                    /* vec_stmt_cost.  */
 306   1,                                    /* vec_to_scalar_cost.  */
 307   1,                                    /* scalar_to_vec_cost.  */
 308   1,                                    /* vec_align_load_cost.  */
 309   2,                                    /* vec_unalign_load_cost.  */
 310   1,                                    /* vec_store_cost.  */
 311   3,                                    /* cond_taken_branch_cost.  */
 312   1,                                    /* cond_not_taken_branch_cost.  */
 313 };
 314
 /* Cost table for Lakemont.  The initializers are positional, so their order
    has to follow the field order of struct processor_costs (declared
    outside this file chunk); the trailing comment on each entry names the
    field it fills.  This table has no string-operation tables of its own:
    it reuses pentium_memcpy and pentium_memset.  Its values match
    pentium_cost except for three entries: the lea cost, the variable shift
    cost and MOVE_RATIO.  */
 315 static const
 316 struct processor_costs lakemont_cost = {
 317   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 318   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction
                                               (one unit above an add) */
 319   COSTS_N_INSNS (1),                    /* variable shift costs */
 320   COSTS_N_INSNS (1),                    /* constant shift costs */
 321   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 322    COSTS_N_INSNS (11),                  /*                               HI */
 323    COSTS_N_INSNS (11),                  /*                               SI */
 324    COSTS_N_INSNS (11),                  /*                               DI */
 325    COSTS_N_INSNS (11)},                 /*                            other */
 326   0,                                    /* cost of multiply per each bit set */
 327   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 328    COSTS_N_INSNS (25),                  /*                          HI */
 329    COSTS_N_INSNS (25),                  /*                          SI */
 330    COSTS_N_INSNS (25),                  /*                          DI */
 331    COSTS_N_INSNS (25)},                 /*                          other */
 332   COSTS_N_INSNS (3),                    /* cost of movsx */
 333   COSTS_N_INSNS (2),                    /* cost of movzx */
 334   8,                                    /* "large" insn */
 335   17,                                   /* MOVE_RATIO */
 336   6,                                 /* cost for loading QImode using movzbl */
 337   {2, 4, 2},                            /* cost of loading integer registers
 338                                            in QImode, HImode and SImode.
 339                                            Relative to reg-reg move (2).  */
 340   {2, 4, 2},                            /* cost of storing integer registers */
 341   2,                                    /* cost of reg,reg fld/fst */
 342   {2, 2, 6},                            /* cost of loading fp registers
 343                                            in SFmode, DFmode and XFmode */
 344   {4, 4, 6},                            /* cost of storing fp registers
 345                                            in SFmode, DFmode and XFmode */
 346   8,                                    /* cost of moving MMX register */
 347   {8, 8},                               /* cost of loading MMX registers
 348                                            in SImode and DImode */
 349   {8, 8},                               /* cost of storing MMX registers
 350                                            in SImode and DImode */
 351   2,                                    /* cost of moving SSE register */
 352   {4, 8, 16},                           /* cost of loading SSE registers
 353                                            in SImode, DImode and TImode */
 354   {4, 8, 16},                           /* cost of storing SSE registers
 355                                            in SImode, DImode and TImode */
 356   3,                                    /* MMX or SSE register to integer */
 357   8,                                    /* size of l1 cache.  */
 358   8,                                    /* size of l2 cache  */
 359   0,                                    /* size of prefetch block */
 360   0,                                    /* number of parallel prefetches */
 361   2,                                    /* Branch cost */
 362   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 363   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 364   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 365   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 366   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 367   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 368   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 369   pentium_memcpy,                       /* memcpy strategy table (shared
                                               with pentium_cost).  */
 370   pentium_memset,                       /* memset strategy table (shared
                                               with pentium_cost).  */
 371   1,                                    /* scalar_stmt_cost.  */
 372   1,                                    /* scalar_load_cost.  */
 373   1,                                    /* scalar_store_cost.  */
 374   1,                                    /* vec_stmt_cost.  */
 375   1,                                    /* vec_to_scalar_cost.  */
 376   1,                                    /* scalar_to_vec_cost.  */
 377   1,                                    /* vec_align_load_cost.  */
 378   2,                                    /* vec_unalign_load_cost.  */
 379   1,                                    /* vec_store_cost.  */
 380   3,                                    /* cond_taken_branch_cost.  */
 381   1,                                    /* cond_not_taken_branch_cost.  */
 382 };
 383
 384 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 385    (we ensure the alignment).  For small blocks an inline loop is still a
 386    noticeable win; for bigger blocks either rep movsl or rep movsb is the
 387    way to go.  Rep movsb apparently has a more expensive startup time in
 388    the CPU, but after 4K the difference is down in the noise.  */
 /* memcpy strategy table referenced by pentiumpro_cost.  The first element
    lists loop with 128, unrolled_loop with 1024, rep_prefix_4_byte with 8192
    and rep_prefix_1_byte with -1; the second element is the
    DUMMY_STRINGOP_ALGS placeholder.
    NOTE(review): the leading numbers are presumably upper bounds on the
    block size in bytes, with -1 as the catch-all -- confirm against the
    stringop_algs declaration.  */
 389 static stringop_algs pentiumpro_memcpy[2] = {
 390   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
 391                        {8192, rep_prefix_4_byte, false},
 392                        {-1, rep_prefix_1_byte, false}}},
 393   DUMMY_STRINGOP_ALGS};
 /* memset strategy table referenced by pentiumpro_cost.  The first element
    lists unrolled_loop with 1024, rep_prefix_4_byte with 8192 and libcall
    with -1; the second element is the DUMMY_STRINGOP_ALGS placeholder.
    NOTE(review): the leading numbers are presumably upper bounds on the
    block size in bytes, with -1 as the catch-all -- confirm against the
    stringop_algs declaration.  */
 394 static stringop_algs pentiumpro_memset[2] = {
 395   {rep_prefix_4_byte, {{1024, unrolled_loop, false},
 396                        {8192, rep_prefix_4_byte, false},
 397                        {-1, libcall, false}}},
 398   DUMMY_STRINGOP_ALGS};
 /* Cost table for the PentiumPro.  The initializers are positional, so their
    order has to follow the field order of struct processor_costs (declared
    outside this file chunk); the trailing comment on each entry names the
    field it fills.  Instruction costs are in COSTS_N_INSNS units.  The
    prefetch block size and parallel prefetch count are non-zero here.  */
 399 static const
 400 struct processor_costs pentiumpro_cost = {
 401   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 402   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 403   COSTS_N_INSNS (1),                    /* variable shift costs */
 404   COSTS_N_INSNS (1),                    /* constant shift costs */
 405   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
 406    COSTS_N_INSNS (4),                   /*                               HI */
 407    COSTS_N_INSNS (4),                   /*                               SI */
 408    COSTS_N_INSNS (4),                   /*                               DI */
 409    COSTS_N_INSNS (4)},                  /*                            other */
 410   0,                                    /* cost of multiply per each bit set */
 411   {COSTS_N_INSNS (17),                  /* cost of a divide/mod for QI */
 412    COSTS_N_INSNS (17),                  /*                          HI */
 413    COSTS_N_INSNS (17),                  /*                          SI */
 414    COSTS_N_INSNS (17),                  /*                          DI */
 415    COSTS_N_INSNS (17)},                 /*                          other */
 416   COSTS_N_INSNS (1),                    /* cost of movsx */
 417   COSTS_N_INSNS (1),                    /* cost of movzx */
 418   8,                                    /* "large" insn */
 419   6,                                    /* MOVE_RATIO */
 420   2,                                 /* cost for loading QImode using movzbl */
 421   {4, 4, 4},                            /* cost of loading integer registers
 422                                            in QImode, HImode and SImode.
 423                                            Relative to reg-reg move (2).  */
 424   {2, 2, 2},                            /* cost of storing integer registers */
 425   2,                                    /* cost of reg,reg fld/fst */
 426   {2, 2, 6},                            /* cost of loading fp registers
 427                                            in SFmode, DFmode and XFmode */
 428   {4, 4, 6},                            /* cost of storing fp registers
 429                                            in SFmode, DFmode and XFmode */
 430   2,                                    /* cost of moving MMX register */
 431   {2, 2},                               /* cost of loading MMX registers
 432                                            in SImode and DImode */
 433   {2, 2},                               /* cost of storing MMX registers
 434                                            in SImode and DImode */
 435   2,                                    /* cost of moving SSE register */
 436   {2, 2, 8},                            /* cost of loading SSE registers
 437                                            in SImode, DImode and TImode */
 438   {2, 2, 8},                            /* cost of storing SSE registers
 439                                            in SImode, DImode and TImode */
 440   3,                                    /* MMX or SSE register to integer */
 441   8,                                    /* size of l1 cache.  */
 442   256,                                  /* size of l2 cache  */
 443   32,                                   /* size of prefetch block */
 444   6,                                    /* number of parallel prefetches */
 445   2,                                    /* Branch cost */
 446   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 447   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
 448   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 449   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 450   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 451   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 452   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 453   pentiumpro_memcpy,                    /* memcpy strategy table.  */
 454   pentiumpro_memset,                    /* memset strategy table.  */
 455   1,                                    /* scalar_stmt_cost.  */
 456   1,                                    /* scalar_load_cost.  */
 457   1,                                    /* scalar_store_cost.  */
 458   1,                                    /* vec_stmt_cost.  */
 459   1,                                    /* vec_to_scalar_cost.  */
 460   1,                                    /* scalar_to_vec_cost.  */
 461   1,                                    /* vec_align_load_cost.  */
 462   2,                                    /* vec_unalign_load_cost.  */
 463   1,                                    /* vec_store_cost.  */
 464   3,                                    /* cond_taken_branch_cost.  */
 465   1,                                    /* cond_not_taken_branch_cost.  */
 466 };
 467
 468 static stringop_algs geode_memcpy[2] = {
       /* memcpy strategy table listed in geode_cost.  Element 0 defaults to
          libcall and carries two entries: rep_prefix_4_byte tagged 256 and
          libcall tagged -1 (presumably size limits, with -1 meaning "no
          limit" -- confirm against the stringop_algs declaration).  Element 1
          is the DUMMY_STRINGOP_ALGS placeholder, a libcall-only table.  */
 469   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 470   DUMMY_STRINGOP_ALGS};
 471 static stringop_algs geode_memset[2] = {
       /* memset strategy table listed in geode_cost.  Element 0 defaults to
          libcall and carries two entries: rep_prefix_4_byte tagged 256 and
          libcall tagged -1 (presumably size limits, with -1 meaning "no
          limit" -- confirm against the stringop_algs declaration).  Element 1
          is the DUMMY_STRINGOP_ALGS placeholder, a libcall-only table.  */
 472   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 473   DUMMY_STRINGOP_ALGS};
 474 static const
 475 struct processor_costs geode_cost = {
       /* Cost table for geode.  The initializer is positional, so the entry
          order must follow the field order of struct processor_costs; the
          trailing comment on each entry names the field it fills.  */
 476   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 477   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 478   COSTS_N_INSNS (2),                    /* variable shift costs */
 479   COSTS_N_INSNS (1),                    /* constant shift costs */
 480   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 481    COSTS_N_INSNS (4),                   /*                               HI */
 482    COSTS_N_INSNS (7),                   /*                               SI */
 483    COSTS_N_INSNS (7),                   /*                               DI */
 484    COSTS_N_INSNS (7)},                  /*                            other */
 485   0,                                    /* cost of multiply per each bit set */
 486   {COSTS_N_INSNS (15),                  /* cost of a divide/mod for QI */
 487    COSTS_N_INSNS (23),                  /*                          HI */
 488    COSTS_N_INSNS (39),                  /*                          SI */
 489    COSTS_N_INSNS (39),                  /*                          DI */
 490    COSTS_N_INSNS (39)},                 /*                          other */
 491   COSTS_N_INSNS (1),                    /* cost of movsx */
 492   COSTS_N_INSNS (1),                    /* cost of movzx */
 493   8,                                    /* "large" insn */
 494   4,                                    /* MOVE_RATIO */
 495   1,                                 /* cost for loading QImode using movzbl */
 496   {1, 1, 1},                            /* cost of loading integer registers
 497                                            in QImode, HImode and SImode.
 498                                            Relative to reg-reg move (2).  */
 499   {1, 1, 1},                            /* cost of storing integer registers */
 500   1,                                    /* cost of reg,reg fld/fst */
 501   {1, 1, 1},                            /* cost of loading fp registers
 502                                            in SFmode, DFmode and XFmode */
 503   {4, 6, 6},                            /* cost of storing fp registers
 504                                            in SFmode, DFmode and XFmode */
 505
 506   2,                                    /* cost of moving MMX register */
 507   {2, 2},                               /* cost of loading MMX registers
 508                                            in SImode and DImode */
 509   {2, 2},                               /* cost of storing MMX registers
 510                                            in SImode and DImode */
 511   2,                                    /* cost of moving SSE register */
 512   {2, 2, 8},                            /* cost of loading SSE registers
 513                                            in SImode, DImode and TImode */
 514   {2, 2, 8},                            /* cost of storing SSE registers
 515                                            in SImode, DImode and TImode */
 516   3,                                    /* MMX or SSE register to integer */
 517   64,                                   /* size of l1 cache.  */
 518   128,                                  /* size of l2 cache.  */
 519   32,                                   /* size of prefetch block */
 520   1,                                    /* number of parallel prefetches */
 521   1,                                    /* Branch cost */
 522   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
 523   COSTS_N_INSNS (11),                   /* cost of FMUL instruction.  */
 524   COSTS_N_INSNS (47),                   /* cost of FDIV instruction.  */
 525   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 526   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 527   COSTS_N_INSNS (54),                   /* cost of FSQRT instruction.  */
 528   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 529   geode_memcpy,
 530   geode_memset,
 531   1,                                    /* scalar_stmt_cost.  */
 532   1,                                    /* scalar_load_cost.  */
 533   1,                                    /* scalar_store_cost.  */
 534   1,                                    /* vec_stmt_cost.  */
 535   1,                                    /* vec_to_scalar_cost.  */
 536   1,                                    /* scalar_to_vec_cost.  */
 537   1,                                    /* vec_align_load_cost.  */
 538   2,                                    /* vec_unalign_load_cost.  */
 539   1,                                    /* vec_store_cost.  */
 540   3,                                    /* cond_taken_branch_cost.  */
 541   1,                                    /* cond_not_taken_branch_cost.  */
 542 };
 543
 544 static stringop_algs k6_memcpy[2] = {
       /* memcpy strategy table listed in k6_cost.  Element 0 defaults to
          libcall and carries two entries: rep_prefix_4_byte tagged 256 and
          libcall tagged -1 (presumably size limits, with -1 meaning "no
          limit" -- confirm against the stringop_algs declaration).  Element 1
          is the DUMMY_STRINGOP_ALGS placeholder, a libcall-only table.  */
 545   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 546   DUMMY_STRINGOP_ALGS};
 547 static stringop_algs k6_memset[2] = {
       /* memset strategy table listed in k6_cost.  Element 0 defaults to
          libcall and carries two entries: rep_prefix_4_byte tagged 256 and
          libcall tagged -1 (presumably size limits, with -1 meaning "no
          limit" -- confirm against the stringop_algs declaration).  Element 1
          is the DUMMY_STRINGOP_ALGS placeholder, a libcall-only table.  */
 548   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 549   DUMMY_STRINGOP_ALGS};
 550 static const
 551 struct processor_costs k6_cost = {
       /* Cost table for k6.  The initializer is positional, so the entry
          order must follow the field order of struct processor_costs; the
          trailing comment on each entry names the field it fills.  */
 552   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 553   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 554   COSTS_N_INSNS (1),                    /* variable shift costs */
 555   COSTS_N_INSNS (1),                    /* constant shift costs */
 556   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 557    COSTS_N_INSNS (3),                   /*                               HI */
 558    COSTS_N_INSNS (3),                   /*                               SI */
 559    COSTS_N_INSNS (3),                   /*                               DI */
 560    COSTS_N_INSNS (3)},                  /*                            other */
 561   0,                                    /* cost of multiply per each bit set */
 562   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 563    COSTS_N_INSNS (18),                  /*                          HI */
 564    COSTS_N_INSNS (18),                  /*                          SI */
 565    COSTS_N_INSNS (18),                  /*                          DI */
 566    COSTS_N_INSNS (18)},                 /*                          other */
 567   COSTS_N_INSNS (2),                    /* cost of movsx */
 568   COSTS_N_INSNS (2),                    /* cost of movzx */
 569   8,                                    /* "large" insn */
 570   4,                                    /* MOVE_RATIO */
 571   3,                                 /* cost for loading QImode using movzbl */
 572   {4, 5, 4},                            /* cost of loading integer registers
 573                                            in QImode, HImode and SImode.
 574                                            Relative to reg-reg move (2).  */
 575   {2, 3, 2},                            /* cost of storing integer registers */
 576   4,                                    /* cost of reg,reg fld/fst */
 577   {6, 6, 6},                            /* cost of loading fp registers
 578                                            in SFmode, DFmode and XFmode */
 579   {4, 4, 4},                            /* cost of storing fp registers
 580                                            in SFmode, DFmode and XFmode */
 581   2,                                    /* cost of moving MMX register */
 582   {2, 2},                               /* cost of loading MMX registers
 583                                            in SImode and DImode */
 584   {2, 2},                               /* cost of storing MMX registers
 585                                            in SImode and DImode */
 586   2,                                    /* cost of moving SSE register */
 587   {2, 2, 8},                            /* cost of loading SSE registers
 588                                            in SImode, DImode and TImode */
 589   {2, 2, 8},                            /* cost of storing SSE registers
 590                                            in SImode, DImode and TImode */
 591   6,                                    /* MMX or SSE register to integer */
 592   32,                                   /* size of l1 cache.  */
 593   32,                                   /* size of l2 cache.  Some models
 594                                            have integrated l2 cache, but
 595                                            optimizing for k6 is not important
 596                                            enough to worry about that.  */
 597   32,                                   /* size of prefetch block */
 598   1,                                    /* number of parallel prefetches */
 599   1,                                    /* Branch cost */
 600   COSTS_N_INSNS (2),                    /* cost of FADD and FSUB insns.  */
 601   COSTS_N_INSNS (2),                    /* cost of FMUL instruction.  */
 602   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 603   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 604   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 605   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 606   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 607   k6_memcpy,
 608   k6_memset,
 609   1,                                    /* scalar_stmt_cost.  */
 610   1,                                    /* scalar_load_cost.  */
 611   1,                                    /* scalar_store_cost.  */
 612   1,                                    /* vec_stmt_cost.  */
 613   1,                                    /* vec_to_scalar_cost.  */
 614   1,                                    /* scalar_to_vec_cost.  */
 615   1,                                    /* vec_align_load_cost.  */
 616   2,                                    /* vec_unalign_load_cost.  */
 617   1,                                    /* vec_store_cost.  */
 618   3,                                    /* cond_taken_branch_cost.  */
 619   1,                                    /* cond_not_taken_branch_cost.  */
 620 };
 621
 622 /* For some reason, Athlon deals better with the REP prefix (relative to
 623    loops) than K8 does.  Alignment becomes important after 8 bytes for
 624    memcpy and 128 bytes for memset.  */
 625 static stringop_algs athlon_memcpy[2] = {
       /* memcpy strategy table listed in athlon_cost.  Element 0 defaults to
          libcall and carries two entries: rep_prefix_4_byte tagged 2048 and
          libcall tagged -1 (presumably size limits, with -1 meaning "no
          limit" -- confirm against the stringop_algs declaration).  Element 1
          is the DUMMY_STRINGOP_ALGS placeholder, a libcall-only table.  */
 626   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 627   DUMMY_STRINGOP_ALGS};
 628 static stringop_algs athlon_memset[2] = {
       /* memset strategy table listed in athlon_cost.  Element 0 defaults to
          libcall and carries two entries: rep_prefix_4_byte tagged 2048 and
          libcall tagged -1 (presumably size limits, with -1 meaning "no
          limit" -- confirm against the stringop_algs declaration).  Element 1
          is the DUMMY_STRINGOP_ALGS placeholder, a libcall-only table.  */
 629   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 630   DUMMY_STRINGOP_ALGS};
 631 static const
 632 struct processor_costs athlon_cost = {
       /* Cost table for athlon.  The initializer is positional, so the entry
          order must follow the field order of struct processor_costs; the
          trailing comment on each entry names the field it fills.  */
 633   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 634   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 635   COSTS_N_INSNS (1),                    /* variable shift costs */
 636   COSTS_N_INSNS (1),                    /* constant shift costs */
 637   {COSTS_N_INSNS (5),                   /* cost of starting multiply for QI */
 638    COSTS_N_INSNS (5),                   /*                               HI */
 639    COSTS_N_INSNS (5),                   /*                               SI */
 640    COSTS_N_INSNS (5),                   /*                               DI */
 641    COSTS_N_INSNS (5)},                  /*                            other */
 642   0,                                    /* cost of multiply per each bit set */
 643   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 644    COSTS_N_INSNS (26),                  /*                          HI */
 645    COSTS_N_INSNS (42),                  /*                          SI */
 646    COSTS_N_INSNS (74),                  /*                          DI */
 647    COSTS_N_INSNS (74)},                 /*                          other */
 648   COSTS_N_INSNS (1),                    /* cost of movsx */
 649   COSTS_N_INSNS (1),                    /* cost of movzx */
 650   8,                                    /* "large" insn */
 651   9,                                    /* MOVE_RATIO */
 652   4,                                 /* cost for loading QImode using movzbl */
 653   {3, 4, 3},                            /* cost of loading integer registers
 654                                            in QImode, HImode and SImode.
 655                                            Relative to reg-reg move (2).  */
 656   {3, 4, 3},                            /* cost of storing integer registers */
 657   4,                                    /* cost of reg,reg fld/fst */
 658   {4, 4, 12},                           /* cost of loading fp registers
 659                                            in SFmode, DFmode and XFmode */
 660   {6, 6, 8},                            /* cost of storing fp registers
 661                                            in SFmode, DFmode and XFmode */
 662   2,                                    /* cost of moving MMX register */
 663   {4, 4},                               /* cost of loading MMX registers
 664                                            in SImode and DImode */
 665   {4, 4},                               /* cost of storing MMX registers
 666                                            in SImode and DImode */
 667   2,                                    /* cost of moving SSE register */
 668   {4, 4, 6},                            /* cost of loading SSE registers
 669                                            in SImode, DImode and TImode */
 670   {4, 4, 5},                            /* cost of storing SSE registers
 671                                            in SImode, DImode and TImode */
 672   5,                                    /* MMX or SSE register to integer */
 673   64,                                   /* size of l1 cache.  */
 674   256,                                  /* size of l2 cache.  */
 675   64,                                   /* size of prefetch block */
 676   6,                                    /* number of parallel prefetches */
 677   5,                                    /* Branch cost */
 678   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 679   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 680   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
 681   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 682   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 683   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 684   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 685   athlon_memcpy,
 686   athlon_memset,
 687   1,                                    /* scalar_stmt_cost.  */
 688   1,                                    /* scalar_load_cost.  */
 689   1,                                    /* scalar_store_cost.  */
 690   1,                                    /* vec_stmt_cost.  */
 691   1,                                    /* vec_to_scalar_cost.  */
 692   1,                                    /* scalar_to_vec_cost.  */
 693   1,                                    /* vec_align_load_cost.  */
 694   2,                                    /* vec_unalign_load_cost.  */
 695   1,                                    /* vec_store_cost.  */
 696   3,                                    /* cond_taken_branch_cost.  */
 697   1,                                    /* cond_not_taken_branch_cost.  */
 698 };
 699
 700 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
 701    small blocks it is better to use a loop.  For large blocks, a libcall can
 702    use non-temporal accesses and beat inline code considerably.  */
 703 static stringop_algs k8_memcpy[2] = {
       /* memcpy strategy table listed in k8_cost.  Both elements default to
          libcall.  Element 0 steps loop -> unrolled_loop -> rep_prefix_4_byte;
          element 1 steps loop -> rep_prefix_8_byte -> libcall.  The leading
          number of each entry is presumably the largest block size it covers,
          with -1 as the catch-all -- confirm against stringop_algs.  */
 704   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
 705              {-1, rep_prefix_4_byte, false}}},
 706   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
 707              {-1, libcall, false}}}};
 708 static stringop_algs k8_memset[2] = {
       /* memset strategy table listed in k8_cost.  Both elements default to
          libcall.  Element 0 steps loop -> unrolled_loop -> rep_prefix_4_byte
          -> libcall; element 1 steps unrolled_loop -> rep_prefix_8_byte ->
          libcall.  The leading number of each entry is presumably the largest
          block size it covers, with -1 as the catch-all -- confirm against
          stringop_algs.  */
 709   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 710              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 711   {libcall, {{48, unrolled_loop, false},
 712              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 713 static const
 714 struct processor_costs k8_cost = {
       /* Cost table for k8.  The initializer is positional, so the entry
          order must follow the field order of struct processor_costs; the
          trailing comment on each entry names the field it fills.  */
 715   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 716   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 717   COSTS_N_INSNS (1),                    /* variable shift costs */
 718   COSTS_N_INSNS (1),                    /* constant shift costs */
 719   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 720    COSTS_N_INSNS (4),                   /*                               HI */
 721    COSTS_N_INSNS (3),                   /*                               SI */
 722    COSTS_N_INSNS (4),                   /*                               DI */
 723    COSTS_N_INSNS (5)},                  /*                            other */
 724   0,                                    /* cost of multiply per each bit set */
 725   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 726    COSTS_N_INSNS (26),                  /*                          HI */
 727    COSTS_N_INSNS (42),                  /*                          SI */
 728    COSTS_N_INSNS (74),                  /*                          DI */
 729    COSTS_N_INSNS (74)},                 /*                          other */
 730   COSTS_N_INSNS (1),                    /* cost of movsx */
 731   COSTS_N_INSNS (1),                    /* cost of movzx */
 732   8,                                    /* "large" insn */
 733   9,                                    /* MOVE_RATIO */
 734   4,                                 /* cost for loading QImode using movzbl */
 735   {3, 4, 3},                            /* cost of loading integer registers
 736                                            in QImode, HImode and SImode.
 737                                            Relative to reg-reg move (2).  */
 738   {3, 4, 3},                            /* cost of storing integer registers */
 739   4,                                    /* cost of reg,reg fld/fst */
 740   {4, 4, 12},                           /* cost of loading fp registers
 741                                            in SFmode, DFmode and XFmode */
 742   {6, 6, 8},                            /* cost of storing fp registers
 743                                            in SFmode, DFmode and XFmode */
 744   2,                                    /* cost of moving MMX register */
 745   {3, 3},                               /* cost of loading MMX registers
 746                                            in SImode and DImode */
 747   {4, 4},                               /* cost of storing MMX registers
 748                                            in SImode and DImode */
 749   2,                                    /* cost of moving SSE register */
 750   {4, 3, 6},                            /* cost of loading SSE registers
 751                                            in SImode, DImode and TImode */
 752   {4, 4, 5},                            /* cost of storing SSE registers
 753                                            in SImode, DImode and TImode */
 754   5,                                    /* MMX or SSE register to integer */
 755   64,                                   /* size of l1 cache.  */
 756   512,                                  /* size of l2 cache.  */
 757   64,                                   /* size of prefetch block */
 758   /* New AMD processors never drop prefetches; if they cannot be performed
 759      immediately, they are queued.  We set number of simultaneous prefetches
 760      to a large constant to reflect this (it probably is not a good idea not
 761      to limit number of prefetches at all, as their execution also takes some
 762      time).  */
 763   100,                                  /* number of parallel prefetches */
 764   3,                                    /* Branch cost */
 765   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 766   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 767   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
 768   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 769   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 770   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 771   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 772   k8_memcpy,
 773   k8_memset,
 774   4,                                    /* scalar_stmt_cost.  */
 775   2,                                    /* scalar_load_cost.  */
 776   2,                                    /* scalar_store_cost.  */
 777   5,                                    /* vec_stmt_cost.  */
 778   0,                                    /* vec_to_scalar_cost.  */
 779   2,                                    /* scalar_to_vec_cost.  */
 780   2,                                    /* vec_align_load_cost.  */
 781   3,                                    /* vec_unalign_load_cost.  */
 782   3,                                    /* vec_store_cost.  */
 783   3,                                    /* cond_taken_branch_cost.  */
 784   2,                                    /* cond_not_taken_branch_cost.  */
 785 };
 786
 787 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
 788    very small blocks it is better to use a loop.  For large blocks, a libcall
 789    can use non-temporal accesses and beat inline code considerably.  */
 790 static stringop_algs amdfam10_memcpy[2] = {
       /* memcpy strategy table listed in amdfam10_cost.  Both elements default
          to libcall.  Element 0 steps loop -> unrolled_loop ->
          rep_prefix_4_byte; element 1 steps loop -> rep_prefix_8_byte ->
          libcall.  The leading number of each entry is presumably the largest
          block size it covers, with -1 as the catch-all -- confirm against
          stringop_algs.  */
 791   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
 792              {-1, rep_prefix_4_byte, false}}},
 793   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
 794              {-1, libcall, false}}}};
 795 static stringop_algs amdfam10_memset[2] = {
       /* memset strategy table listed in amdfam10_cost.  Both elements default
          to libcall.  Element 0 steps loop -> unrolled_loop ->
          rep_prefix_4_byte -> libcall; element 1 steps unrolled_loop ->
          rep_prefix_8_byte -> libcall.  The leading number of each entry is
          presumably the largest block size it covers, with -1 as the
          catch-all -- confirm against stringop_algs.  */
 796   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 797              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 798   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
 799              {-1, libcall, false}}}};
 800 const struct processor_costs amdfam10_cost = {
       /* Cost table for amdfam10.  Declared const like the other cost tables
          in this file so the table cannot be modified after initialization.
          The initializer is positional, so the entry order must follow the
          field order of struct processor_costs; the trailing comment on each
          entry names the field it fills.  */
 801   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 802   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 803   COSTS_N_INSNS (1),                    /* variable shift costs */
 804   COSTS_N_INSNS (1),                    /* constant shift costs */
 805   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 806    COSTS_N_INSNS (4),                   /*                               HI */
 807    COSTS_N_INSNS (3),                   /*                               SI */
 808    COSTS_N_INSNS (4),                   /*                               DI */
 809    COSTS_N_INSNS (5)},                  /*                            other */
 810   0,                                    /* cost of multiply per each bit set */
 811   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
 812    COSTS_N_INSNS (35),                  /*                          HI */
 813    COSTS_N_INSNS (51),                  /*                          SI */
 814    COSTS_N_INSNS (83),                  /*                          DI */
 815    COSTS_N_INSNS (83)},                 /*                          other */
 816   COSTS_N_INSNS (1),                    /* cost of movsx */
 817   COSTS_N_INSNS (1),                    /* cost of movzx */
 818   8,                                    /* "large" insn */
 819   9,                                    /* MOVE_RATIO */
 820   4,                                 /* cost for loading QImode using movzbl */
 821   {3, 4, 3},                            /* cost of loading integer registers
 822                                            in QImode, HImode and SImode.
 823                                            Relative to reg-reg move (2).  */
 824   {3, 4, 3},                            /* cost of storing integer registers */
 825   4,                                    /* cost of reg,reg fld/fst */
 826   {4, 4, 12},                           /* cost of loading fp registers
 827                                            in SFmode, DFmode and XFmode */
 828   {6, 6, 8},                            /* cost of storing fp registers
 829                                            in SFmode, DFmode and XFmode */
 830   2,                                    /* cost of moving MMX register */
 831   {3, 3},                               /* cost of loading MMX registers
 832                                            in SImode and DImode */
 833   {4, 4},                               /* cost of storing MMX registers
 834                                            in SImode and DImode */
 835   2,                                    /* cost of moving SSE register */
 836   {4, 4, 3},                            /* cost of loading SSE registers
 837                                            in SImode, DImode and TImode */
 838   {4, 4, 5},                            /* cost of storing SSE registers
 839                                            in SImode, DImode and TImode */
 840   3,                                    /* MMX or SSE register to integer */
 841                                         /* On K8:
 842                                             MOVD reg64, xmmreg Double FSTORE 4
 843                                             MOVD reg32, xmmreg Double FSTORE 4
 844                                            On AMDFAM10:
 845                                             MOVD reg64, xmmreg Double FADD 3
 846                                                                1/1  1/1
 847                                             MOVD reg32, xmmreg Double FADD 3
 848                                                                1/1  1/1 */
 849   64,                                   /* size of l1 cache.  */
 850   512,                                  /* size of l2 cache.  */
 851   64,                                   /* size of prefetch block */
 852   /* New AMD processors never drop prefetches; if they cannot be performed
 853      immediately, they are queued.  We set number of simultaneous prefetches
 854      to a large constant to reflect this (it probably is not a good idea not
 855      to limit number of prefetches at all, as their execution also takes some
 856      time).  */
 857   100,                                  /* number of parallel prefetches */
 858   2,                                    /* Branch cost */
 859   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 860   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 861   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
 862   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 863   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 864   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 865   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 866   amdfam10_memcpy,
 867   amdfam10_memset,
 868   4,                                    /* scalar_stmt_cost.  */
 869   2,                                    /* scalar_load_cost.  */
 870   2,                                    /* scalar_store_cost.  */
 871   6,                                    /* vec_stmt_cost.  */
 872   0,                                    /* vec_to_scalar_cost.  */
 873   2,                                    /* scalar_to_vec_cost.  */
 874   2,                                    /* vec_align_load_cost.  */
 875   2,                                    /* vec_unalign_load_cost.  */
 876   2,                                    /* vec_store_cost.  */
 877   2,                                    /* cond_taken_branch_cost.  */
 878   1,                                    /* cond_not_taken_branch_cost.  */
 879 };
 880
 881 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
 882    very small blocks it is better to use a loop.  For large blocks, a libcall
 883    can use non-temporal accesses and beat inline code considerably.  */
 884 static stringop_algs bdver1_memcpy[2] = {
       /* memcpy strategy table for bdver1.  Both elements default to libcall.
          Element 0 steps loop -> unrolled_loop -> rep_prefix_4_byte; element 1
          steps loop -> rep_prefix_8_byte -> libcall.  The leading number of
          each entry is presumably the largest block size it covers, with -1
          as the catch-all -- confirm against stringop_algs.  */
 885   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
 886              {-1, rep_prefix_4_byte, false}}},
 887   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
 888              {-1, libcall, false}}}};
 889 static stringop_algs bdver1_memset[2] = {
       /* memset strategy table for bdver1.  Both elements default to libcall.
          Element 0 steps loop -> unrolled_loop -> rep_prefix_4_byte ->
          libcall; element 1 steps unrolled_loop -> rep_prefix_8_byte ->
          libcall.  The leading number of each entry is presumably the largest
          block size it covers, with -1 as the catch-all -- confirm against
          stringop_algs.  */
 890   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 891              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 892   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
 893              {-1, libcall, false}}}};
 894
 895 const struct processor_costs bdver1_cost = {  /* Positional cost table for bdver1; each entry is named by its trailing comment.  */
 896   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 897   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 898   COSTS_N_INSNS (1),                    /* variable shift costs */
 899   COSTS_N_INSNS (1),                    /* constant shift costs */
 900   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
 901    COSTS_N_INSNS (4),                   /*                               HI */
 902    COSTS_N_INSNS (4),                   /*                               SI */
 903    COSTS_N_INSNS (6),                   /*                               DI */
 904    COSTS_N_INSNS (6)},                  /*                            other */
 905   0,                                    /* cost of multiply per each bit set */
 906   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
 907    COSTS_N_INSNS (35),                  /*                          HI */
 908    COSTS_N_INSNS (51),                  /*                          SI */
 909    COSTS_N_INSNS (83),                  /*                          DI */
 910    COSTS_N_INSNS (83)},                 /*                          other */
 911   COSTS_N_INSNS (1),                    /* cost of movsx */
 912   COSTS_N_INSNS (1),                    /* cost of movzx */
 913   8,                                    /* "large" insn */
 914   9,                                    /* MOVE_RATIO */
 915   4,                                 /* cost for loading QImode using movzbl */
 916   {5, 5, 4},                            /* cost of loading integer registers
 917                                            in QImode, HImode and SImode.
 918                                            Relative to reg-reg move (2).  */
 919   {4, 4, 4},                            /* cost of storing integer registers */
 920   2,                                    /* cost of reg,reg fld/fst */
 921   {5, 5, 12},                           /* cost of loading fp registers
 922                                            in SFmode, DFmode and XFmode */
 923   {4, 4, 8},                            /* cost of storing fp registers
 924                                            in SFmode, DFmode and XFmode */
 925   2,                                    /* cost of moving MMX register */
 926   {4, 4},                               /* cost of loading MMX registers
 927                                            in SImode and DImode */
 928   {4, 4},                               /* cost of storing MMX registers
 929                                            in SImode and DImode */
 930   2,                                    /* cost of moving SSE register */
 931   {4, 4, 4},                            /* cost of loading SSE registers
 932                                            in SImode, DImode and TImode */
 933   {4, 4, 4},                            /* cost of storing SSE registers
 934                                            in SImode, DImode and TImode */
 935   2,                                    /* MMX or SSE register to integer */
 936                                         /* NOTE(review): the figures below are quoted for K8 and AMDFAM10, not BDVER1 -- confirm.  On K8:
 937                                             MOVD reg64, xmmreg Double FSTORE 4
 938                                             MOVD reg32, xmmreg Double FSTORE 4
 939                                            On AMDFAM10:
 940                                             MOVD reg64, xmmreg Double FADD 3
 941                                                                1/1  1/1
 942                                             MOVD reg32, xmmreg Double FADD 3
 943                                                                1/1  1/1 */
 944   16,                                   /* size of l1 cache.  */
 945   2048,                                 /* size of l2 cache.  */
 946   64,                                   /* size of prefetch block */
 947   /* New AMD processors never drop prefetches; if they cannot be performed
 948      immediately, they are queued.  We set the number of simultaneous
 949      prefetches to a large constant to reflect this (leaving the number of
 950      prefetches completely unlimited is probably not a good idea, as their
 951      execution also takes some time).  */
 952   100,                                  /* number of parallel prefetches */
 953   2,                                    /* Branch cost */
 954   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
 955   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
 956   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
 957   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 958   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 959   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
 960   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 961   bdver1_memcpy,
 962   bdver1_memset,
 963   6,                                    /* scalar_stmt_cost.  */
 964   4,                                    /* scalar_load_cost.  */
 965   4,                                    /* scalar_store_cost.  */
 966   6,                                    /* vec_stmt_cost.  */
 967   0,                                    /* vec_to_scalar_cost.  */
 968   2,                                    /* scalar_to_vec_cost.  */
 969   4,                                    /* vec_align_load_cost.  */
 970   4,                                    /* vec_unalign_load_cost.  */
 971   4,                                    /* vec_store_cost.  */
 972   4,                                    /* cond_taken_branch_cost.  */
 973   2,                                    /* cond_not_taken_branch_cost.  */
 974 };
 975
 976 /*  BDVER2 has an optimized REP instruction for medium-sized blocks, but
 977     for very small blocks it is better to use a loop.  For large blocks, a
 978     libcall can do non-temporal accesses and beat inline code considerably.  */
 979
 980 static stringop_algs bdver2_memcpy[2] = {  /* memcpy strategy table referenced by bdver2_cost.  */
 981   {libcall, {{6, loop, false}, {14, unrolled_loop, false},  /* presumably {max size, algorithm, flag} triples -- confirm against stringop_algs */
 982              {-1, rep_prefix_4_byte, false}}},  /* [0]: presumably the 32-bit variant; -1 presumably means "no upper bound" -- confirm */
 983   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
 984              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- confirm */
 985 static stringop_algs bdver2_memset[2] = {  /* memset strategy table referenced by bdver2_cost.  */
 986   {libcall, {{8, loop, false}, {24, unrolled_loop, false},  /* presumably {max size, algorithm, flag} triples -- confirm against stringop_algs */
 987              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* [0]: presumably the 32-bit variant -- confirm */
 988   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
 989              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- confirm */
 990
 991 const struct processor_costs bdver2_cost = {  /* Positional cost table for bdver2; each entry is named by its trailing comment.  */
 992   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 993   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 994   COSTS_N_INSNS (1),                    /* variable shift costs */
 995   COSTS_N_INSNS (1),                    /* constant shift costs */
 996   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
 997    COSTS_N_INSNS (4),                   /*                               HI */
 998    COSTS_N_INSNS (4),                   /*                               SI */
 999    COSTS_N_INSNS (6),                   /*                               DI */
1000    COSTS_N_INSNS (6)},                  /*                            other */
1001   0,                                    /* cost of multiply per each bit set */
1002   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1003    COSTS_N_INSNS (35),                  /*                          HI */
1004    COSTS_N_INSNS (51),                  /*                          SI */
1005    COSTS_N_INSNS (83),                  /*                          DI */
1006    COSTS_N_INSNS (83)},                 /*                          other */
1007   COSTS_N_INSNS (1),                    /* cost of movsx */
1008   COSTS_N_INSNS (1),                    /* cost of movzx */
1009   8,                                    /* "large" insn */
1010   9,                                    /* MOVE_RATIO */
1011   4,                                 /* cost for loading QImode using movzbl */
1012   {5, 5, 4},                            /* cost of loading integer registers
1013                                            in QImode, HImode and SImode.
1014                                            Relative to reg-reg move (2).  */
1015   {4, 4, 4},                            /* cost of storing integer registers */
1016   2,                                    /* cost of reg,reg fld/fst */
1017   {5, 5, 12},                           /* cost of loading fp registers
1018                                            in SFmode, DFmode and XFmode */
1019   {4, 4, 8},                            /* cost of storing fp registers
1020                                            in SFmode, DFmode and XFmode */
1021   2,                                    /* cost of moving MMX register */
1022   {4, 4},                               /* cost of loading MMX registers
1023                                            in SImode and DImode */
1024   {4, 4},                               /* cost of storing MMX registers
1025                                            in SImode and DImode */
1026   2,                                    /* cost of moving SSE register */
1027   {4, 4, 4},                            /* cost of loading SSE registers
1028                                            in SImode, DImode and TImode */
1029   {4, 4, 4},                            /* cost of storing SSE registers
1030                                            in SImode, DImode and TImode */
1031   2,                                    /* MMX or SSE register to integer */
1032                                         /* NOTE(review): the figures below are quoted for K8 and AMDFAM10, not BDVER2 -- confirm.  On K8:
1033                                             MOVD reg64, xmmreg Double FSTORE 4
1034                                             MOVD reg32, xmmreg Double FSTORE 4
1035                                            On AMDFAM10:
1036                                             MOVD reg64, xmmreg Double FADD 3
1037                                                                1/1  1/1
1038                                             MOVD reg32, xmmreg Double FADD 3
1039                                                                1/1  1/1 */
1040   16,                                   /* size of l1 cache.  */
1041   2048,                                 /* size of l2 cache.  */
1042   64,                                   /* size of prefetch block */
1043   /* New AMD processors never drop prefetches; if they cannot be performed
1044      immediately, they are queued.  We set the number of simultaneous
1045      prefetches to a large constant to reflect this (leaving the number of
1046      prefetches completely unlimited is probably not a good idea, as their
1047      execution also takes some time).  */
1048   100,                                  /* number of parallel prefetches */
1049   2,                                    /* Branch cost */
1050   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1051   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1052   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1053   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1054   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1055   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1056   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1057   bdver2_memcpy,
1058   bdver2_memset,
1059   6,                                    /* scalar_stmt_cost.  */
1060   4,                                    /* scalar_load_cost.  */
1061   4,                                    /* scalar_store_cost.  */
1062   6,                                    /* vec_stmt_cost.  */
1063   0,                                    /* vec_to_scalar_cost.  */
1064   2,                                    /* scalar_to_vec_cost.  */
1065   4,                                    /* vec_align_load_cost.  */
1066   4,                                    /* vec_unalign_load_cost.  */
1067   4,                                    /* vec_store_cost.  */
1068   4,                                    /* cond_taken_branch_cost.  */
1069   2,                                    /* cond_not_taken_branch_cost.  */
1070 };
1071
1072
1073   /*  BDVER3 has an optimized REP instruction for medium-sized blocks, but
1074       for very small blocks it is better to use a loop.  For large blocks, a
1075       libcall can do non-temporal accesses and beat inline code considerably.  */
1076 static stringop_algs bdver3_memcpy[2] = {  /* memcpy strategy table referenced by bdver3_cost.  */
1077   {libcall, {{6, loop, false}, {14, unrolled_loop, false},  /* presumably {max size, algorithm, flag} triples -- confirm against stringop_algs */
1078              {-1, rep_prefix_4_byte, false}}},  /* [0]: presumably the 32-bit variant; -1 presumably means "no upper bound" -- confirm */
1079   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1080              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- confirm */
1081 static stringop_algs bdver3_memset[2] = {  /* memset strategy table referenced by bdver3_cost.  */
1082   {libcall, {{8, loop, false}, {24, unrolled_loop, false},  /* presumably {max size, algorithm, flag} triples -- confirm against stringop_algs */
1083              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* [0]: presumably the 32-bit variant -- confirm */
1084   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1085              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- confirm */
1086 struct processor_costs bdver3_cost = {  /* Positional cost table for bdver3.  NOTE(review): not const-qualified, unlike bdver1_cost/bdver2_cost -- confirm this is intentional.  */
1087   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1088   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1089   COSTS_N_INSNS (1),                    /* variable shift costs */
1090   COSTS_N_INSNS (1),                    /* constant shift costs */
1091   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1092    COSTS_N_INSNS (4),                   /*                               HI */
1093    COSTS_N_INSNS (4),                   /*                               SI */
1094    COSTS_N_INSNS (6),                   /*                               DI */
1095    COSTS_N_INSNS (6)},                  /*                            other */
1096   0,                                    /* cost of multiply per each bit set */
1097   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1098    COSTS_N_INSNS (35),                  /*                          HI */
1099    COSTS_N_INSNS (51),                  /*                          SI */
1100    COSTS_N_INSNS (83),                  /*                          DI */
1101    COSTS_N_INSNS (83)},                 /*                          other */
1102   COSTS_N_INSNS (1),                    /* cost of movsx */
1103   COSTS_N_INSNS (1),                    /* cost of movzx */
1104   8,                                    /* "large" insn */
1105   9,                                    /* MOVE_RATIO */
1106   4,                                 /* cost for loading QImode using movzbl */
1107   {5, 5, 4},                            /* cost of loading integer registers
1108                                            in QImode, HImode and SImode.
1109                                            Relative to reg-reg move (2).  */
1110   {4, 4, 4},                            /* cost of storing integer registers */
1111   2,                                    /* cost of reg,reg fld/fst */
1112   {5, 5, 12},                           /* cost of loading fp registers
1113                                            in SFmode, DFmode and XFmode */
1114   {4, 4, 8},                            /* cost of storing fp registers
1115                                            in SFmode, DFmode and XFmode */
1116   2,                                    /* cost of moving MMX register */
1117   {4, 4},                               /* cost of loading MMX registers
1118                                            in SImode and DImode */
1119   {4, 4},                               /* cost of storing MMX registers
1120                                            in SImode and DImode */
1121   2,                                    /* cost of moving SSE register */
1122   {4, 4, 4},                            /* cost of loading SSE registers
1123                                            in SImode, DImode and TImode */
1124   {4, 4, 4},                            /* cost of storing SSE registers
1125                                            in SImode, DImode and TImode */
1126   2,                                    /* MMX or SSE register to integer */
1127   16,                                   /* size of l1 cache.  */
1128   2048,                                 /* size of l2 cache.  */
1129   64,                                   /* size of prefetch block */
1130   /* New AMD processors never drop prefetches; if they cannot be performed
1131      immediately, they are queued.  We set the number of simultaneous
1132      prefetches to a large constant to reflect this (leaving the number of
1133      prefetches completely unlimited is probably not a good idea, as their
1134      execution also takes some time).  */
1135   100,                                  /* number of parallel prefetches */
1136   2,                                    /* Branch cost */
1137   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1138   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1139   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1140   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1141   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1142   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1143   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1144   bdver3_memcpy,
1145   bdver3_memset,
1146   6,                                    /* scalar_stmt_cost.  */
1147   4,                                    /* scalar_load_cost.  */
1148   4,                                    /* scalar_store_cost.  */
1149   6,                                    /* vec_stmt_cost.  */
1150   0,                                    /* vec_to_scalar_cost.  */
1151   2,                                    /* scalar_to_vec_cost.  */
1152   4,                                    /* vec_align_load_cost.  */
1153   4,                                    /* vec_unalign_load_cost.  */
1154   4,                                    /* vec_store_cost.  */
1155   4,                                    /* cond_taken_branch_cost.  */
1156   2,                                    /* cond_not_taken_branch_cost.  */
1157 };
1158
1159 /*  BDVER4 has an optimized REP instruction for medium-sized blocks, but
1160     for very small blocks it is better to use a loop.  For large blocks, a
1161     libcall can do non-temporal accesses and beat inline code considerably.  */
1162 static stringop_algs bdver4_memcpy[2] = {  /* memcpy strategy table referenced by bdver4_cost.  */
1163   {libcall, {{6, loop, false}, {14, unrolled_loop, false},  /* presumably {max size, algorithm, flag} triples -- confirm against stringop_algs */
1164              {-1, rep_prefix_4_byte, false}}},  /* [0]: presumably the 32-bit variant; -1 presumably means "no upper bound" -- confirm */
1165   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1166              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- confirm */
1167 static stringop_algs bdver4_memset[2] = {  /* memset strategy table referenced by bdver4_cost.  */
1168   {libcall, {{8, loop, false}, {24, unrolled_loop, false},  /* presumably {max size, algorithm, flag} triples -- confirm against stringop_algs */
1169              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* [0]: presumably the 32-bit variant -- confirm */
1170   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1171              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- confirm */
1172 struct processor_costs bdver4_cost = {  /* Positional cost table for bdver4.  NOTE(review): not const-qualified, unlike bdver1_cost/bdver2_cost -- confirm this is intentional.  */
1173   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1174   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1175   COSTS_N_INSNS (1),                    /* variable shift costs */
1176   COSTS_N_INSNS (1),                    /* constant shift costs */
1177   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1178    COSTS_N_INSNS (4),                   /*                               HI */
1179    COSTS_N_INSNS (4),                   /*                               SI */
1180    COSTS_N_INSNS (6),                   /*                               DI */
1181    COSTS_N_INSNS (6)},                  /*                            other */
1182   0,                                    /* cost of multiply per each bit set */
1183   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1184    COSTS_N_INSNS (35),                  /*                          HI */
1185    COSTS_N_INSNS (51),                  /*                          SI */
1186    COSTS_N_INSNS (83),                  /*                          DI */
1187    COSTS_N_INSNS (83)},                 /*                          other */
1188   COSTS_N_INSNS (1),                    /* cost of movsx */
1189   COSTS_N_INSNS (1),                    /* cost of movzx */
1190   8,                                    /* "large" insn */
1191   9,                                    /* MOVE_RATIO */
1192   4,                                 /* cost for loading QImode using movzbl */
1193   {5, 5, 4},                            /* cost of loading integer registers
1194                                            in QImode, HImode and SImode.
1195                                            Relative to reg-reg move (2).  */
1196   {4, 4, 4},                            /* cost of storing integer registers */
1197   2,                                    /* cost of reg,reg fld/fst */
1198   {5, 5, 12},                           /* cost of loading fp registers
1199                                            in SFmode, DFmode and XFmode */
1200   {4, 4, 8},                            /* cost of storing fp registers
1201                                            in SFmode, DFmode and XFmode */
1202   2,                                    /* cost of moving MMX register */
1203   {4, 4},                               /* cost of loading MMX registers
1204                                            in SImode and DImode */
1205   {4, 4},                               /* cost of storing MMX registers
1206                                            in SImode and DImode */
1207   2,                                    /* cost of moving SSE register */
1208   {4, 4, 4},                            /* cost of loading SSE registers
1209                                            in SImode, DImode and TImode */
1210   {4, 4, 4},                            /* cost of storing SSE registers
1211                                            in SImode, DImode and TImode */
1212   2,                                    /* MMX or SSE register to integer */
1213   16,                                   /* size of l1 cache.  */
1214   2048,                                 /* size of l2 cache.  */
1215   64,                                   /* size of prefetch block */
1216   /* New AMD processors never drop prefetches; if they cannot be performed
1217      immediately, they are queued.  We set the number of simultaneous
1218      prefetches to a large constant to reflect this (leaving the number of
1219      prefetches completely unlimited is probably not a good idea, as their
1220      execution also takes some time).  */
1221   100,                                  /* number of parallel prefetches */
1222   2,                                    /* Branch cost */
1223   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1224   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1225   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1226   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1227   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1228   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1229   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1230   bdver4_memcpy,
1231   bdver4_memset,
1232   6,                                    /* scalar_stmt_cost.  */
1233   4,                                    /* scalar_load_cost.  */
1234   4,                                    /* scalar_store_cost.  */
1235   6,                                    /* vec_stmt_cost.  */
1236   0,                                    /* vec_to_scalar_cost.  */
1237   2,                                    /* scalar_to_vec_cost.  */
1238   4,                                    /* vec_align_load_cost.  */
1239   4,                                    /* vec_unalign_load_cost.  */
1240   4,                                    /* vec_store_cost.  */
1241   4,                                    /* cond_taken_branch_cost.  */
1242   2,                                    /* cond_not_taken_branch_cost.  */
1243 };
1244
1245
1246 /*  ZNVER1 has an optimized REP instruction for medium-sized blocks, but
1247     for very small blocks it is better to use a loop.  For large blocks, a
1248     libcall can do non-temporal accesses and beat inline code considerably.  */
1249 static stringop_algs znver1_memcpy[2] = {  /* memcpy strategy table referenced by znver1_cost.  */
1250   {libcall, {{6, loop, false}, {14, unrolled_loop, false},  /* presumably {max size, algorithm, flag} triples -- confirm against stringop_algs */
1251              {-1, rep_prefix_4_byte, false}}},  /* [0]: presumably the 32-bit variant; -1 presumably means "no upper bound" -- confirm */
1252   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1253              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- confirm */
1254 static stringop_algs znver1_memset[2] = {  /* memset strategy table referenced by znver1_cost.  */
1255   {libcall, {{8, loop, false}, {24, unrolled_loop, false},  /* presumably {max size, algorithm, flag} triples -- confirm against stringop_algs */
1256              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* [0]: presumably the 32-bit variant -- confirm */
1257   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1258              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- confirm */
1259 struct processor_costs znver1_cost = {  /* Positional cost table for znver1.  NOTE(review): not const-qualified, unlike bdver1_cost/bdver2_cost -- confirm this is intentional.  */
1260   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1261   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1262   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1263   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1264   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1265    COSTS_N_INSNS (3),                   /*                               HI.  */
1266    COSTS_N_INSNS (3),                   /*                               SI.  */
1267    COSTS_N_INSNS (4),                   /*                               DI.  */
1268    COSTS_N_INSNS (4)},                  /*                            other.  */
1269   0,                                    /* cost of multiply per each bit
1270                                             set.  */
1271   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI.  */
1272    COSTS_N_INSNS (35),                  /*                          HI.  */
1273    COSTS_N_INSNS (51),                  /*                          SI.  */
1274    COSTS_N_INSNS (83),                  /*                          DI.  */
1275    COSTS_N_INSNS (83)},                 /*                          other.  */
1276   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1277   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1278   8,                                    /* "large" insn.  */
1279   9,                                    /* MOVE_RATIO.  */
1280   4,                                    /* cost for loading QImode using
1281                                            movzbl.  */
1282   {5, 5, 4},                            /* cost of loading integer registers
1283                                            in QImode, HImode and SImode.
1284                                            Relative to reg-reg move (2).  */
1285   {4, 4, 4},                            /* cost of storing integer
1286                                            registers.  */
1287   2,                                    /* cost of reg,reg fld/fst.  */
1288   {5, 5, 12},                           /* cost of loading fp registers
1289                                            in SFmode, DFmode and XFmode.  */
1290   {4, 4, 8},                            /* cost of storing fp registers
1291                                            in SFmode, DFmode and XFmode.  */
1292   2,                                    /* cost of moving MMX register.  */
1293   {4, 4},                               /* cost of loading MMX registers
1294                                            in SImode and DImode.  */
1295   {4, 4},                               /* cost of storing MMX registers
1296                                            in SImode and DImode.  */
1297   2,                                    /* cost of moving SSE register.  */
1298   {4, 4, 4},                            /* cost of loading SSE registers
1299                                            in SImode, DImode and TImode.  */
1300   {4, 4, 4},                            /* cost of storing SSE registers
1301                                            in SImode, DImode and TImode.  */
1302   2,                                    /* MMX or SSE register to integer.  */
1303   32,                                   /* size of l1 cache.  */
1304   512,                                  /* size of l2 cache.  */
1305   64,                                   /* size of prefetch block.  */
1306   /* New AMD processors never drop prefetches; if they cannot be performed
1307      immediately, they are queued.  We set the number of simultaneous
1308      prefetches to a large constant to reflect this (leaving the number of
1309      prefetches completely unlimited is probably not a good idea, as their
1310      execution also takes some time).  */
1311   100,                                  /* number of parallel prefetches.  */
1312   3,                                    /* Branch cost.  */
1313   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1314   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1315   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1316   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1317   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1318   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1319   /* Zen can execute 4 integer operations per cycle.  FP operations take 3 cycles
1320      and it can execute 2 integer additions and 2 multiplications, thus
1321      reassociation may make sense up to a width of 6.  SPEC2k6 benchmarks suggest
1322      that 4 works better than 6, probably due to register pressure.
1323
1324      Integer vector operations are taken by the FP unit and execute 3 vector
1325      plus/minus operations per cycle but only one multiply.  This is adjusted
1326      in ix86_reassociation_width.  */
1327   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1328   znver1_memcpy,
1329   znver1_memset,
1330   6,                                    /* scalar_stmt_cost.  */
1331   4,                                    /* scalar_load_cost.  */
1332   4,                                    /* scalar_store_cost.  */
1333   6,                                    /* vec_stmt_cost.  */
1334   0,                                    /* vec_to_scalar_cost.  */
1335   2,                                    /* scalar_to_vec_cost.  */
1336   4,                                    /* vec_align_load_cost.  */
1337   4,                                    /* vec_unalign_load_cost.  */
1338   4,                                    /* vec_store_cost.  */
1339   4,                                    /* cond_taken_branch_cost.  */
1340   2,                                    /* cond_not_taken_branch_cost.  */
1341 };
1342
1343   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1344      very small blocks it is better to use a loop.  For large blocks, a libcall
1345      can do non-temporal accesses and beat inline code considerably.  */
1346 static stringop_algs btver1_memcpy[2] = {  /* memcpy strategy table for btver1.  */
1347   {libcall, {{6, loop, false}, {14, unrolled_loop, false},  /* presumably {max size, algorithm, flag} triples -- confirm against stringop_algs */
1348              {-1, rep_prefix_4_byte, false}}},  /* [0]: presumably the 32-bit variant; -1 presumably means "no upper bound" -- confirm */
1349   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1350              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- confirm */
1351 static stringop_algs btver1_memset[2] = {  /* memset strategy table for btver1.  */
1352   {libcall, {{8, loop, false}, {24, unrolled_loop, false},  /* presumably {max size, algorithm, flag} triples -- confirm against stringop_algs */
1353              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* [0]: presumably the 32-bit variant -- confirm */
1354   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1355              {-1, libcall, false}}}};  /* [1]: presumably the 64-bit variant -- confirm */
1356 const struct processor_costs btver1_cost = {  /* btver1 tuning costs; positional, so order matters.  */
1357   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1358   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1359   COSTS_N_INSNS (1),                    /* variable shift costs */
1360   COSTS_N_INSNS (1),                    /* constant shift costs */
1361   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1362    COSTS_N_INSNS (4),                   /*                               HI */
1363    COSTS_N_INSNS (3),                   /*                               SI */
1364    COSTS_N_INSNS (4),                   /*                               DI */
1365    COSTS_N_INSNS (5)},                  /*                            other */
1366   0,                                    /* cost of multiply per each bit set */
1367   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1368    COSTS_N_INSNS (35),                  /*                          HI */
1369    COSTS_N_INSNS (51),                  /*                          SI */
1370    COSTS_N_INSNS (83),                  /*                          DI */
1371    COSTS_N_INSNS (83)},                 /*                          other */
1372   COSTS_N_INSNS (1),                    /* cost of movsx */
1373   COSTS_N_INSNS (1),                    /* cost of movzx */
1374   8,                                    /* "large" insn */
1375   9,                                    /* MOVE_RATIO */
1376   4,                                 /* cost for loading QImode using movzbl */
1377   {3, 4, 3},                            /* cost of loading integer registers
1378                                            in QImode, HImode and SImode.
1379                                            Relative to reg-reg move (2).  */
1380   {3, 4, 3},                            /* cost of storing integer registers */
1381   4,                                    /* cost of reg,reg fld/fst */
1382   {4, 4, 12},                           /* cost of loading fp registers
1383                                            in SFmode, DFmode and XFmode */
1384   {6, 6, 8},                            /* cost of storing fp registers
1385                                            in SFmode, DFmode and XFmode */
1386   2,                                    /* cost of moving MMX register */
1387   {3, 3},                               /* cost of loading MMX registers
1388                                            in SImode and DImode */
1389   {4, 4},                               /* cost of storing MMX registers
1390                                            in SImode and DImode */
1391   2,                                    /* cost of moving SSE register */
1392   {4, 4, 3},                            /* cost of loading SSE registers
1393                                            in SImode, DImode and TImode */
1394   {4, 4, 5},                            /* cost of storing SSE registers
1395                                            in SImode, DImode and TImode */
1396   3,                                    /* MMX or SSE register to integer */
1397                                         /* On K8:
1398                                            MOVD reg64, xmmreg Double FSTORE 4
1399                                            MOVD reg32, xmmreg Double FSTORE 4
1400                                            On AMDFAM10:
1401                                            MOVD reg64, xmmreg Double FADD 3
1402                                                                1/1  1/1
1403                                            MOVD reg32, xmmreg Double FADD 3
1404                                                                1/1  1/1.  NOTE(review): these figures name K8 and AMDFAM10, not btver1 -- confirm they apply here.  */
1405   32,                                   /* size of l1 cache.  */
1406   512,                                  /* size of l2 cache.  */
1407   64,                                   /* size of prefetch block */
1408   100,                                  /* number of parallel prefetches */
1409   2,                                    /* Branch cost */
1410   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1411   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1412   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1413   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1414   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1415   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1416   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1417   btver1_memcpy,                        /* memcpy strategy table.  */
1418   btver1_memset,                        /* memset strategy table.  */
1419   4,                                    /* scalar_stmt_cost.  */
1420   2,                                    /* scalar_load_cost.  */
1421   2,                                    /* scalar_store_cost.  */
1422   6,                                    /* vec_stmt_cost.  */
1423   0,                                    /* vec_to_scalar_cost.  */
1424   2,                                    /* scalar_to_vec_cost.  */
1425   2,                                    /* vec_align_load_cost.  */
1426   2,                                    /* vec_unalign_load_cost.  */
1427   2,                                    /* vec_store_cost.  */
1428   2,                                    /* cond_taken_branch_cost.  */
1429   1,                                    /* cond_not_taken_branch_cost.  */
1430 };  /* end of btver1_cost */
1431
1432 static stringop_algs btver2_memcpy[2] = {  /* memcpy strategy table referenced by btver2_cost.  */
1433   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1434              {-1, rep_prefix_4_byte, false}}},
1435   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1436              {-1, libcall, false}}}};  /* NOTE(review): rows presumably = 32-bit, 64-bit code; confirm.  */
1437 static stringop_algs btver2_memset[2] = {  /* memset strategy table referenced by btver2_cost.  */
1438   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1439              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1440   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1441              {-1, libcall, false}}}};  /* Same rows as btver1_memset.  */
1442 const struct processor_costs btver2_cost = {  /* btver2 tuning costs; positional, so order matters.  */
1443   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1444   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1445   COSTS_N_INSNS (1),                    /* variable shift costs */
1446   COSTS_N_INSNS (1),                    /* constant shift costs */
1447   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1448    COSTS_N_INSNS (4),                   /*                               HI */
1449    COSTS_N_INSNS (3),                   /*                               SI */
1450    COSTS_N_INSNS (4),                   /*                               DI */
1451    COSTS_N_INSNS (5)},                  /*                            other */
1452   0,                                    /* cost of multiply per each bit set */
1453   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1454    COSTS_N_INSNS (35),                  /*                          HI */
1455    COSTS_N_INSNS (51),                  /*                          SI */
1456    COSTS_N_INSNS (83),                  /*                          DI */
1457    COSTS_N_INSNS (83)},                 /*                          other */
1458   COSTS_N_INSNS (1),                    /* cost of movsx */
1459   COSTS_N_INSNS (1),                    /* cost of movzx */
1460   8,                                    /* "large" insn */
1461   9,                                    /* MOVE_RATIO */
1462   4,                                 /* cost for loading QImode using movzbl */
1463   {3, 4, 3},                            /* cost of loading integer registers
1464                                            in QImode, HImode and SImode.
1465                                            Relative to reg-reg move (2).  */
1466   {3, 4, 3},                            /* cost of storing integer registers */
1467   4,                                    /* cost of reg,reg fld/fst */
1468   {4, 4, 12},                           /* cost of loading fp registers
1469                                            in SFmode, DFmode and XFmode */
1470   {6, 6, 8},                            /* cost of storing fp registers
1471                                            in SFmode, DFmode and XFmode */
1472   2,                                    /* cost of moving MMX register */
1473   {3, 3},                               /* cost of loading MMX registers
1474                                            in SImode and DImode */
1475   {4, 4},                               /* cost of storing MMX registers
1476                                            in SImode and DImode */
1477   2,                                    /* cost of moving SSE register */
1478   {4, 4, 3},                            /* cost of loading SSE registers
1479                                            in SImode, DImode and TImode */
1480   {4, 4, 5},                            /* cost of storing SSE registers
1481                                            in SImode, DImode and TImode */
1482   3,                                    /* MMX or SSE register to integer */
1483                                         /* On K8:
1484                                            MOVD reg64, xmmreg Double FSTORE 4
1485                                            MOVD reg32, xmmreg Double FSTORE 4
1486                                            On AMDFAM10:
1487                                            MOVD reg64, xmmreg Double FADD 3
1488                                                                1/1  1/1
1489                                            MOVD reg32, xmmreg Double FADD 3
1490                                                                1/1  1/1.  NOTE(review): these figures name K8 and AMDFAM10, not btver2 -- confirm they apply here.  */
1491   32,                                   /* size of l1 cache.  */
1492   2048,                                 /* size of l2 cache.  */
1493   64,                                   /* size of prefetch block */
1494   100,                                  /* number of parallel prefetches */
1495   2,                                    /* Branch cost */
1496   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1497   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1498   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1499   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1500   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1501   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1502   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1503   btver2_memcpy,                        /* memcpy strategy table.  */
1504   btver2_memset,                        /* memset strategy table.  */
1505   4,                                    /* scalar_stmt_cost.  */
1506   2,                                    /* scalar_load_cost.  */
1507   2,                                    /* scalar_store_cost.  */
1508   6,                                    /* vec_stmt_cost.  */
1509   0,                                    /* vec_to_scalar_cost.  */
1510   2,                                    /* scalar_to_vec_cost.  */
1511   2,                                    /* vec_align_load_cost.  */
1512   2,                                    /* vec_unalign_load_cost.  */
1513   2,                                    /* vec_store_cost.  */
1514   2,                                    /* cond_taken_branch_cost.  */
1515   1,                                    /* cond_not_taken_branch_cost.  */
1516 };  /* end of btver2_cost */
1517
1518 static stringop_algs pentium4_memcpy[2] = {  /* memcpy strategy table referenced by pentium4_cost.  */
1519   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1520   DUMMY_STRINGOP_ALGS};  /* Second row is the libcall-only placeholder macro.  */
1521 static stringop_algs pentium4_memset[2] = {  /* memset strategy table referenced by pentium4_cost.  */
1522   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1523              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1524   DUMMY_STRINGOP_ALGS};  /* Second row is the libcall-only placeholder macro.  */
1525
1526 static const
1527 struct processor_costs pentium4_cost = {  /* pentium4 tuning costs; positional, so order matters.  */
1528   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1529   COSTS_N_INSNS (3),                    /* cost of a lea instruction */
1530   COSTS_N_INSNS (4),                    /* variable shift costs */
1531   COSTS_N_INSNS (4),                    /* constant shift costs */
1532   {COSTS_N_INSNS (15),                  /* cost of starting multiply for QI */
1533    COSTS_N_INSNS (15),                  /*                               HI */
1534    COSTS_N_INSNS (15),                  /*                               SI */
1535    COSTS_N_INSNS (15),                  /*                               DI */
1536    COSTS_N_INSNS (15)},                 /*                            other */
1537   0,                                    /* cost of multiply per each bit set */
1538   {COSTS_N_INSNS (56),                  /* cost of a divide/mod for QI */
1539    COSTS_N_INSNS (56),                  /*                          HI */
1540    COSTS_N_INSNS (56),                  /*                          SI */
1541    COSTS_N_INSNS (56),                  /*                          DI */
1542    COSTS_N_INSNS (56)},                 /*                          other */
1543   COSTS_N_INSNS (1),                    /* cost of movsx */
1544   COSTS_N_INSNS (1),                    /* cost of movzx */
1545   16,                                   /* "large" insn */
1546   6,                                    /* MOVE_RATIO */
1547   2,                                 /* cost for loading QImode using movzbl */
1548   {4, 5, 4},                            /* cost of loading integer registers
1549                                            in QImode, HImode and SImode.
1550                                            Relative to reg-reg move (2).  */
1551   {2, 3, 2},                            /* cost of storing integer registers */
1552   2,                                    /* cost of reg,reg fld/fst */
1553   {2, 2, 6},                            /* cost of loading fp registers
1554                                            in SFmode, DFmode and XFmode */
1555   {4, 4, 6},                            /* cost of storing fp registers
1556                                            in SFmode, DFmode and XFmode */
1557   2,                                    /* cost of moving MMX register */
1558   {2, 2},                               /* cost of loading MMX registers
1559                                            in SImode and DImode */
1560   {2, 2},                               /* cost of storing MMX registers
1561                                            in SImode and DImode */
1562   12,                                   /* cost of moving SSE register */
1563   {12, 12, 12},                         /* cost of loading SSE registers
1564                                            in SImode, DImode and TImode */
1565   {2, 2, 8},                            /* cost of storing SSE registers
1566                                            in SImode, DImode and TImode */
1567   10,                                   /* MMX or SSE register to integer */
1568   8,                                    /* size of l1 cache.  */
1569   256,                                  /* size of l2 cache.  */
1570   64,                                   /* size of prefetch block */
1571   6,                                    /* number of parallel prefetches */
1572   2,                                    /* Branch cost */
1573   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1574   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
1575   COSTS_N_INSNS (43),                   /* cost of FDIV instruction.  */
1576   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1577   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1578   COSTS_N_INSNS (43),                   /* cost of FSQRT instruction.  */
1579   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1580   pentium4_memcpy,                      /* memcpy strategy table.  */
1581   pentium4_memset,                      /* memset strategy table.  */
1582   1,                                    /* scalar_stmt_cost.  */
1583   1,                                    /* scalar_load_cost.  */
1584   1,                                    /* scalar_store_cost.  */
1585   1,                                    /* vec_stmt_cost.  */
1586   1,                                    /* vec_to_scalar_cost.  */
1587   1,                                    /* scalar_to_vec_cost.  */
1588   1,                                    /* vec_align_load_cost.  */
1589   2,                                    /* vec_unalign_load_cost.  */
1590   1,                                    /* vec_store_cost.  */
1591   3,                                    /* cond_taken_branch_cost.  */
1592   1,                                    /* cond_not_taken_branch_cost.  */
1593 };  /* end of pentium4_cost */
1594
1595 static stringop_algs nocona_memcpy[2] = {  /* memcpy strategy table referenced by nocona_cost.  */
1596   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1597   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1598              {100000, unrolled_loop, false}, {-1, libcall, false}}}};  /* NOTE(review): rows presumably = 32-bit, 64-bit code; confirm.  */
1599
1600 static stringop_algs nocona_memset[2] = {  /* memset strategy table referenced by nocona_cost.  */
1601   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1602              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1603   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1604              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* NOTE(review): rows presumably = 32-bit, 64-bit code; confirm.  */
1605
1606 static const
1607 struct processor_costs nocona_cost = {  /* nocona tuning costs; positional, so order matters.  */
1608   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1609   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1610   COSTS_N_INSNS (1),                    /* variable shift costs */
1611   COSTS_N_INSNS (1),                    /* constant shift costs */
1612   {COSTS_N_INSNS (10),                  /* cost of starting multiply for QI */
1613    COSTS_N_INSNS (10),                  /*                               HI */
1614    COSTS_N_INSNS (10),                  /*                               SI */
1615    COSTS_N_INSNS (10),                  /*                               DI */
1616    COSTS_N_INSNS (10)},                 /*                            other */
1617   0,                                    /* cost of multiply per each bit set */
1618   {COSTS_N_INSNS (66),                  /* cost of a divide/mod for QI */
1619    COSTS_N_INSNS (66),                  /*                          HI */
1620    COSTS_N_INSNS (66),                  /*                          SI */
1621    COSTS_N_INSNS (66),                  /*                          DI */
1622    COSTS_N_INSNS (66)},                 /*                          other */
1623   COSTS_N_INSNS (1),                    /* cost of movsx */
1624   COSTS_N_INSNS (1),                    /* cost of movzx */
1625   16,                                   /* "large" insn */
1626   17,                                   /* MOVE_RATIO */
1627   4,                                 /* cost for loading QImode using movzbl */
1628   {4, 4, 4},                            /* cost of loading integer registers
1629                                            in QImode, HImode and SImode.
1630                                            Relative to reg-reg move (2).  */
1631   {4, 4, 4},                            /* cost of storing integer registers */
1632   3,                                    /* cost of reg,reg fld/fst */
1633   {12, 12, 12},                         /* cost of loading fp registers
1634                                            in SFmode, DFmode and XFmode */
1635   {4, 4, 4},                            /* cost of storing fp registers
1636                                            in SFmode, DFmode and XFmode */
1637   6,                                    /* cost of moving MMX register */
1638   {12, 12},                             /* cost of loading MMX registers
1639                                            in SImode and DImode */
1640   {12, 12},                             /* cost of storing MMX registers
1641                                            in SImode and DImode */
1642   6,                                    /* cost of moving SSE register */
1643   {12, 12, 12},                         /* cost of loading SSE registers
1644                                            in SImode, DImode and TImode */
1645   {12, 12, 12},                         /* cost of storing SSE registers
1646                                            in SImode, DImode and TImode */
1647   8,                                    /* MMX or SSE register to integer */
1648   8,                                    /* size of l1 cache.  */
1649   1024,                                 /* size of l2 cache.  */
1650   64,                                   /* size of prefetch block */
1651   8,                                    /* number of parallel prefetches */
1652   1,                                    /* Branch cost */
1653   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1654   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
1655   COSTS_N_INSNS (40),                   /* cost of FDIV instruction.  */
1656   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
1657   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
1658   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
1659   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1660   nocona_memcpy,                        /* memcpy strategy table.  */
1661   nocona_memset,                        /* memset strategy table.  */
1662   1,                                    /* scalar_stmt_cost.  */
1663   1,                                    /* scalar_load_cost.  */
1664   1,                                    /* scalar_store_cost.  */
1665   1,                                    /* vec_stmt_cost.  */
1666   1,                                    /* vec_to_scalar_cost.  */
1667   1,                                    /* scalar_to_vec_cost.  */
1668   1,                                    /* vec_align_load_cost.  */
1669   2,                                    /* vec_unalign_load_cost.  */
1670   1,                                    /* vec_store_cost.  */
1671   3,                                    /* cond_taken_branch_cost.  */
1672   1,                                    /* cond_not_taken_branch_cost.  */
1673 };  /* end of nocona_cost */
1674
1675 static stringop_algs atom_memcpy[2] = {  /* memcpy strategy table referenced by atom_cost.  */
1676   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1677   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1678              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* NOTE(review): rows presumably = 32-bit, 64-bit code; confirm.  */
1679 static stringop_algs atom_memset[2] = {  /* memset strategy table referenced by atom_cost.  */
1680   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1681              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1682   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1683              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* NOTE(review): rows presumably = 32-bit, 64-bit code; confirm.  */
1684 static const
1685 struct processor_costs atom_cost = {  /* atom tuning costs; positional, so order matters.  */
1686   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1687   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction; the + 1 ranks it just above an add */
1688   COSTS_N_INSNS (1),                    /* variable shift costs */
1689   COSTS_N_INSNS (1),                    /* constant shift costs */
1690   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1691    COSTS_N_INSNS (4),                   /*                               HI */
1692    COSTS_N_INSNS (3),                   /*                               SI */
1693    COSTS_N_INSNS (4),                   /*                               DI */
1694    COSTS_N_INSNS (2)},                  /*                            other */
1695   0,                                    /* cost of multiply per each bit set */
1696   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
1697    COSTS_N_INSNS (26),                  /*                          HI */
1698    COSTS_N_INSNS (42),                  /*                          SI */
1699    COSTS_N_INSNS (74),                  /*                          DI */
1700    COSTS_N_INSNS (74)},                 /*                          other */
1701   COSTS_N_INSNS (1),                    /* cost of movsx */
1702   COSTS_N_INSNS (1),                    /* cost of movzx */
1703   8,                                    /* "large" insn */
1704   17,                                   /* MOVE_RATIO */
1705   4,                                    /* cost for loading QImode using movzbl */
1706   {4, 4, 4},                            /* cost of loading integer registers
1707                                            in QImode, HImode and SImode.
1708                                            Relative to reg-reg move (2).  */
1709   {4, 4, 4},                            /* cost of storing integer registers */
1710   4,                                    /* cost of reg,reg fld/fst */
1711   {12, 12, 12},                         /* cost of loading fp registers
1712                                            in SFmode, DFmode and XFmode */
1713   {6, 6, 8},                            /* cost of storing fp registers
1714                                            in SFmode, DFmode and XFmode */
1715   2,                                    /* cost of moving MMX register */
1716   {8, 8},                               /* cost of loading MMX registers
1717                                            in SImode and DImode */
1718   {8, 8},                               /* cost of storing MMX registers
1719                                            in SImode and DImode */
1720   2,                                    /* cost of moving SSE register */
1721   {8, 8, 8},                            /* cost of loading SSE registers
1722                                            in SImode, DImode and TImode */
1723   {8, 8, 8},                            /* cost of storing SSE registers
1724                                            in SImode, DImode and TImode */
1725   5,                                    /* MMX or SSE register to integer */
1726   32,                                   /* size of l1 cache.  */
1727   256,                                  /* size of l2 cache.  */
1728   64,                                   /* size of prefetch block */
1729   6,                                    /* number of parallel prefetches */
1730   3,                                    /* Branch cost */
1731   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
1732   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
1733   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
1734   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
1735   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
1736   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
1737   2, 2, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
1738   atom_memcpy,                          /* memcpy strategy table.  */
1739   atom_memset,                          /* memset strategy table.  */
1740   1,                                    /* scalar_stmt_cost.  */
1741   1,                                    /* scalar_load_cost.  */
1742   1,                                    /* scalar_store_cost.  */
1743   1,                                    /* vec_stmt_cost.  */
1744   1,                                    /* vec_to_scalar_cost.  */
1745   1,                                    /* scalar_to_vec_cost.  */
1746   1,                                    /* vec_align_load_cost.  */
1747   2,                                    /* vec_unalign_load_cost.  */
1748   1,                                    /* vec_store_cost.  */
1749   3,                                    /* cond_taken_branch_cost.  */
1750   1,                                    /* cond_not_taken_branch_cost.  */
1751 };  /* end of atom_cost */
1752
1753 static stringop_algs slm_memcpy[2] = {  /* memcpy strategy table referenced by slm_cost.  */
1754   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1755   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1756              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* Same rows as atom_memcpy.  */
1757 static stringop_algs slm_memset[2] = {  /* memset strategy table referenced by slm_cost.  */
1758   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1759              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1760   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1761              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* Same rows as atom_memset.  */
1762 static const
1763 struct processor_costs slm_cost = {  /* slm tuning costs; positional, so order matters.  */
1764   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1765   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction; the + 1 ranks it just above an add */
1766   COSTS_N_INSNS (1),                    /* variable shift costs */
1767   COSTS_N_INSNS (1),                    /* constant shift costs */
1768   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1769    COSTS_N_INSNS (3),                   /*                               HI */
1770    COSTS_N_INSNS (3),                   /*                               SI */
1771    COSTS_N_INSNS (4),                   /*                               DI */
1772    COSTS_N_INSNS (2)},                  /*                            other */
1773   0,                                    /* cost of multiply per each bit set */
1774   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
1775    COSTS_N_INSNS (26),                  /*                          HI */
1776    COSTS_N_INSNS (42),                  /*                          SI */
1777    COSTS_N_INSNS (74),                  /*                          DI */
1778    COSTS_N_INSNS (74)},                 /*                          other */
1779   COSTS_N_INSNS (1),                    /* cost of movsx */
1780   COSTS_N_INSNS (1),                    /* cost of movzx */
1781   8,                                    /* "large" insn */
1782   17,                                   /* MOVE_RATIO */
1783   4,                                    /* cost for loading QImode using movzbl */
1784   {4, 4, 4},                            /* cost of loading integer registers
1785                                            in QImode, HImode and SImode.
1786                                            Relative to reg-reg move (2).  */
1787   {4, 4, 4},                            /* cost of storing integer registers */
1788   4,                                    /* cost of reg,reg fld/fst */
1789   {12, 12, 12},                         /* cost of loading fp registers
1790                                            in SFmode, DFmode and XFmode */
1791   {6, 6, 8},                            /* cost of storing fp registers
1792                                            in SFmode, DFmode and XFmode */
1793   2,                                    /* cost of moving MMX register */
1794   {8, 8},                               /* cost of loading MMX registers
1795                                            in SImode and DImode */
1796   {8, 8},                               /* cost of storing MMX registers
1797                                            in SImode and DImode */
1798   2,                                    /* cost of moving SSE register */
1799   {8, 8, 8},                            /* cost of loading SSE registers
1800                                            in SImode, DImode and TImode */
1801   {8, 8, 8},                            /* cost of storing SSE registers
1802                                            in SImode, DImode and TImode */
1803   5,                                    /* MMX or SSE register to integer */
1804   32,                                   /* size of l1 cache.  */
1805   256,                                  /* size of l2 cache.  */
1806   64,                                   /* size of prefetch block */
1807   6,                                    /* number of parallel prefetches */
1808   3,                                    /* Branch cost */
1809   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
1810   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
1811   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
1812   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
1813   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
1814   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
1815   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1816   slm_memcpy,                           /* memcpy strategy table.  */
1817   slm_memset,                           /* memset strategy table.  */
1818   1,                                    /* scalar_stmt_cost.  */
1819   1,                                    /* scalar_load_cost.  */
1820   1,                                    /* scalar_store_cost.  */
1821   1,                                    /* vec_stmt_cost.  */
1822   4,                                    /* vec_to_scalar_cost.  */
1823   1,                                    /* scalar_to_vec_cost.  */
1824   1,                                    /* vec_align_load_cost.  */
1825   2,                                    /* vec_unalign_load_cost.  */
1826   1,                                    /* vec_store_cost.  */
1827   3,                                    /* cond_taken_branch_cost.  */
1828   1,                                    /* cond_not_taken_branch_cost.  */
1829 };  /* end of slm_cost */
1830
/* String-operation strategy table that intel_cost installs as its memcpy
   entry.  Both elements have the shape {algorithm, {triple, ...}}, where
   every triple is {limit, algorithm, flag} and the last triple of each
   list has limit -1.  Only the second element uses rep_prefix_8_byte.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code, the leading algorithm is used when the size is unknown,
   limit is the largest block size handled by that triple's algorithm and
   flag is "noalign" -- confirm against the stringop_algs declaration,
   which is not visible here.  */
1831 static stringop_algs intel_memcpy[2] = {
1832   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1833   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1834              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* String-operation strategy table that intel_cost installs as its memset
   entry.  Both elements have the shape {algorithm, {triple, ...}} with
   {limit, algorithm, flag} triples; each list goes loop, unrolled_loop,
   a rep-prefixed form, and finally libcall with limit -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code and limit is a block size in bytes -- confirm against the
   stringop_algs declaration, which is not visible here.  */
1835 static stringop_algs intel_memset[2] = {
1836   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1837              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1838   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1839              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table intel_cost.  The initializer is positional, so entries must
   stay in the field order of struct processor_costs; the comment beside
   each entry names the quantity it sets.  Per the note at the top of this
   file, costs are relative to an add and COSTS_N_INSNS (N) is assumed to
   be N * 4, so the lea entry (COSTS_N_INSNS (1) + 1) is rated just above
   an add.  String operations use intel_memcpy and intel_memset.
   NOTE(review): presumably the table used for -mtune=intel -- confirm
   where intel_cost is referenced.  */
1840 static const
1841 struct processor_costs intel_cost = {
1842   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1843   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
1844   COSTS_N_INSNS (1),                    /* variable shift costs */
1845   COSTS_N_INSNS (1),                    /* constant shift costs */
1846   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1847    COSTS_N_INSNS (3),                   /*                               HI */
1848    COSTS_N_INSNS (3),                   /*                               SI */
1849    COSTS_N_INSNS (4),                   /*                               DI */
1850    COSTS_N_INSNS (2)},                  /*                            other */
1851   0,                                    /* cost of multiply per each bit set */
1852   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
1853    COSTS_N_INSNS (26),                  /*                          HI */
1854    COSTS_N_INSNS (42),                  /*                          SI */
1855    COSTS_N_INSNS (74),                  /*                          DI */
1856    COSTS_N_INSNS (74)},                 /*                          other */
1857   COSTS_N_INSNS (1),                    /* cost of movsx */
1858   COSTS_N_INSNS (1),                    /* cost of movzx */
1859   8,                                    /* "large" insn */
1860   17,                                   /* MOVE_RATIO */
1861   4,                                    /* cost for loading QImode using movzbl */
1862   {4, 4, 4},                            /* cost of loading integer registers
1863                                            in QImode, HImode and SImode.
1864                                            Relative to reg-reg move (2).  */
1865   {4, 4, 4},                            /* cost of storing integer registers */
1866   4,                                    /* cost of reg,reg fld/fst */
1867   {12, 12, 12},                         /* cost of loading fp registers
1868                                            in SFmode, DFmode and XFmode */
1869   {6, 6, 8},                            /* cost of storing fp registers
1870                                            in SFmode, DFmode and XFmode */
1871   2,                                    /* cost of moving MMX register */
1872   {8, 8},                               /* cost of loading MMX registers
1873                                            in SImode and DImode */
1874   {8, 8},                               /* cost of storing MMX registers
1875                                            in SImode and DImode */
1876   2,                                    /* cost of moving SSE register */
1877   {8, 8, 8},                            /* cost of loading SSE registers
1878                                            in SImode, DImode and TImode */
1879   {8, 8, 8},                            /* cost of storing SSE registers
1880                                            in SImode, DImode and TImode */
1881   5,                                    /* MMX or SSE register to integer */
1882   32,                                   /* size of l1 cache.  */
1883   256,                                  /* size of l2 cache.  */
1884   64,                                   /* size of prefetch block */
1885   6,                                    /* number of parallel prefetches */
1886   3,                                    /* Branch cost */
1887   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
1888   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
1889   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
1890   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
1891   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
1892   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
1893   1, 4, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1894   intel_memcpy,                         /* memcpy strategy table.  */
1895   intel_memset,                         /* memset strategy table.  */
1896   1,                                    /* scalar_stmt_cost.  */
1897   1,                                    /* scalar_load_cost.  */
1898   1,                                    /* scalar_store_cost.  */
1899   1,                                    /* vec_stmt_cost.  */
1900   4,                                    /* vec_to_scalar_cost.  */
1901   1,                                    /* scalar_to_vec_cost.  */
1902   1,                                    /* vec_align_load_cost.  */
1903   2,                                    /* vec_unalign_load_cost.  */
1904   1,                                    /* vec_store_cost.  */
1905   3,                                    /* cond_taken_branch_cost.  */
1906   1,                                    /* cond_not_taken_branch_cost.  */
1907 };
1908
1909 /* Generic should produce code tuned for Core-i7 (and newer chips)
1910    and btver1 (and newer chips).  */
1911
/* String-operation strategy table that generic_cost installs as its
   memcpy entry.  Both elements have the shape {algorithm, {triple, ...}}
   with {limit, algorithm, flag} triples: loop up to limit 32, a
   rep-prefixed form up to limit 8192, then libcall with limit -1.  The
   two elements differ only in rep_prefix_4_byte versus rep_prefix_8_byte.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code and limit is a block size in bytes -- confirm against the
   stringop_algs declaration, which is not visible here.  */
1912 static stringop_algs generic_memcpy[2] = {
1913   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1914              {-1, libcall, false}}},
1915   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1916              {-1, libcall, false}}}};
/* String-operation strategy table that generic_cost installs as its
   memset entry.  The contents are entry-for-entry the same as
   generic_memcpy: loop up to limit 32, a rep-prefixed form up to limit
   8192, then libcall with limit -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code -- confirm against the stringop_algs declaration, which is
   not visible here.  */
1917 static stringop_algs generic_memset[2] = {
1918   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1919              {-1, libcall, false}}},
1920   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1921              {-1, libcall, false}}}};
/* Cost table generic_cost.  The initializer is positional, so entries
   must stay in the field order of struct processor_costs; the comment
   beside each entry names the quantity it sets.  Per the note at the top
   of this file, costs are relative to an add and COSTS_N_INSNS (N) is
   assumed to be N * 4.  Two entries are deliberately kept below the value
   the hardware would suggest (lea cost and branch cost); the comments next
   to them give the reason.  String operations use generic_memcpy and
   generic_memset.  */
1922 static const
1923 struct processor_costs generic_cost = {
1924   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1925   /* On all chips taken into consideration lea is 2 cycles and more.  With
1926      this cost however our current implementation of synth_mult results in
1927      use of unnecessary temporary registers causing regression on several
1928      SPECfp benchmarks.  */
1929   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
1930   COSTS_N_INSNS (1),                    /* variable shift costs */
1931   COSTS_N_INSNS (1),                    /* constant shift costs */
1932   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1933    COSTS_N_INSNS (4),                   /*                               HI */
1934    COSTS_N_INSNS (3),                   /*                               SI */
1935    COSTS_N_INSNS (4),                   /*                               DI */
1936    COSTS_N_INSNS (2)},                  /*                            other */
1937   0,                                    /* cost of multiply per each bit set */
1938   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
1939    COSTS_N_INSNS (26),                  /*                          HI */
1940    COSTS_N_INSNS (42),                  /*                          SI */
1941    COSTS_N_INSNS (74),                  /*                          DI */
1942    COSTS_N_INSNS (74)},                 /*                          other */
1943   COSTS_N_INSNS (1),                    /* cost of movsx */
1944   COSTS_N_INSNS (1),                    /* cost of movzx */
1945   8,                                    /* "large" insn */
1946   17,                                   /* MOVE_RATIO */
1947   4,                                 /* cost for loading QImode using movzbl */
1948   {4, 4, 4},                            /* cost of loading integer registers
1949                                            in QImode, HImode and SImode.
1950                                            Relative to reg-reg move (2).  */
1951   {4, 4, 4},                            /* cost of storing integer registers */
1952   4,                                    /* cost of reg,reg fld/fst */
1953   {12, 12, 12},                         /* cost of loading fp registers
1954                                            in SFmode, DFmode and XFmode */
1955   {6, 6, 8},                            /* cost of storing fp registers
1956                                            in SFmode, DFmode and XFmode */
1957   2,                                    /* cost of moving MMX register */
1958   {8, 8},                               /* cost of loading MMX registers
1959                                            in SImode and DImode */
1960   {8, 8},                               /* cost of storing MMX registers
1961                                            in SImode and DImode */
1962   2,                                    /* cost of moving SSE register */
1963   {8, 8, 8},                            /* cost of loading SSE registers
1964                                            in SImode, DImode and TImode */
1965   {8, 8, 8},                            /* cost of storing SSE registers
1966                                            in SImode, DImode and TImode */
1967   5,                                    /* MMX or SSE register to integer */
1968   32,                                   /* size of l1 cache.  */
1969   512,                                  /* size of l2 cache.  */
1970   64,                                   /* size of prefetch block */
1971   6,                                    /* number of parallel prefetches */
1972   /* Benchmarks show large regressions on the K8 sixtrack benchmark when
1973      this value is increased to the perhaps more appropriate value of 5.  */
1974   3,                                    /* Branch cost */
1975   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
1976   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
1977   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
1978   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
1979   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
1980   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
1981   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1982   generic_memcpy,                       /* memcpy strategy table.  */
1983   generic_memset,                       /* memset strategy table.  */
1984   1,                                    /* scalar_stmt_cost.  */
1985   1,                                    /* scalar_load_cost.  */
1986   1,                                    /* scalar_store_cost.  */
1987   1,                                    /* vec_stmt_cost.  */
1988   1,                                    /* vec_to_scalar_cost.  */
1989   1,                                    /* scalar_to_vec_cost.  */
1990   1,                                    /* vec_align_load_cost.  */
1991   2,                                    /* vec_unalign_load_cost.  */
1992   1,                                    /* vec_store_cost.  */
1993   3,                                    /* cond_taken_branch_cost.  */
1994   1,                                    /* cond_not_taken_branch_cost.  */
1995 };
1996
1997 /* core_cost should produce code tuned for the Core family of CPUs.  */
/* String-operation strategy table that core_cost installs as its memcpy
   entry.  Both elements have the shape {algorithm, {triple, ...}} with
   {limit, algorithm, flag} triples.  Every triple sets the flag to true
   except the closing libcall triple, which has limit -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code, limit is a block size in bytes and flag is "noalign" --
   confirm against the stringop_algs declaration, which is not visible
   here.  */
1998 static stringop_algs core_memcpy[2] = {
1999   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2000   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2001              {-1, libcall, false}}}};
/* String-operation strategy table that core_cost installs as its memset
   entry.  Both elements have the shape {algorithm, {triple, ...}} with
   {limit, algorithm, flag} triples.  The first element starts with a
   loop_1_byte triple for the smallest limit (6) before switching to loop;
   the second element has no such triple.  Every triple sets the flag to
   true except the closing libcall triple, which has limit -1.
   NOTE(review): presumably element 0 is for 32-bit and element 1 for
   64-bit code, limit is a block size in bytes and flag is "noalign" --
   confirm against the stringop_algs declaration, which is not visible
   here.  */
2002 static stringop_algs core_memset[2] = {
2003   {libcall, {{6, loop_1_byte, true},
2004              {24, loop, true},
2005              {8192, rep_prefix_4_byte, true},
2006              {-1, libcall, false}}},
2007   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2008              {-1, libcall, false}}}};
2009
/* Cost table core_cost.  The initializer is positional, so entries must
   stay in the field order of struct processor_costs; the comment beside
   each entry names the quantity it sets.  Per the note at the top of this
   file, costs are relative to an add and COSTS_N_INSNS (N) is assumed to
   be N * 4.  The lea cost is deliberately kept low for the reason given
   next to it, and the branch cost carries an open FIXME.  String
   operations use core_memcpy and core_memset.  */
2010 static const
2011 struct processor_costs core_cost = {
2012   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2013   /* On all chips taken into consideration lea is 2 cycles and more.  With
2014      this cost however our current implementation of synth_mult results in
2015      use of unnecessary temporary registers causing regression on several
2016      SPECfp benchmarks.  */
2017   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2018   COSTS_N_INSNS (1),                    /* variable shift costs */
2019   COSTS_N_INSNS (1),                    /* constant shift costs */
2020   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2021    COSTS_N_INSNS (4),                   /*                               HI */
2022    COSTS_N_INSNS (3),                   /*                               SI */
2023    COSTS_N_INSNS (4),                   /*                               DI */
2024    COSTS_N_INSNS (2)},                  /*                            other */
2025   0,                                    /* cost of multiply per each bit set */
2026   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2027    COSTS_N_INSNS (26),                  /*                          HI */
2028    COSTS_N_INSNS (42),                  /*                          SI */
2029    COSTS_N_INSNS (74),                  /*                          DI */
2030    COSTS_N_INSNS (74)},                 /*                          other */
2031   COSTS_N_INSNS (1),                    /* cost of movsx */
2032   COSTS_N_INSNS (1),                    /* cost of movzx */
2033   8,                                    /* "large" insn */
2034   17,                                   /* MOVE_RATIO */
2035   4,                                 /* cost for loading QImode using movzbl */
2036   {4, 4, 4},                            /* cost of loading integer registers
2037                                            in QImode, HImode and SImode.
2038                                            Relative to reg-reg move (2).  */
2039   {4, 4, 4},                            /* cost of storing integer registers */
2040   4,                                    /* cost of reg,reg fld/fst */
2041   {12, 12, 12},                         /* cost of loading fp registers
2042                                            in SFmode, DFmode and XFmode */
2043   {6, 6, 8},                            /* cost of storing fp registers
2044                                            in SFmode, DFmode and XFmode */
2045   2,                                    /* cost of moving MMX register */
2046   {8, 8},                               /* cost of loading MMX registers
2047                                            in SImode and DImode */
2048   {8, 8},                               /* cost of storing MMX registers
2049                                            in SImode and DImode */
2050   2,                                    /* cost of moving SSE register */
2051   {8, 8, 8},                            /* cost of loading SSE registers
2052                                            in SImode, DImode and TImode */
2053   {8, 8, 8},                            /* cost of storing SSE registers
2054                                            in SImode, DImode and TImode */
2055   5,                                    /* MMX or SSE register to integer */
2056   64,                                   /* size of l1 cache.  */
2057   512,                                  /* size of l2 cache.  */
2058   64,                                   /* size of prefetch block */
2059   6,                                    /* number of parallel prefetches */
2060   /* FIXME perhaps more appropriate value is 5.  */
2061   3,                                    /* Branch cost */
2062   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2063   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2064   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2065   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2066   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2067   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2068   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2069   core_memcpy,                          /* memcpy strategy table.  */
2070   core_memset,                          /* memset strategy table.  */
2071   1,                                    /* scalar_stmt_cost.  */
2072   1,                                    /* scalar_load_cost.  */
2073   1,                                    /* scalar_store_cost.  */
2074   1,                                    /* vec_stmt_cost.  */
2075   1,                                    /* vec_to_scalar_cost.  */
2076   1,                                    /* scalar_to_vec_cost.  */
2077   1,                                    /* vec_align_load_cost.  */
2078   2,                                    /* vec_unalign_load_cost.  */
2079   1,                                    /* vec_store_cost.  */
2080   3,                                    /* cond_taken_branch_cost.  */
2081   1,                                    /* cond_not_taken_branch_cost.  */
2082 };
2083