sysdeps/x86/dl-cacheinfo.h

   1 /* Initialize x86 cache info.
   2    Copyright (C) 2020-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 static const struct intel_02_cache_info
  20 {
  21   unsigned char idx;
  22   unsigned char assoc;
  23   unsigned char linesize;
  24   unsigned char rel_name;
  25   unsigned int size;
  26 } intel_02_known [] =
  27   {
  28 #define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
  29     { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    8192 },
  30     { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   16384 },
  31     { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
  32     { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
  33     { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
  34     { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
  35     { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),   24576 },
  36     { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
  37     { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
  38     { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
  39     { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
  40     { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
  41     { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
  42     { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
  43     { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
  44     { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   196608 },
  45     { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
  46     { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
  47     { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   393216 },
  48     { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
  49     { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
  50     { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
  51     { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
  52     { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
  53     { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
  54     { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
  55     { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
  56     { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
  57     { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),  3145728 },
  58     { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),  4194304 },
  59     { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  6291456 },
  60     { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
  61     { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
  62     { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 16777216 },
  63     { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),  6291456 },
  64     { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
  65     { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
  66     { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
  67     { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
  68     { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
  69     { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
  70     { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
  71     { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
  72     { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
  73     { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
  74     { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
  75     { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
  76     { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
  77     { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
  78     { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
  79     { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
  80     { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
  81     { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
  82     { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
  83     { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
  84     { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
  85     { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
  86     { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
  87     { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
  88     { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
  89     { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
  90     { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
  91     { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
  92     { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
  93     { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
  94     { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
  95     { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 },
  96     { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 },
  97   };
  98
  99 #define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))
 100
 101 static int
 102 intel_02_known_compare (const void *p1, const void *p2)
 103 {
 104   const struct intel_02_cache_info *i1;
 105   const struct intel_02_cache_info *i2;
 106
 107   i1 = (const struct intel_02_cache_info *) p1;
 108   i2 = (const struct intel_02_cache_info *) p2;
 109
 110   if (i1->idx == i2->idx)
 111     return 0;
 112
 113   return i1->idx < i2->idx ? -1 : 1;
 114 }
 115
 116
 117 static long int
 118 __attribute__ ((noinline))
 119 intel_check_word (int name, unsigned int value, bool *has_level_2,
 120                   bool *no_level_2_or_3,
 121                   const struct cpu_features *cpu_features)
 122 {
 123   if ((value & 0x80000000) != 0)
 124     /* The register value is reserved.  */
 125     return 0;
 126
 127   /* Fold the name.  The _SC_ constants are always in the order SIZE,
 128      ASSOC, LINESIZE.  */
 129   int folded_rel_name = (M(name) / 3) * 3;
 130
 131   while (value != 0)
 132     {
 133       unsigned int byte = value & 0xff;
 134
 135       if (byte == 0x40)
 136         {
 137           *no_level_2_or_3 = true;
 138
 139           if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
 140             /* No need to look further.  */
 141             break;
 142         }
 143       else if (byte == 0xff)
 144         {
 145           /* CPUID leaf 0x4 contains all the information.  We need to
 146              iterate over it.  */
 147           unsigned int eax;
 148           unsigned int ebx;
 149           unsigned int ecx;
 150           unsigned int edx;
 151
 152           unsigned int round = 0;
 153           while (1)
 154             {
 155               __cpuid_count (4, round, eax, ebx, ecx, edx);
 156
 157               enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
 158               if (type == null)
 159                 /* That was the end.  */
 160                 break;
 161
 162               unsigned int level = (eax >> 5) & 0x7;
 163
 164               if ((level == 1 && type == data
 165                    && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
 166                   || (level == 1 && type == inst
 167                       && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
 168                   || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
 169                   || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
 170                   || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
 171                 {
 172                   unsigned int offset = M(name) - folded_rel_name;
 173
 174                   if (offset == 0)
 175                     /* Cache size.  */
 176                     return (((ebx >> 22) + 1)
 177                             * (((ebx >> 12) & 0x3ff) + 1)
 178                             * ((ebx & 0xfff) + 1)
 179                             * (ecx + 1));
 180                   if (offset == 1)
 181                     return (ebx >> 22) + 1;
 182
 183                   assert (offset == 2);
 184                   return (ebx & 0xfff) + 1;
 185                 }
 186
 187               ++round;
 188             }
 189           /* There is no other cache information anywhere else.  */
 190           break;
 191         }
 192       else
 193         {
 194           if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
 195             {
 196               /* Intel reused this value.  For family 15, model 6 it
 197                  specifies the 3rd level cache.  Otherwise the 2nd
 198                  level cache.  */
 199               unsigned int family = cpu_features->basic.family;
 200               unsigned int model = cpu_features->basic.model;
 201
 202               if (family == 15 && model == 6)
 203                 {
 204                   /* The level 3 cache is encoded for this model like
 205                      the level 2 cache is for other models.  Pretend
 206                      the caller asked for the level 2 cache.  */
 207                   name = (_SC_LEVEL2_CACHE_SIZE
 208                           + (name - _SC_LEVEL3_CACHE_SIZE));
 209                   folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
 210                 }
 211             }
 212
 213           struct intel_02_cache_info *found;
 214           struct intel_02_cache_info search;
 215
 216           search.idx = byte;
 217           found = bsearch (&search, intel_02_known, nintel_02_known,
 218                            sizeof (intel_02_known[0]), intel_02_known_compare);
 219           if (found != NULL)
 220             {
 221               if (found->rel_name == folded_rel_name)
 222                 {
 223                   unsigned int offset = M(name) - folded_rel_name;
 224
 225                   if (offset == 0)
 226                     /* Cache size.  */
 227                     return found->size;
 228                   if (offset == 1)
 229                     return found->assoc;
 230
 231                   assert (offset == 2);
 232                   return found->linesize;
 233                 }
 234
 235               if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
 236                 *has_level_2 = true;
 237             }
 238         }
 239
 240       /* Next byte for the next round.  */
 241       value >>= 8;
 242     }
 243
 244   /* Nothing found.  */
 245   return 0;
 246 }
 247
 248
 249 static long int __attribute__ ((noinline))
 250 handle_intel (int name, const struct cpu_features *cpu_features)
 251 {
 252   unsigned int maxidx = cpu_features->basic.max_cpuid;
 253
 254   /* Return -1 for older CPUs.  */
 255   if (maxidx < 2)
 256     return -1;
 257
 258   /* OK, we can use the CPUID instruction to get all info about the
 259      caches.  */
 260   unsigned int cnt = 0;
 261   unsigned int max = 1;
 262   long int result = 0;
 263   bool no_level_2_or_3 = false;
 264   bool has_level_2 = false;
 265
 266   while (cnt++ < max)
 267     {
 268       unsigned int eax;
 269       unsigned int ebx;
 270       unsigned int ecx;
 271       unsigned int edx;
 272       __cpuid (2, eax, ebx, ecx, edx);
 273
 274       /* The low byte of EAX in the first round contain the number of
 275          rounds we have to make.  At least one, the one we are already
 276          doing.  */
 277       if (cnt == 1)
 278         {
 279           max = eax & 0xff;
 280           eax &= 0xffffff00;
 281         }
 282
 283       /* Process the individual registers' value.  */
 284       result = intel_check_word (name, eax, &has_level_2,
 285                                  &no_level_2_or_3, cpu_features);
 286       if (result != 0)
 287         return result;
 288
 289       result = intel_check_word (name, ebx, &has_level_2,
 290                                  &no_level_2_or_3, cpu_features);
 291       if (result != 0)
 292         return result;
 293
 294       result = intel_check_word (name, ecx, &has_level_2,
 295                                  &no_level_2_or_3, cpu_features);
 296       if (result != 0)
 297         return result;
 298
 299       result = intel_check_word (name, edx, &has_level_2,
 300                                  &no_level_2_or_3, cpu_features);
 301       if (result != 0)
 302         return result;
 303     }
 304
 305   if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
 306       && no_level_2_or_3)
 307     return -1;
 308
 309   return 0;
 310 }
 311
 312
 313 static long int __attribute__ ((noinline))
 314 handle_amd (int name)
 315 {
 316   unsigned int eax;
 317   unsigned int ebx;
 318   unsigned int ecx;
 319   unsigned int edx;
 320   unsigned int count = 0x1;
 321
 322   /* No level 4 cache (yet).  */
 323   if (name > _SC_LEVEL3_CACHE_LINESIZE)
 324     return 0;
 325
 326   if (name >= _SC_LEVEL3_CACHE_SIZE)
 327     count = 0x3;
 328   else if (name >= _SC_LEVEL2_CACHE_SIZE)
 329     count = 0x2;
 330   else if (name >= _SC_LEVEL1_DCACHE_SIZE)
 331     count = 0x0;
 332
 333   __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
 334
 335   switch (name)
 336     {
 337     case _SC_LEVEL1_ICACHE_ASSOC:
 338     case _SC_LEVEL1_DCACHE_ASSOC:
 339     case _SC_LEVEL2_CACHE_ASSOC:
 340     case _SC_LEVEL3_CACHE_ASSOC:
 341       return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0;
 342     case _SC_LEVEL1_ICACHE_LINESIZE:
 343     case _SC_LEVEL1_DCACHE_LINESIZE:
 344     case _SC_LEVEL2_CACHE_LINESIZE:
 345     case _SC_LEVEL3_CACHE_LINESIZE:
 346       return ecx ? (ebx & 0xfff) + 1 : 0;
 347     case _SC_LEVEL1_ICACHE_SIZE:
 348     case _SC_LEVEL1_DCACHE_SIZE:
 349     case _SC_LEVEL2_CACHE_SIZE:
 350     case _SC_LEVEL3_CACHE_SIZE:
 351       return ecx ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1): 0;
 352     default:
 353       __builtin_unreachable ();
 354     }
 355   return -1;
 356 }
 357
 358
 359 static long int __attribute__ ((noinline))
 360 handle_zhaoxin (int name)
 361 {
 362   unsigned int eax;
 363   unsigned int ebx;
 364   unsigned int ecx;
 365   unsigned int edx;
 366
 367   int folded_rel_name = (M(name) / 3) * 3;
 368
 369   unsigned int round = 0;
 370   while (1)
 371     {
 372       __cpuid_count (4, round, eax, ebx, ecx, edx);
 373
 374       enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
 375       if (type == null)
 376         break;
 377
 378       unsigned int level = (eax >> 5) & 0x7;
 379
 380       if ((level == 1 && type == data
 381         && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
 382         || (level == 1 && type == inst
 383             && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
 384         || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
 385         || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
 386         {
 387           unsigned int offset = M(name) - folded_rel_name;
 388
 389           if (offset == 0)
 390             /* Cache size.  */
 391             return (((ebx >> 22) + 1)
 392                 * (((ebx >> 12) & 0x3ff) + 1)
 393                 * ((ebx & 0xfff) + 1)
 394                 * (ecx + 1));
 395           if (offset == 1)
 396             return (ebx >> 22) + 1;
 397
 398           assert (offset == 2);
 399           return (ebx & 0xfff) + 1;
 400         }
 401
 402       ++round;
 403     }
 404
 405   /* Nothing found.  */
 406   return 0;
 407 }
 408
 409 static void
 410 get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
 411                 long int core)
 412 {
 413   unsigned int eax;
 414   unsigned int ebx;
 415   unsigned int ecx;
 416   unsigned int edx;
 417
 418   /* Number of logical processors sharing L2 cache.  */
 419   int threads_l2;
 420
 421   /* Number of logical processors sharing L3 cache.  */
 422   int threads_l3;
 423
 424   const struct cpu_features *cpu_features = __get_cpu_features ();
 425   int max_cpuid = cpu_features->basic.max_cpuid;
 426   unsigned int family = cpu_features->basic.family;
 427   unsigned int model = cpu_features->basic.model;
 428   long int shared = *shared_ptr;
 429   long int shared_per_thread = *shared_per_thread_ptr;
 430   unsigned int threads = *threads_ptr;
 431   bool inclusive_cache = true;
 432   bool support_count_mask = true;
 433
 434   /* Try L3 first.  */
 435   unsigned int level = 3;
 436
 437   if (cpu_features->basic.kind == arch_kind_zhaoxin && family == 6)
 438     support_count_mask = false;
 439
 440   if (shared <= 0)
 441     {
 442       /* Try L2 otherwise.  */
 443       level  = 2;
 444       shared = core;
 445       shared_per_thread = core;
 446       threads_l2 = 0;
 447       threads_l3 = -1;
 448     }
 449   else
 450     {
 451       threads_l2 = 0;
 452       threads_l3 = 0;
 453     }
 454
 455   /* A value of 0 for the HTT bit indicates there is only a single
 456      logical processor.  */
 457   if (HAS_CPU_FEATURE (HTT))
 458     {
 459       /* Figure out the number of logical threads that share the
 460          highest cache level.  */
 461       if (max_cpuid >= 4)
 462         {
 463           int i = 0;
 464
 465           /* Query until cache level 2 and 3 are enumerated.  */
 466           int check = 0x1 | (threads_l3 == 0) << 1;
 467           do
 468             {
 469               __cpuid_count (4, i++, eax, ebx, ecx, edx);
 470
 471               /* There seems to be a bug in at least some Pentium Ds
 472                  which sometimes fail to iterate all cache parameters.
 473                  Do not loop indefinitely here, stop in this case and
 474                  assume there is no such information.  */
 475               if (cpu_features->basic.kind == arch_kind_intel
 476                   && (eax & 0x1f) == 0 )
 477                 goto intel_bug_no_cache_info;
 478
 479               switch ((eax >> 5) & 0x7)
 480                 {
 481                   default:
 482                     break;
 483                   case 2:
 484                     if ((check & 0x1))
 485                       {
 486                         /* Get maximum number of logical processors
 487                            sharing L2 cache.  */
 488                         threads_l2 = (eax >> 14) & 0x3ff;
 489                         check &= ~0x1;
 490                       }
 491                     break;
 492                   case 3:
 493                     if ((check & (0x1 << 1)))
 494                       {
 495                         /* Get maximum number of logical processors
 496                            sharing L3 cache.  */
 497                         threads_l3 = (eax >> 14) & 0x3ff;
 498
 499                         /* Check if L2 and L3 caches are inclusive.  */
 500                         inclusive_cache = (edx & 0x2) != 0;
 501                         check &= ~(0x1 << 1);
 502                       }
 503                     break;
 504                 }
 505             }
 506           while (check);
 507
 508           /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
 509              numbers of addressable IDs for logical processors sharing
 510              the cache, instead of the maximum number of threads
 511              sharing the cache.  */
 512           if (max_cpuid >= 11 && support_count_mask)
 513             {
 514               /* Find the number of logical processors shipped in
 515                  one core and apply count mask.  */
 516               i = 0;
 517
 518               /* Count SMT only if there is L3 cache.  Always count
 519                  core if there is no L3 cache.  */
 520               int count = ((threads_l2 > 0 && level == 3)
 521                            | ((threads_l3 > 0
 522                                || (threads_l2 > 0 && level == 2)) << 1));
 523
 524               while (count)
 525                 {
 526                   __cpuid_count (11, i++, eax, ebx, ecx, edx);
 527
 528                   int shipped = ebx & 0xff;
 529                   int type = ecx & 0xff00;
 530                   if (shipped == 0 || type == 0)
 531                     break;
 532                   else if (type == 0x100)
 533                     {
 534                       /* Count SMT.  */
 535                       if ((count & 0x1))
 536                         {
 537                           int count_mask;
 538
 539                           /* Compute count mask.  */
 540                           asm ("bsr %1, %0"
 541                                : "=r" (count_mask) : "g" (threads_l2));
 542                           count_mask = ~(-1 << (count_mask + 1));
 543                           threads_l2 = (shipped - 1) & count_mask;
 544                           count &= ~0x1;
 545                         }
 546                     }
 547                   else if (type == 0x200)
 548                     {
 549                       /* Count core.  */
 550                       if ((count & (0x1 << 1)))
 551                         {
 552                           int count_mask;
 553                           int threads_core
 554                             = (level == 2 ? threads_l2 : threads_l3);
 555
 556                           /* Compute count mask.  */
 557                           asm ("bsr %1, %0"
 558                                : "=r" (count_mask) : "g" (threads_core));
 559                           count_mask = ~(-1 << (count_mask + 1));
 560                           threads_core = (shipped - 1) & count_mask;
 561                           if (level == 2)
 562                             threads_l2 = threads_core;
 563                           else
 564                             threads_l3 = threads_core;
 565                           count &= ~(0x1 << 1);
 566                         }
 567                     }
 568                 }
 569             }
 570           if (threads_l2 > 0)
 571             threads_l2 += 1;
 572           if (threads_l3 > 0)
 573             threads_l3 += 1;
 574           if (level == 2)
 575             {
 576               if (threads_l2)
 577                 {
 578                   threads = threads_l2;
 579                   if (cpu_features->basic.kind == arch_kind_intel
 580                       && threads > 2
 581                       && family == 6)
 582                     switch (model)
 583                       {
 584                         case 0x37:
 585                         case 0x4a:
 586                         case 0x4d:
 587                         case 0x5a:
 588                         case 0x5d:
 589                           /* Silvermont has L2 cache shared by 2 cores.  */
 590                           threads = 2;
 591                           break;
 592                         default:
 593                           break;
 594                       }
 595                 }
 596             }
 597           else if (threads_l3)
 598             threads = threads_l3;
 599         }
 600       else
 601         {
 602         intel_bug_no_cache_info:
 603           /* Assume that all logical threads share the highest cache
 604              level.  */
 605           threads = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
 606                      & 0xff);
 607
 608           /* Get per-thread size of highest level cache.  */
 609           if (shared_per_thread > 0 && threads > 0)
 610             shared_per_thread /= threads;
 611         }
 612     }
 613
 614   /* Account for non-inclusive L2 and L3 caches.  */
 615   if (!inclusive_cache)
 616     {
 617       if (threads_l2 > 0)
 618         shared_per_thread += core / threads_l2;
 619       shared += core;
 620     }
 621
 622   *shared_ptr = shared;
 623   *shared_per_thread_ptr = shared_per_thread;
 624   *threads_ptr = threads;
 625 }
 626
 627 static void
 628 dl_init_cacheinfo (struct cpu_features *cpu_features)
 629 {
 630   /* Find out what brand of processor.  */
 631   long int data = -1;
 632   long int shared = -1;
 633   long int shared_per_thread = -1;
 634   long int core = -1;
 635   unsigned int threads = 0;
 636   unsigned long int level1_icache_size = -1;
 637   unsigned long int level1_icache_linesize = -1;
 638   unsigned long int level1_dcache_size = -1;
 639   unsigned long int level1_dcache_assoc = -1;
 640   unsigned long int level1_dcache_linesize = -1;
 641   unsigned long int level2_cache_size = -1;
 642   unsigned long int level2_cache_assoc = -1;
 643   unsigned long int level2_cache_linesize = -1;
 644   unsigned long int level3_cache_size = -1;
 645   unsigned long int level3_cache_assoc = -1;
 646   unsigned long int level3_cache_linesize = -1;
 647   unsigned long int level4_cache_size = -1;
 648
 649   if (cpu_features->basic.kind == arch_kind_intel)
 650     {
 651       data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
 652       core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
 653       shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
 654       shared_per_thread = shared;
 655
 656       level1_icache_size
 657         = handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
 658       level1_icache_linesize
 659         = handle_intel (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
 660       level1_dcache_size = data;
 661       level1_dcache_assoc
 662         = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
 663       level1_dcache_linesize
 664         = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
 665       level2_cache_size = core;
 666       level2_cache_assoc
 667         = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
 668       level2_cache_linesize
 669         = handle_intel (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
 670       level3_cache_size = shared;
 671       level3_cache_assoc
 672         = handle_intel (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
 673       level3_cache_linesize
 674         = handle_intel (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);
 675       level4_cache_size
 676         = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
 677
 678       get_common_cache_info (&shared, &shared_per_thread, &threads, core);
 679     }
 680   else if (cpu_features->basic.kind == arch_kind_zhaoxin)
 681     {
 682       data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
 683       core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
 684       shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
 685       shared_per_thread = shared;
 686
 687       level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
 688       level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
 689       level1_dcache_size = data;
 690       level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
 691       level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
 692       level2_cache_size = core;
 693       level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
 694       level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
 695       level3_cache_size = shared;
 696       level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
 697       level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
 698
 699       get_common_cache_info (&shared, &shared_per_thread, &threads, core);
 700     }
 701   else if (cpu_features->basic.kind == arch_kind_amd)
 702     {
 703       data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
 704       core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
 705       shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
 706       shared_per_thread = shared;
 707
 708       level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
 709       level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
 710       level1_dcache_size = data;
 711       level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
 712       level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
 713       level2_cache_size = core;
 714       level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
 715       level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
 716       level3_cache_size = shared;
 717       level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC);
 718       level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);
 719
 720       if (shared <= 0)
 721         /* No shared L3 cache.  All we have is the L2 cache.  */
 722         shared = core;
 723
 724       if (shared_per_thread <= 0)
 725         shared_per_thread = shared;
 726     }
 727
 728   cpu_features->level1_icache_size = level1_icache_size;
 729   cpu_features->level1_icache_linesize = level1_icache_linesize;
 730   cpu_features->level1_dcache_size = level1_dcache_size;
 731   cpu_features->level1_dcache_assoc = level1_dcache_assoc;
 732   cpu_features->level1_dcache_linesize = level1_dcache_linesize;
 733   cpu_features->level2_cache_size = level2_cache_size;
 734   cpu_features->level2_cache_assoc = level2_cache_assoc;
 735   cpu_features->level2_cache_linesize = level2_cache_linesize;
 736   cpu_features->level3_cache_size = level3_cache_size;
 737   cpu_features->level3_cache_assoc = level3_cache_assoc;
 738   cpu_features->level3_cache_linesize = level3_cache_linesize;
 739   cpu_features->level4_cache_size = level4_cache_size;
 740
 741   unsigned long int cachesize_non_temporal_divisor
 742       = cpu_features->cachesize_non_temporal_divisor;
 743   if (cachesize_non_temporal_divisor <= 0)
 744     cachesize_non_temporal_divisor = 4;
 745
 746   /* The default setting for the non_temporal threshold is [1/8, 1/2] of size
 747      of the chip's cache (depending on `cachesize_non_temporal_divisor` which
 748      is microarch specific. The default is 1/4). For most Intel and AMD
 749      processors with an initial release date between 2017 and 2023, a thread's
 750      typical share of the cache is from 18-64MB. Using a reasonable size
 751      fraction of L3 is meant to estimate the point where non-temporal stores
 752      begin out-competing REP MOVSB. As well the point where the fact that
 753      non-temporal stores are forced back to main memory would already occurred
 754      to the majority of the lines in the copy. Note, concerns about the entire
 755      L3 cache being evicted by the copy are mostly alleviated by the fact that
 756      modern HW detects streaming patterns and provides proper LRU hints so that
 757      the maximum thrashing capped at 1/associativity. */
 758   unsigned long int non_temporal_threshold
 759       = shared / cachesize_non_temporal_divisor;
 760   /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
 761      a higher risk of actually thrashing the cache as they don't have a HW LRU
 762      hint. As well, their performance in highly parallel situations is
 763      noticeably worse.  */
 764   if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 765     non_temporal_threshold = shared_per_thread * 3 / 4;
 766   /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
 767      'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
 768      if that operation cannot overflow. Minimum of 0x4040 (16448) because the
 769      L(large_memset_4x) loops need 64-byte to cache align and enough space for
 770      at least 1 iteration of 4x PAGE_SIZE unrolled loop.  Both values are
 771      reflected in the manual.  */
 772   unsigned long int maximum_non_temporal_threshold = SIZE_MAX >> 4;
 773   unsigned long int minimum_non_temporal_threshold = 0x4040;
 774
 775   /* If `non_temporal_threshold` less than `minimum_non_temporal_threshold`
 776      it most likely means we failed to detect the cache info. We don't want
 777      to default to `minimum_non_temporal_threshold` as such a small value,
 778      while correct, has bad performance. We default to 64MB as reasonable
 779      default bound. 64MB is likely conservative in that most/all systems would
 780      choose a lower value so it should never forcing non-temporal stores when
 781      they otherwise wouldn't be used.  */
 782   if (non_temporal_threshold < minimum_non_temporal_threshold)
 783     non_temporal_threshold = 64 * 1024 * 1024;
 784   else if (non_temporal_threshold > maximum_non_temporal_threshold)
 785     non_temporal_threshold = maximum_non_temporal_threshold;
 786
 787   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
 788   unsigned int minimum_rep_movsb_threshold;
 789   /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
 790      VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
 791      threshold is 2048 * (VEC_SIZE / 16).  */
 792   unsigned int rep_movsb_threshold;
 793   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
 794       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
 795     {
 796       rep_movsb_threshold = 4096 * (64 / 16);
 797       minimum_rep_movsb_threshold = 64 * 8;
 798     }
 799   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
 800                                     AVX_Fast_Unaligned_Load))
 801     {
 802       rep_movsb_threshold = 4096 * (32 / 16);
 803       minimum_rep_movsb_threshold = 32 * 8;
 804     }
 805   else
 806     {
 807       rep_movsb_threshold = 2048 * (16 / 16);
 808       minimum_rep_movsb_threshold = 16 * 8;
 809     }
 810   /* NB: The default REP MOVSB threshold is 2112 on processors with fast
 811      short REP MOVSB (FSRM).  */
 812   if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
 813     rep_movsb_threshold = 2112;
 814
 815   /* The default threshold to use Enhanced REP STOSB.  */
 816   unsigned long int rep_stosb_threshold = 2048;
 817
 818   long int tunable_size;
 819
 820   tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
 821   /* NB: Ignore the default value 0.  */
 822   if (tunable_size != 0)
 823     data = tunable_size;
 824
 825   tunable_size = TUNABLE_GET (x86_shared_cache_size, long int, NULL);
 826   /* NB: Ignore the default value 0.  */
 827   if (tunable_size != 0)
 828     shared = tunable_size;
 829
 830   tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
 831   if (tunable_size > minimum_non_temporal_threshold
 832       && tunable_size <= maximum_non_temporal_threshold)
 833     non_temporal_threshold = tunable_size;
 834
 835   tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
 836   if (tunable_size > minimum_rep_movsb_threshold)
 837     rep_movsb_threshold = tunable_size;
 838
 839   /* NB: The default value of the x86_rep_stosb_threshold tunable is the
 840      same as the default value of __x86_rep_stosb_threshold and the
 841      minimum value is fixed.  */
 842   rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
 843                                      long int, NULL);
 844
 845   TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
 846   TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
 847   TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
 848                            minimum_non_temporal_threshold,
 849                            maximum_non_temporal_threshold);
 850   TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
 851                            minimum_rep_movsb_threshold, SIZE_MAX);
 852   TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
 853                            SIZE_MAX);
 854
 855   unsigned long int rep_movsb_stop_threshold;
 856   /* ERMS feature is implemented from AMD Zen3 architecture and it is
 857      performing poorly for data above L2 cache size. Henceforth, adding
 858      an upper bound threshold parameter to limit the usage of Enhanced
 859      REP MOVSB operations and setting its value to L2 cache size.  */
 860   if (cpu_features->basic.kind == arch_kind_amd)
 861     rep_movsb_stop_threshold = core;
 862   /* Setting the upper bound of ERMS to the computed value of
 863      non-temporal threshold for architectures other than AMD.  */
 864   else
 865     rep_movsb_stop_threshold = non_temporal_threshold;
 866
 867   cpu_features->data_cache_size = data;
 868   cpu_features->shared_cache_size = shared;
 869   cpu_features->non_temporal_threshold = non_temporal_threshold;
 870   cpu_features->rep_movsb_threshold = rep_movsb_threshold;
 871   cpu_features->rep_stosb_threshold = rep_stosb_threshold;
 872   cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
 873 }