/* Copyright (C) 2003-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <unistd.h>
#include <cpuid.h>
#include <init-arch.h>
/* Convenience predicates over the CPU vendor/feature data the dynamic
   loader recorded at startup in GLRO(dl_x86_cpu_features).  */
#define is_intel GLRO(dl_x86_cpu_features).kind == arch_kind_intel
#define is_amd GLRO(dl_x86_cpu_features).kind == arch_kind_amd
#define max_cpuid GLRO(dl_x86_cpu_features).max_cpuid
/* Decode table for Intel CPUID leaf 2 cache descriptor bytes.  Each
   entry gives the descriptor byte (IDX), the associativity, the line
   size in bytes, which _SC_* parameter family it describes (stored
   relative to _SC_LEVEL1_ICACHE_SIZE so it fits in a byte) and the
   total cache size in bytes.  The table MUST remain sorted by IDX:
   intel_check_word looks entries up with bsearch.  */
static const struct intel_02_cache_info
{
  unsigned char idx;
  unsigned char assoc;
  unsigned char linesize;
  unsigned char rel_name;
  unsigned int size;
} intel_02_known [] =
  {
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),     8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),     8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),    24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),    524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),    32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),    32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),    196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),    393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),   3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),   4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),   6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),     8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),    524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  25165824 },
  };

#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))
113 intel_02_known_compare (const void *p1
, const void *p2
)
115 const struct intel_02_cache_info
*i1
;
116 const struct intel_02_cache_info
*i2
;
118 i1
= (const struct intel_02_cache_info
*) p1
;
119 i2
= (const struct intel_02_cache_info
*) p2
;
121 if (i1
->idx
== i2
->idx
)
124 return i1
->idx
< i2
->idx
? -1 : 1;
129 __attribute__ ((noinline
))
130 intel_check_word (int name
, unsigned int value
, bool *has_level_2
,
131 bool *no_level_2_or_3
)
133 if ((value
& 0x80000000) != 0)
134 /* The register value is reserved. */
137 /* Fold the name. The _SC_ constants are always in the order SIZE,
139 int folded_rel_name
= (M(name
) / 3) * 3;
143 unsigned int byte
= value
& 0xff;
147 *no_level_2_or_3
= true;
149 if (folded_rel_name
== M(_SC_LEVEL3_CACHE_SIZE
))
150 /* No need to look further. */
153 else if (byte
== 0xff)
155 /* CPUID leaf 0x4 contains all the information. We need to
162 unsigned int round
= 0;
165 __cpuid_count (4, round
, eax
, ebx
, ecx
, edx
);
167 enum { null
= 0, data
= 1, inst
= 2, uni
= 3 } type
= eax
& 0x1f;
169 /* That was the end. */
172 unsigned int level
= (eax
>> 5) & 0x7;
174 if ((level
== 1 && type
== data
175 && folded_rel_name
== M(_SC_LEVEL1_DCACHE_SIZE
))
176 || (level
== 1 && type
== inst
177 && folded_rel_name
== M(_SC_LEVEL1_ICACHE_SIZE
))
178 || (level
== 2 && folded_rel_name
== M(_SC_LEVEL2_CACHE_SIZE
))
179 || (level
== 3 && folded_rel_name
== M(_SC_LEVEL3_CACHE_SIZE
))
180 || (level
== 4 && folded_rel_name
== M(_SC_LEVEL4_CACHE_SIZE
)))
182 unsigned int offset
= M(name
) - folded_rel_name
;
186 return (((ebx
>> 22) + 1)
187 * (((ebx
>> 12) & 0x3ff) + 1)
188 * ((ebx
& 0xfff) + 1)
191 return (ebx
>> 22) + 1;
193 assert (offset
== 2);
194 return (ebx
& 0xfff) + 1;
199 /* There is no other cache information anywhere else. */
204 if (byte
== 0x49 && folded_rel_name
== M(_SC_LEVEL3_CACHE_SIZE
))
206 /* Intel reused this value. For family 15, model 6 it
207 specifies the 3rd level cache. Otherwise the 2nd
209 unsigned int family
= GLRO(dl_x86_cpu_features
).family
;
210 unsigned int model
= GLRO(dl_x86_cpu_features
).model
;
212 if (family
== 15 && model
== 6)
214 /* The level 3 cache is encoded for this model like
215 the level 2 cache is for other models. Pretend
216 the caller asked for the level 2 cache. */
217 name
= (_SC_LEVEL2_CACHE_SIZE
218 + (name
- _SC_LEVEL3_CACHE_SIZE
));
219 folded_rel_name
= M(_SC_LEVEL2_CACHE_SIZE
);
223 struct intel_02_cache_info
*found
;
224 struct intel_02_cache_info search
;
227 found
= bsearch (&search
, intel_02_known
, nintel_02_known
,
228 sizeof (intel_02_known
[0]), intel_02_known_compare
);
231 if (found
->rel_name
== folded_rel_name
)
233 unsigned int offset
= M(name
) - folded_rel_name
;
241 assert (offset
== 2);
242 return found
->linesize
;
245 if (found
->rel_name
== M(_SC_LEVEL2_CACHE_SIZE
))
250 /* Next byte for the next round. */
/* Answer the sysconf cache query NAME on an Intel CPU whose maximum
   CPUID leaf is MAXIDX, by iterating the CPUID leaf 2 descriptor
   registers.  Returns the requested value, 0 if unknown, or -1 for
   CPUs too old to report it (MAXIDX < 2, or an explicit "no L2/L3"
   descriptor for an L2/L3 query).  */
static long int __attribute__ ((noinline))
handle_intel (int name, unsigned int maxidx)
{
  /* Return -1 for older CPUs.  */
  if (maxidx < 2)
    return -1;

  /* OK, we can use the CPUID instruction to get all info about the
     caches.  */
  unsigned int cnt = 0;
  unsigned int max = 1;
  long int result = 0;
  bool no_level_2_or_3 = false;
  bool has_level_2 = false;

  while (cnt++ < max)
    {
      unsigned int eax;
      unsigned int ebx;
      unsigned int ecx;
      unsigned int edx;
      __cpuid (2, eax, ebx, ecx, edx);

      /* The low byte of EAX in the first round contain the number of
	 rounds we have to make.  At least one, the one we are already
	 in.  */
      if (cnt == 1)
	{
	  max = eax & 0xff;
	  eax &= 0xffffff00;
	}

      /* Process the individual registers' value.  */
      result = intel_check_word (name, eax, &has_level_2, &no_level_2_or_3);
      if (result != 0)
	return result;

      result = intel_check_word (name, ebx, &has_level_2, &no_level_2_or_3);
      if (result != 0)
	return result;

      result = intel_check_word (name, ecx, &has_level_2, &no_level_2_or_3);
      if (result != 0)
	return result;

      result = intel_check_word (name, edx, &has_level_2, &no_level_2_or_3);
      if (result != 0)
	return result;
    }

  if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
      && no_level_2_or_3)
    return -1;

  return 0;
}
/* Answer the sysconf cache query NAME on an AMD CPU using the extended
   CPUID leaves 0x80000005 (L1) and 0x80000006 (L2/L3).  Returns the
   requested value, or 0 when the CPU does not report it.  */
static long int __attribute__ ((noinline))
handle_amd (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  __cpuid (0x80000000, eax, ebx, ecx, edx);

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

  unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
  if (eax < fn)
    return 0;

  __cpuid (fn, eax, ebx, ecx, edx);

  if (name < _SC_LEVEL1_DCACHE_SIZE)
    {
      /* The L1 instruction cache is reported in EDX with the same
	 layout the data cache uses in ECX; remap the query.  */
      name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
      ecx = edx;
    }

  switch (name)
    {
    case _SC_LEVEL1_DCACHE_SIZE:
      return (ecx >> 14) & 0x3fc00;

    case _SC_LEVEL1_DCACHE_ASSOC:
      ecx >>= 16;
      if ((ecx & 0xff) == 0xff)
	/* Fully associative.  */
	return (ecx << 2) & 0x3fc00;
      return ecx & 0xff;

    case _SC_LEVEL1_DCACHE_LINESIZE:
      return ecx & 0xff;

    case _SC_LEVEL2_CACHE_SIZE:
      return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;

    case _SC_LEVEL2_CACHE_ASSOC:
      /* Encoded associativity field; 15 means fully associative.  */
      switch ((ecx >> 12) & 0xf)
	{
	case 0:
	case 1:
	case 2:
	case 4:
	  return (ecx >> 12) & 0xf;
	case 6:
	  return 8;
	case 8:
	  return 16;
	case 10:
	  return 32;
	case 11:
	  return 48;
	case 12:
	  return 64;
	case 13:
	  return 96;
	case 14:
	  return 128;
	case 15:
	  /* Fully associative.  */
	  return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
	default:
	  return 0;
	}
      /* NOTREACHED */

    case _SC_LEVEL2_CACHE_LINESIZE:
      return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;

    case _SC_LEVEL3_CACHE_SIZE:
      return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1;

    case _SC_LEVEL3_CACHE_ASSOC:
      switch ((edx >> 12) & 0xf)
	{
	case 0:
	case 1:
	case 2:
	case 4:
	  return (edx >> 12) & 0xf;
	case 6:
	  return 8;
	case 8:
	  return 16;
	case 10:
	  return 32;
	case 11:
	  return 48;
	case 12:
	  return 64;
	case 13:
	  return 96;
	case 14:
	  return 128;
	case 15:
	  /* Fully associative.  */
	  return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
	default:
	  return 0;
	}
      /* NOTREACHED */

    case _SC_LEVEL3_CACHE_LINESIZE:
      return (edx & 0xf000) == 0 ? 0 : edx & 0xff;

    default:
      assert (! "cannot happen");
    }
  return -1;
}
434 /* Get the value of the system variable NAME. */
437 __cache_sysconf (int name
)
440 return handle_intel (name
, max_cpuid
);
443 return handle_amd (name
);
445 // XXX Fill in more vendors.
447 /* CPU not known, we have no information. */
452 /* Data cache size for use in memory and string routines, typically
453 L1 size, rounded to multiple of 256 bytes. */
454 long int __x86_data_cache_size_half attribute_hidden
= 32 * 1024 / 2;
455 long int __x86_data_cache_size attribute_hidden
= 32 * 1024;
456 /* Similar to __x86_data_cache_size_half, but not rounded. */
457 long int __x86_raw_data_cache_size_half attribute_hidden
= 32 * 1024 / 2;
458 /* Similar to __x86_data_cache_size, but not rounded. */
459 long int __x86_raw_data_cache_size attribute_hidden
= 32 * 1024;
460 /* Shared cache size for use in memory and string routines, typically
461 L2 or L3 size, rounded to multiple of 256 bytes. */
462 long int __x86_shared_cache_size_half attribute_hidden
= 1024 * 1024 / 2;
463 long int __x86_shared_cache_size attribute_hidden
= 1024 * 1024;
464 /* Similar to __x86_shared_cache_size_half, but not rounded. */
465 long int __x86_raw_shared_cache_size_half attribute_hidden
= 1024 * 1024 / 2;
466 /* Similar to __x86_shared_cache_size, but not rounded. */
467 long int __x86_raw_shared_cache_size attribute_hidden
= 1024 * 1024;
469 /* Threshold to use non temporal store. */
470 long int __x86_shared_non_temporal_threshold attribute_hidden
;
472 #ifndef DISABLE_PREFETCHW
473 /* PREFETCHW support flag for use in memory and string routines. */
474 int __x86_prefetchw attribute_hidden
;
479 __attribute__((constructor
))
480 init_cacheinfo (void)
482 /* Find out what brand of processor. */
489 long int shared
= -1;
491 unsigned int threads
= 0;
495 data
= handle_intel (_SC_LEVEL1_DCACHE_SIZE
, max_cpuid
);
497 long int core
= handle_intel (_SC_LEVEL2_CACHE_SIZE
, max_cpuid
);
498 bool inclusive_cache
= true;
502 shared
= handle_intel (_SC_LEVEL3_CACHE_SIZE
, max_cpuid
);
504 /* Number of logical processors sharing L2 cache. */
507 /* Number of logical processors sharing L3 cache. */
512 /* Try L2 otherwise. */
524 /* A value of 0 for the HTT bit indicates there is only a single
525 logical processor. */
526 if (HAS_CPU_FEATURE (HTT
))
528 /* Figure out the number of logical threads that share the
529 highest cache level. */
532 unsigned int family
= GLRO(dl_x86_cpu_features
).family
;
533 unsigned int model
= GLRO(dl_x86_cpu_features
).model
;
537 /* Query until cache level 2 and 3 are enumerated. */
538 int check
= 0x1 | (threads_l3
== 0) << 1;
541 __cpuid_count (4, i
++, eax
, ebx
, ecx
, edx
);
543 /* There seems to be a bug in at least some Pentium Ds
544 which sometimes fail to iterate all cache parameters.
545 Do not loop indefinitely here, stop in this case and
546 assume there is no such information. */
547 if ((eax
& 0x1f) == 0)
548 goto intel_bug_no_cache_info
;
550 switch ((eax
>> 5) & 0x7)
557 /* Get maximum number of logical processors
559 threads_l2
= (eax
>> 14) & 0x3ff;
564 if ((check
& (0x1 << 1)))
566 /* Get maximum number of logical processors
568 threads_l3
= (eax
>> 14) & 0x3ff;
570 /* Check if L2 and L3 caches are inclusive. */
571 inclusive_cache
= (edx
& 0x2) != 0;
572 check
&= ~(0x1 << 1);
579 /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
580 numbers of addressable IDs for logical processors sharing
581 the cache, instead of the maximum number of threads
582 sharing the cache. */
585 /* Find the number of logical processors shipped in
586 one core and apply count mask. */
589 /* Count SMT only if there is L3 cache. Always count
590 core if there is no L3 cache. */
591 int count
= ((threads_l2
> 0 && level
== 3)
593 || (threads_l2
> 0 && level
== 2)) << 1));
597 __cpuid_count (11, i
++, eax
, ebx
, ecx
, edx
);
599 int shipped
= ebx
& 0xff;
600 int type
= ecx
& 0xff00;
601 if (shipped
== 0 || type
== 0)
603 else if (type
== 0x100)
610 /* Compute count mask. */
612 : "=r" (count_mask
) : "g" (threads_l2
));
613 count_mask
= ~(-1 << (count_mask
+ 1));
614 threads_l2
= (shipped
- 1) & count_mask
;
618 else if (type
== 0x200)
621 if ((count
& (0x1 << 1)))
625 = (level
== 2 ? threads_l2
: threads_l3
);
627 /* Compute count mask. */
629 : "=r" (count_mask
) : "g" (threads_core
));
630 count_mask
= ~(-1 << (count_mask
+ 1));
631 threads_core
= (shipped
- 1) & count_mask
;
633 threads_l2
= threads_core
;
635 threads_l3
= threads_core
;
636 count
&= ~(0x1 << 1);
649 threads
= threads_l2
;
650 if (threads
> 2 && family
== 6)
658 /* Silvermont has L2 cache shared by 2 cores. */
667 threads
= threads_l3
;
671 intel_bug_no_cache_info
:
672 /* Assume that all logical threads share the highest cache
676 = ((GLRO(dl_x86_cpu_features
).cpuid
[COMMON_CPUID_INDEX_1
].ebx
680 /* Cap usage of highest cache level to the number of supported
682 if (shared
> 0 && threads
> 0)
686 /* Account for non-inclusive L2 and L3 caches. */
687 if (!inclusive_cache
)
694 /* This spells out "AuthenticAMD". */
697 data
= handle_amd (_SC_LEVEL1_DCACHE_SIZE
);
698 long int core
= handle_amd (_SC_LEVEL2_CACHE_SIZE
);
699 shared
= handle_amd (_SC_LEVEL3_CACHE_SIZE
);
701 /* Get maximum extended function. */
702 __cpuid (0x80000000, max_cpuid_ex
, ebx
, ecx
, edx
);
705 /* No shared L3 cache. All we have is the L2 cache. */
709 /* Figure out the number of logical threads that share L3. */
710 if (max_cpuid_ex
>= 0x80000008)
712 /* Get width of APIC ID. */
713 __cpuid (0x80000008, max_cpuid_ex
, ebx
, ecx
, edx
);
714 threads
= 1 << ((ecx
>> 12) & 0x0f);
719 /* If APIC ID width is not available, use logical
721 __cpuid (0x00000001, max_cpuid_ex
, ebx
, ecx
, edx
);
723 if ((edx
& (1 << 28)) != 0)
724 threads
= (ebx
>> 16) & 0xff;
727 /* Cap usage of highest cache level to the number of
728 supported threads. */
732 /* Account for exclusive L2 and L3 caches. */
736 #ifndef DISABLE_PREFETCHW
737 if (max_cpuid_ex
>= 0x80000001)
739 __cpuid (0x80000001, eax
, ebx
, ecx
, edx
);
740 /* PREFETCHW || 3DNow! */
741 if ((ecx
& 0x100) || (edx
& 0x80000000))
742 __x86_prefetchw
= -1;
749 __x86_raw_data_cache_size_half
= data
/ 2;
750 __x86_raw_data_cache_size
= data
;
751 /* Round data cache size to multiple of 256 bytes. */
753 __x86_data_cache_size_half
= data
/ 2;
754 __x86_data_cache_size
= data
;
759 __x86_raw_shared_cache_size_half
= shared
/ 2;
760 __x86_raw_shared_cache_size
= shared
;
761 /* Round shared cache size to multiple of 256 bytes. */
762 shared
= shared
& ~255L;
763 __x86_shared_cache_size_half
= shared
/ 2;
764 __x86_shared_cache_size
= shared
;
767 /* The large memcpy micro benchmark in glibc shows that 6 times of
768 shared cache size is the approximate value above which non-temporal
769 store becomes faster. */
770 __x86_shared_non_temporal_threshold
= __x86_shared_cache_size
* 6;