2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 * Implements gmx::CpuInfo.
40 * We need to be able to compile this file in stand-alone mode to use basic
41 * CPU feature detection to set the SIMD acceleration and similar things in
42 * CMake, while we still want to use more features that enable topology
43 * detection when config.h is present.
45 * We solve this by skipping the advanced stuff when the preprocessor
46 * macro GMX_CPUINFO_STANDALONE is defined. In this case you likely also need to
47 * define GMX_X86_GCC_INLINE_ASM if you are on x86; without inline assembly
48 * support it is not possible to perform the actual detection on Linux/Mac.
49 * Note that these macros keep the GMX_ prefix even though they are specific to this file.
51 * The remaining defines (GMX_NATIVE_WINDOWS,HAVE_UNISTD_H,HAVE_SCHED_H,
52 * HAVE_SYSCONF, HAVE_SCHED_AFFINITY) are only used to determine the topology on
53 * x86, and for this we rely on including config.h.
55 * \author Erik Lindahl <erik.lindahl@gmail.com>
56 * \ingroup module_hardware
59 #ifndef GMX_CPUINFO_STANDALONE
65 #ifndef GMX_CPUINFO_STANDALONE
68 # define GMX_NATIVE_WINDOWS 0
72 # include <intrin.h> // __cpuid()
75 #if GMX_NATIVE_WINDOWS
76 # include <windows.h> // sysinfo(), necessary for topology stuff
80 # include <sched.h> // sched_getaffinity(), sched_setaffinity()
83 # include <unistd.h> // sysconf()
96 #ifdef GMX_CPUINFO_STANDALONE
99 # include "gromacs/utility/basedefinitions.h"
102 #include "architecture.h"
110 /*! \cond internal */
112 /******************************************************************************
114 * Utility functions to make this file independent of the GROMACS library *
116 ******************************************************************************/
/*! \brief Remove initial and trailing whitespace from string
 *
 *  Whitespace is whatever std::isspace() classifies as such. A string that
 *  consists only of whitespace (or is already empty) ends up empty.
 *
 *  \param s  Pointer to string where whitespace will be removed
 */
void
trimString(std::string * s)
{
    // std::isspace() is only defined for values representable as unsigned
    // char (or EOF), so a plain char must be converted before the call;
    // otherwise non-ASCII bytes are undefined behavior where char is signed.
    const auto isNotSpace = [](char c) -> bool
        {
            return std::isspace(static_cast<unsigned char>(c)) == 0;
        };

    // leading
    s->erase(s->begin(), std::find_if(s->begin(), s->end(), isNotSpace));
    // trailing; base() converts the reverse iterator to the position just
    // after the last non-space character
    s->erase(std::find_if(s->rbegin(), s->rend(), isNotSpace).base(), s->end());
}
132 /******************************************************************************
134 * x86 detection functions *
136 ******************************************************************************/
138 /*! \brief execute x86 cpuid instructions with custom level and extended level
140 * \param level The main cpuid level (input argument for eax register)
141 * \param ecxval Extended level (input argument for ecx register)
142 * \param eax Output in eax register
143 * \param ebx Output in ebx register
144 * \param ecx Output in ecx register
145 * \param edx Output in edx register
147 * \return 0 on success, or non-zero if the instruction could not execute.
150 executeX86CpuID(unsigned int gmx_unused level
,
151 unsigned int gmx_unused ecxval
,
157 if (c_architecture
== Architecture::X86
)
159 #if defined __GNUC__ || GMX_X86_GCC_INLINE_ASM
161 // any compiler that understands gcc inline assembly
167 # if GMX_IS_X86_32 && defined(__PIC__)
168 // Avoid clobbering the global offset table in 32-bit pic code (ebx register)
169 __asm__
__volatile__ ("xchgl %%ebx, %1 \n\t"
171 "xchgl %%ebx, %1 \n\t"
172 : "+a" (*eax
), "+r" (*ebx
), "+c" (*ecx
), "+d" (*edx
));
174 // i386 without PIC, or x86-64. Things are easy and we can clobber any reg we want
175 __asm__
__volatile__ ("cpuid \n\t"
176 : "+a" (*eax
), "+b" (*ebx
), "+c" (*ecx
), "+d" (*edx
));
178 // Not a normal x86, which could happen when a compiler
179 // targeting non-x86 pretends to be GCC.
183 #elif defined _MSC_VER
185 // MSVC (and icc on windows) on ia32 or x86-64
187 __cpuidex(cpuInfo
, level
, ecxval
);
188 *eax
= static_cast<unsigned int>(cpuInfo
[0]);
189 *ebx
= static_cast<unsigned int>(cpuInfo
[1]);
190 *ecx
= static_cast<unsigned int>(cpuInfo
[2]);
191 *edx
= static_cast<unsigned int>(cpuInfo
[3]);
196 // We are on x86, but without compiler support for cpuid if we get here
203 #endif // check for inline asm on x86
217 /*! \brief Detect x86 vendors by using the cpuid assembly instructions
219 * If support for the cpuid instruction is present, we check for Intel
222 * \return gmx::CpuInfo::Vendor::Intel, gmx::CpuInfo::Vendor::Amd. If neither
223 * Intel nor Amd can be identified, or if the code fails to execute,
224 * gmx::CpuInfo::Vendor::Unknown is returned.
229 unsigned int eax
, ebx
, ecx
, edx
;
230 CpuInfo::Vendor v
= CpuInfo::Vendor::Unknown
;
232 if (executeX86CpuID(0x0, 0, &eax
, &ebx
, &ecx
, &edx
) == 0)
234 if (ebx
== 0x756e6547 && ecx
== 0x6c65746e && edx
== 0x49656e69)
236 v
= CpuInfo::Vendor::Intel
; // ebx=='uneG', ecx=='letn', edx=='Ieni'
238 else if (ebx
== 0x68747541 && ecx
== 0x444d4163 && edx
== 0x69746e65)
240 v
= CpuInfo::Vendor::Amd
; // ebx=='htuA', ecx=='DMAc', edx=='itne'
246 /*! \brief Simple utility function to set/clear feature in a set
248 * \param featureSet Pointer to the feature set to update
249 * \param feature The specific feature to set/clear
250 * \param registerValue Register value (returned from cpuid)
251 * \param bit Bit to check in registerValue. The feature will be
252 * added to the featureSet if this bit is set.
254 * \note Nothing is done if the bit is not set. In particular, this will not
255 * erase anything if the feature already exists in the set.
258 setFeatureFromBit(std::set
<CpuInfo::Feature
> * featureSet
,
259 CpuInfo::Feature feature
,
260 unsigned int registerValue
,
263 if (registerValue
& (1 << bit
))
265 featureSet
->insert(feature
);
269 /*! \brief Process x86 cpuinfo features that are common to Intel and AMD CPUs
271 * \param[out] brand String where to write the x86 brand string
272 * \param[out] family Major version of processor
273 * \param[out] model Middle version of processor
274 * \param[out] stepping Minor version of processor
275 * \param[out] features Feature set where supported features are inserted
278 detectX86Features(std::string
* brand
,
282 std::set
<CpuInfo::Feature
> * features
)
284 unsigned int eax
, ebx
, ecx
, edx
;
286 // Return if we cannot execute any levels
287 if (executeX86CpuID(0x0, 0, &eax
, &ebx
, &ecx
, &edx
) != 0)
291 unsigned int maxStdLevel
= eax
;
293 if (maxStdLevel
>= 0x1)
295 executeX86CpuID(0x1, 0, &eax
, &ebx
, &ecx
, &edx
);
297 *family
= ((eax
& 0x0ff00000) >> 20) + ((eax
& 0x00000f00) >> 8);
298 *model
= ((eax
& 0x000f0000) >> 12) + ((eax
& 0x000000f0) >> 4);
299 *stepping
= (eax
& 0x0000000f);
301 setFeatureFromBit(features
, CpuInfo::Feature::X86_Sse3
, ecx
, 0 );
302 setFeatureFromBit(features
, CpuInfo::Feature::X86_Pclmuldq
, ecx
, 1 );
303 setFeatureFromBit(features
, CpuInfo::Feature::X86_Ssse3
, ecx
, 9 );
304 setFeatureFromBit(features
, CpuInfo::Feature::X86_Fma
, ecx
, 12 );
305 setFeatureFromBit(features
, CpuInfo::Feature::X86_Cx16
, ecx
, 13 );
306 setFeatureFromBit(features
, CpuInfo::Feature::X86_Pdcm
, ecx
, 15 );
307 setFeatureFromBit(features
, CpuInfo::Feature::X86_Pcid
, ecx
, 17 );
308 setFeatureFromBit(features
, CpuInfo::Feature::X86_Sse4_1
, ecx
, 19 );
309 setFeatureFromBit(features
, CpuInfo::Feature::X86_Sse4_2
, ecx
, 20 );
310 setFeatureFromBit(features
, CpuInfo::Feature::X86_X2Apic
, ecx
, 21 );
311 setFeatureFromBit(features
, CpuInfo::Feature::X86_Popcnt
, ecx
, 23 );
312 setFeatureFromBit(features
, CpuInfo::Feature::X86_Tdt
, ecx
, 24 );
313 setFeatureFromBit(features
, CpuInfo::Feature::X86_Aes
, ecx
, 25 );
314 setFeatureFromBit(features
, CpuInfo::Feature::X86_Avx
, ecx
, 28 );
315 setFeatureFromBit(features
, CpuInfo::Feature::X86_F16C
, ecx
, 29 );
316 setFeatureFromBit(features
, CpuInfo::Feature::X86_Rdrnd
, ecx
, 30 );
318 setFeatureFromBit(features
, CpuInfo::Feature::X86_Pse
, edx
, 3 );
319 setFeatureFromBit(features
, CpuInfo::Feature::X86_Msr
, edx
, 5 );
320 setFeatureFromBit(features
, CpuInfo::Feature::X86_Cx8
, edx
, 8 );
321 setFeatureFromBit(features
, CpuInfo::Feature::X86_Apic
, edx
, 9 );
322 setFeatureFromBit(features
, CpuInfo::Feature::X86_Cmov
, edx
, 15 );
323 setFeatureFromBit(features
, CpuInfo::Feature::X86_Clfsh
, edx
, 19 );
324 setFeatureFromBit(features
, CpuInfo::Feature::X86_Mmx
, edx
, 23 );
325 setFeatureFromBit(features
, CpuInfo::Feature::X86_Sse2
, edx
, 26 );
326 setFeatureFromBit(features
, CpuInfo::Feature::X86_Htt
, edx
, 28 );
329 if (maxStdLevel
>= 0x7)
331 executeX86CpuID(0x7, 0, &eax
, &ebx
, &ecx
, &edx
);
333 setFeatureFromBit(features
, CpuInfo::Feature::X86_Hle
, ebx
, 4 );
334 setFeatureFromBit(features
, CpuInfo::Feature::X86_Avx2
, ebx
, 5 );
335 setFeatureFromBit(features
, CpuInfo::Feature::X86_Rtm
, ebx
, 11 );
336 setFeatureFromBit(features
, CpuInfo::Feature::X86_Avx512F
, ebx
, 16 );
337 setFeatureFromBit(features
, CpuInfo::Feature::X86_Avx512PF
, ebx
, 26 );
338 setFeatureFromBit(features
, CpuInfo::Feature::X86_Avx512ER
, ebx
, 27 );
339 setFeatureFromBit(features
, CpuInfo::Feature::X86_Avx512CD
, ebx
, 28 );
340 setFeatureFromBit(features
, CpuInfo::Feature::X86_Sha
, ebx
, 29 );
341 setFeatureFromBit(features
, CpuInfo::Feature::X86_Avx512BW
, ebx
, 30 );
342 setFeatureFromBit(features
, CpuInfo::Feature::X86_Avx512VL
, ebx
, 31 );
345 // Check whether Hyper-threading is really possible to enable in the hardware,
346 // not just technically supported by this generation of processors
347 if (features
->count(CpuInfo::Feature::X86_Htt
) && maxStdLevel
>= 0x4)
349 executeX86CpuID(0x1, 0, &eax
, &ebx
, &ecx
, &edx
);
350 unsigned int maxLogicalCores
= (ebx
>> 16) & 0x0ff;
351 executeX86CpuID(0x4, 0, &eax
, &ebx
, &ecx
, &edx
);
352 unsigned int maxPhysicalCores
= ((eax
>> 26) & 0x3f) + 1;
353 if (maxLogicalCores
/maxPhysicalCores
< 2)
355 features
->erase(CpuInfo::Feature::X86_Htt
);
359 if (executeX86CpuID(0x80000000, 0, &eax
, &ebx
, &ecx
, &edx
) != 0)
361 // No point in continuing if we don't support any extended levels
364 unsigned int maxExtLevel
= eax
;
366 if (maxExtLevel
>= 0x80000001)
368 executeX86CpuID(0x80000001, 0, &eax
, &ebx
, &ecx
, &edx
);
370 setFeatureFromBit(features
, CpuInfo::Feature::X86_Lahf
, ecx
, 0 );
371 setFeatureFromBit(features
, CpuInfo::Feature::X86_Sse4A
, ecx
, 6 );
372 setFeatureFromBit(features
, CpuInfo::Feature::X86_MisalignSse
, ecx
, 7 );
373 setFeatureFromBit(features
, CpuInfo::Feature::X86_Xop
, ecx
, 11 );
374 setFeatureFromBit(features
, CpuInfo::Feature::X86_Fma4
, ecx
, 16 );
375 setFeatureFromBit(features
, CpuInfo::Feature::X86_PDPE1GB
, edx
, 26 );
376 setFeatureFromBit(features
, CpuInfo::Feature::X86_Rdtscp
, edx
, 27 );
379 if (maxExtLevel
>= 0x80000005)
381 // Get the x86 CPU brand string (3 levels, 16 bytes in each)
383 for (unsigned int level
= 0x80000002; level
< 0x80000005; level
++)
385 executeX86CpuID(level
, 0, &eax
, &ebx
, &ecx
, &edx
);
386 // Add eax, ebx, ecx, edx contents as 4 chars each to the brand string
387 brand
->append(reinterpret_cast<const char *>(&eax
), sizeof(eax
));
388 brand
->append(reinterpret_cast<const char *>(&ebx
), sizeof(ebx
));
389 brand
->append(reinterpret_cast<const char *>(&ecx
), sizeof(ecx
));
390 brand
->append(reinterpret_cast<const char *>(&edx
), sizeof(edx
));
395 if (maxExtLevel
>= 0x80000007)
397 executeX86CpuID(0x80000007, 0, &eax
, &ebx
, &ecx
, &edx
);
399 setFeatureFromBit(features
, CpuInfo::Feature::X86_NonstopTsc
, edx
, 8 );
404 /*! \brief Return a vector with x86 APIC IDs for all threads
406 * \param haveX2Apic True if the processor supports x2APIC, otherwise vanilla APIC.
408 * \returns A new std::vector of unsigned integer APIC IDs, one for each
409 * logical processor in the system.
411 const std::vector
<unsigned int>
412 detectX86ApicIDs(bool gmx_unused haveX2Apic
)
414 std::vector
<unsigned int> apicID
;
416 // We cannot just ask for all APIC IDs, but must force execution on each
417 // hardware thread and extract the APIC id there.
418 #if HAVE_SCHED_AFFINITY && defined HAVE_SYSCONF
419 unsigned int eax
, ebx
, ecx
, edx
;
420 unsigned int nApic
= sysconf(_SC_NPROCESSORS_ONLN
);
421 cpu_set_t saveCpuSet
;
423 sched_getaffinity(0, sizeof(cpu_set_t
), &saveCpuSet
);
425 for (unsigned int i
= 0; i
< nApic
; i
++)
428 sched_setaffinity(0, sizeof(cpu_set_t
), &cpuSet
);
431 executeX86CpuID(0xb, 0, &eax
, &ebx
, &ecx
, &edx
);
432 apicID
.push_back(edx
);
436 executeX86CpuID(0x1, 0, &eax
, &ebx
, &ecx
, &edx
);
437 apicID
.push_back(ebx
>> 24);
441 sched_setaffinity(0, sizeof(cpu_set_t
), &saveCpuSet
);
442 #elif GMX_NATIVE_WINDOWS
443 unsigned int eax
, ebx
, ecx
, edx
;
445 GetSystemInfo( &sysinfo
);
446 unsigned int nApic
= sysinfo
.dwNumberOfProcessors
;
447 unsigned int saveAffinity
= SetThreadAffinityMask(GetCurrentThread(), 1);
448 for (DWORD_PTR i
= 0; i
< nApic
; i
++)
450 SetThreadAffinityMask(GetCurrentThread(), (((DWORD_PTR
)1)<<i
));
454 executeX86CpuID(0xb, 0, &eax
, &ebx
, &ecx
, &edx
);
455 apicID
.push_back(edx
);
459 executeX86CpuID(0x1, 0, &eax
, &ebx
, &ecx
, &edx
);
460 apicID
.push_back(ebx
>> 24);
463 SetThreadAffinityMask(GetCurrentThread(), saveAffinity
);
/*! \brief Utility to renumber indices extracted from APIC IDs
 *
 *  \param v  Vector with unsigned integer indices, modified in place
 *
 *  Every element is replaced by the rank of its value among the distinct
 *  values in the vector, so the result uses the contiguous range starting
 *  from 0 while preserving the relative order of the values. For example,
 *  the vector {0,1,2,8,9,10,8,9,10,0,1,2} will be rewritten to
 *  {0,1,2,3,4,5,3,4,5,0,1,2}. Nothing is returned; the number of unique
 *  elements is one more than the largest element after the call (for a
 *  non-empty vector). An empty vector is left untouched.
 */
void
renumberIndex(std::vector<unsigned int> * v)
{
    // Sorted list of the distinct values; the position of a value in this
    // list is the new index assigned to it.
    std::vector<unsigned int> uniqueSortedV(*v);
    std::sort(uniqueSortedV.begin(), uniqueSortedV.end());
    uniqueSortedV.erase(std::unique(uniqueSortedV.begin(), uniqueSortedV.end()),
                        uniqueSortedV.end());

    // One binary search per element instead of one full pass over the
    // vector per distinct value.
    for (unsigned int &element : *v)
    {
        const auto position = std::lower_bound(uniqueSortedV.begin(), uniqueSortedV.end(), element);
        element = static_cast<unsigned int>(std::distance(uniqueSortedV.begin(), position));
    }
}
/*! \brief Widths of the bit fields that make up an APIC ID
 *
 *  The hardware-thread field sits in the least significant bits and the core
 *  field follows directly above it.
 */
struct ApicIdLayout
{
    unsigned int hwThreadBits; //!< Number of least significant bits used for hw-threads
    unsigned int coreBits;     //!< Number of bits used for cores, directly above the hw-thread bits
};
502 /*! \brief Detect the APIC ID layout for x2APIC
505 detectX2ApicIdLayout()
513 executeX86CpuID(0xb, 0, &eax
, &ebx
, &ecx
, &edx
);
514 layout
.hwThreadBits
= eax
& 0x1f;
515 executeX86CpuID(0xb, 1, &eax
, &ebx
, &ecx
, &edx
);
516 layout
.coreBits
= (eax
& 0x1f) - layout
.hwThreadBits
;
521 /*! \brief Detect the APIC ID layout for standard APIC or xAPIC on AMD
523 * \param[in] maxExtLevel The largest CPUID extended function input value supported by the processor implementation
526 detectAmdApicIdLayout(unsigned int maxExtLevel
)
534 executeX86CpuID(0x1, 0, &eax
, &ebx
, &ecx
, &edx
);
535 int family
= ((eax
& 0x0ff00000) >> 20) + ((eax
& 0x00000f00) >> 8);
536 executeX86CpuID(0x80000001, 0, &eax
, &ebx
, &ecx
, &edx
);
537 bool haveExtendedTopology
= (ecx
& (1 << 22));
539 // NOTE: Here we assume 1 thread per core, unless we have family >= 17h
540 layout
.hwThreadBits
= 0;
541 if (family
>= 0x17 &&
542 haveExtendedTopology
&&
543 maxExtLevel
>= 0x8000001e)
545 executeX86CpuID(0x8000001e, 1, &eax
, &ebx
, &ecx
, &edx
);
546 int numThreadsPerCore
= ((ebx
>> 8) & 0xff) + 1;
547 // NOTE: The AMD documentation only specifies the layout of apicid
548 // when we have 1 or 2 threads per core.
549 while (numThreadsPerCore
> (1 << layout
.hwThreadBits
))
551 layout
.hwThreadBits
++;
555 // Get number of core bits in apic ID - try modern extended method first
556 executeX86CpuID(0x80000008, 0, &eax
, &ebx
, &ecx
, &edx
);
557 layout
.coreBits
= (ecx
>> 12) & 0xf;
558 if (layout
.coreBits
== 0)
560 // Legacy method for old single/dual core AMD CPUs
562 while (i
>> layout
.coreBits
)
571 /*! \brief Try to detect basic CPU topology information using x86 cpuid
573 * If x2APIC support is present, this is our first choice, otherwise we
574 * attempt to use old vanilla APIC.
576 * \return A new vector of entries with socket, core, hwthread information
577 * for each logical processor.
579 std::vector
<CpuInfo::LogicalProcessor
>
580 detectX86LogicalProcessors()
586 unsigned int maxStdLevel
;
587 unsigned int maxExtLevel
;
591 std::vector
<CpuInfo::LogicalProcessor
> logicalProcessors
;
593 // Find largest standard & extended level input values allowed
594 executeX86CpuID(0x0, 0, &eax
, &ebx
, &ecx
, &edx
);
596 executeX86CpuID(0x80000000, 0, &eax
, &ebx
, &ecx
, &edx
);
599 if (maxStdLevel
>= 0x1)
601 executeX86CpuID(0x1, 0, &eax
, &ebx
, &ecx
, &edx
);
602 haveX2Apic
= (ecx
& (1 << 21)) && maxStdLevel
>= 0xb;
603 haveApic
= (edx
& (1 << 9)) && maxExtLevel
>= 0x80000008;
611 if (haveX2Apic
|| haveApic
)
614 // Get bits for cores and hardware threads
617 layout
= detectX2ApicIdLayout();
621 if (detectX86Vendor() == CpuInfo::Vendor::Amd
)
623 layout
= detectAmdApicIdLayout(maxExtLevel
);
625 if (layout
.hwThreadBits
> 1)
627 // At the time of writing this code we do not know what
628 // to do with more than 2 threads, so return empty.
629 return logicalProcessors
;
634 // We do not know the APIC ID layout, return empty.
635 return logicalProcessors
;
639 std::vector
<unsigned int> apicID
= detectX86ApicIDs(haveX2Apic
);
643 // APIC IDs can be buggy, and it is always a mess. Typically more bits are
644 // reserved than needed, and the numbers might not increment by 1 even in
645 // a single socket or core. Extract, renumber, and check that things make sense.
646 unsigned int hwThreadMask
= (1 << layout
.hwThreadBits
) - 1;
647 unsigned int coreMask
= (1 << layout
.coreBits
) - 1;
648 std::vector
<unsigned int> hwThreadRanks
;
649 std::vector
<unsigned int> coreRanks
;
650 std::vector
<unsigned int> socketRanks
;
652 for (auto a
: apicID
)
654 hwThreadRanks
.push_back( static_cast<int>( a
& hwThreadMask
) );
655 coreRanks
.push_back( static_cast<int>( ( a
>> layout
.hwThreadBits
) & coreMask
) );
656 socketRanks
.push_back( static_cast<int>( a
>> ( layout
.coreBits
+ layout
.hwThreadBits
) ) );
659 renumberIndex(&hwThreadRanks
);
660 renumberIndex(&coreRanks
);
661 renumberIndex(&socketRanks
);
663 unsigned int hwThreadRankSize
= 1 + *std::max_element(hwThreadRanks
.begin(), hwThreadRanks
.end());
664 unsigned int coreRankSize
= 1 + *std::max_element(coreRanks
.begin(), coreRanks
.end());
665 unsigned int socketRankSize
= 1 + *std::max_element(socketRanks
.begin(), socketRanks
.end());
667 if (socketRankSize
* coreRankSize
* hwThreadRankSize
== apicID
.size() )
669 // Alright, everything looks consistent, so put it in the result
670 for (std::size_t i
= 0; i
< apicID
.size(); i
++)
672 // While the internal APIC IDs are always unsigned integers, we also cast to
673 // plain integers for the externally exposed vectors, since that will make
674 // it possible to use '-1' for invalid entries in the future.
675 logicalProcessors
.push_back( { int(socketRanks
[i
]), int(coreRanks
[i
]), int(hwThreadRanks
[i
]) } );
680 return logicalProcessors
; // Will only have contents if everything worked
684 /******************************************************************************
686 * Generic Linux detection by parsing /proc/cpuinfo *
688 ******************************************************************************/
690 /*! \brief Parse /proc/cpuinfo into a simple string map
692 * This routine will read the contents of /proc/cpuinfo, and for each
693 * line that is not empty we will assign the (trimmed) string to the left of
694 * the colon as a key, and the right-hand side as the value in the map.
695 * For multi-processor systems where lines are repeated the latter lines will
696 * overwrite the first occurrence.
698 * \return New map with the contents. If the file is not available, the returned map will be empty.
701 const std::map
<std::string
, std::string
>
704 std::ifstream
procCpuInfo("/proc/cpuinfo");
706 std::map
<std::string
, std::string
> cpuInfo
;
708 while (std::getline(procCpuInfo
, line
))
712 std::stringstream
iss(line
);
715 std::getline(iss
, key
, ':'); // part before colon
716 std::getline(iss
, val
); // part after colon
719 // put it in the map. This will overwrite previous processors, but we don't care.
727 /*! \brief Try to detect vendor from /proc/cpuinfo
729 * \param cpuInfo Map returned from parseProcCpuinfo()
731 * This routine tries to match a few common labels in /proc/cpuinfo to see if
732 * they begin with the name of a standard vendor. If the file cannot be read
733 * or if no match is found, we return gmx::CpuInfo::Vendor::Unknown.
736 detectProcCpuInfoVendor(const std::map
<std::string
, std::string
> &cpuInfo
)
738 const std::map
<std::string
, CpuInfo::Vendor
> testVendors
=
740 { "GenuineIntel", CpuInfo::Vendor::Intel
},
741 { "Intel", CpuInfo::Vendor::Intel
},
742 { "AuthenticAmd", CpuInfo::Vendor::Amd
},
743 { "AMD", CpuInfo::Vendor::Amd
},
744 { "ARM", CpuInfo::Vendor::Arm
},
745 { "AArch64", CpuInfo::Vendor::Arm
},
746 { "Fujitsu", CpuInfo::Vendor::Fujitsu
},
747 { "IBM", CpuInfo::Vendor::Ibm
},
748 { "POWER", CpuInfo::Vendor::Ibm
},
749 { "Oracle", CpuInfo::Vendor::Oracle
},
752 // For each label in /proc/cpuinfo, compare the value to the name in the
753 // testNames map above, and if it's a match return the vendor.
754 for (auto &l
: { "vendor_id", "vendor", "manufacture", "model", "processor", "cpu" })
756 if (cpuInfo
.count(l
))
758 // there was a line with this left-hand side in /proc/cpuinfo
759 const std::string
&s1
= cpuInfo
.at(l
);
761 for (auto &t
: testVendors
)
763 const std::string
&s2
= t
.first
;
765 // If the entire name we are testing (s2) matches the first part of
766 // the string after the colon in /proc/cpuinfo (s1) we found our vendor
767 if (std::equal(s2
.begin(), s2
.end(), s1
.begin(),
768 [](const char &x
, const char &y
) -> bool { return tolower(x
) == tolower(y
); }))
775 return CpuInfo::Vendor::Unknown
;
779 /*! \brief Detect IBM processor name and features from /proc/cpuinfo
781 * \param cpuInfo Map returned from parseProcCpuinfo()
782 * \param[out] brand String where to write the brand string
783 * \param[out] features Feature set where supported features are inserted
785 * This routine tries to match a few common labels in /proc/cpuinfo to see if
786 * we can find the processor name and features. It is likely fragile.
789 detectProcCpuInfoIbm(const std::map
<std::string
, std::string
> &cpuInfo
,
791 std::set
<CpuInfo::Feature
> * features
)
793 // Get brand string from 'cpu' label if present, otherwise 'Processor'
794 if (cpuInfo
.count("cpu"))
796 *brand
= cpuInfo
.at("cpu");
798 else if (cpuInfo
.count("Processor"))
800 *brand
= cpuInfo
.at("Processor");
803 if (brand
->find("A2") != std::string::npos
)
805 // If the processor identification contains "A2", this is BlueGene/Q with QPX
806 features
->insert(CpuInfo::Feature::Ibm_Qpx
);
809 for (auto &l
: { "model name", "model", "Processor", "cpu" })
811 if (cpuInfo
.count(l
))
813 std::string s1
= cpuInfo
.at(l
);
814 std::transform(s1
.begin(), s1
.end(), s1
.begin(), ::tolower
);
816 if (s1
.find("altivec") != std::string::npos
)
818 features
->insert(CpuInfo::Feature::Ibm_Vmx
);
819 // If this is a power6, we only have VMX. All later processors have VSX.
820 if (s1
.find("power6") == std::string::npos
)
822 features
->insert(CpuInfo::Feature::Ibm_Vsx
);
830 /*! \brief Detect ARM processor name and features from /proc/cpuinfo
832 * \param cpuInfo Map returned from parseProcCpuinfo()
833 * \param[out] brand String where to write the brand string
834 * \param[out] family Major version of processor
835 * \param[out] model Middle version of processor
836 * \param[out] stepping Minor version of processor
837 * \param[out] features Feature set where supported features are inserted
839 * This routine tries to match a few common labels in /proc/cpuinfo to see if
840 * we can find the processor name and features. It is likely fragile.
843 detectProcCpuInfoArm(const std::map
<std::string
, std::string
> &cpuInfo
,
848 std::set
<CpuInfo::Feature
> * features
)
850 if (cpuInfo
.count("Processor"))
852 *brand
= cpuInfo
.at("Processor");
854 else if (cpuInfo
.count("model name"))
856 *brand
= cpuInfo
.at("model name");
859 if (cpuInfo
.count("CPU architecture"))
861 *family
= std::strtol(cpuInfo
.at("CPU architecture").c_str(), nullptr, 10);
862 // For some 64-bit CPUs it appears to say 'AArch64' instead
863 if (*family
== 0 && cpuInfo
.at("CPU architecture").find("AArch64") != std::string::npos
)
865 *family
= 8; // fragile - no idea how a future ARMv9 will be represented in this case
868 if (cpuInfo
.count("CPU variant"))
870 *model
= std::strtol(cpuInfo
.at("CPU variant").c_str(), nullptr, 16);
872 if (cpuInfo
.count("CPU revision"))
874 *stepping
= std::strtol(cpuInfo
.at("CPU revision").c_str(), nullptr, 10);
877 if (cpuInfo
.count("Features"))
879 const std::string
&s
= cpuInfo
.at("Features");
880 if (s
.find("neon") != std::string::npos
)
882 features
->insert(CpuInfo::Feature::Arm_Neon
);
884 if (s
.find("asimd") != std::string::npos
)
886 // At least Jetson TX1 runs a 32-bit environment by default, although
887 // the kernel is 64-bits, and reports asimd feature flags. We cannot
888 // use Neon-asimd in this case, so make sure we are on a 64-bit platform.
889 if (sizeof(void *) == 8)
891 features
->insert(CpuInfo::Feature::Arm_NeonAsimd
);
898 /*! \brief Try to detect vendor, cpu and features from /proc/cpuinfo
900 * \param[out] vendor Detected hardware vendor
901 * \param[out] brand String where to write the brand string
902 * \param[out] family Major version of processor
903 * \param[out] model Middle version of processor
904 * \param[out] stepping Minor version of processor
905 * \param[out] features Feature set where supported features are inserted
907 * This routine reads the /proc/cpuinfo file into a map and calls subroutines
908 * that attempt to parse by matching keys and values to known strings. It is
909 * much more fragile than our x86 detection, but it does not depend on
910 * specific system calls, intrinsics or assembly instructions.
913 detectProcCpuInfo(CpuInfo::Vendor
* vendor
,
918 std::set
<CpuInfo::Feature
> * features
)
920 std::map
<std::string
, std::string
> cpuInfo
= parseProcCpuInfo();
922 if (*vendor
== CpuInfo::Vendor::Unknown
)
924 *vendor
= detectProcCpuInfoVendor(cpuInfo
);
927 // Unfortunately there is no standard for contents in /proc/cpuinfo. We cannot
928 // indiscriminately look for e.g. 'cpu' since it could be either name or an index.
929 // To handle this slightly better we use one subroutine per vendor.
932 case CpuInfo::Vendor::Ibm
:
933 detectProcCpuInfoIbm(cpuInfo
, brand
, features
);
936 case CpuInfo::Vendor::Arm
:
937 detectProcCpuInfoArm(cpuInfo
, brand
, family
, model
, stepping
, features
);
941 // We only have a single check for fujitsu for now
943 features
->insert(CpuInfo::Feature::Fujitsu_HpcAce
);
949 } // namespace anonymous
953 CpuInfo
CpuInfo::detect()
957 if (c_architecture
== Architecture::X86
)
959 result
.vendor_
= detectX86Vendor();
961 if (result
.vendor_
== CpuInfo::Vendor::Intel
)
963 result
.features_
.insert(CpuInfo::Feature::X86_Intel
);
965 else if (result
.vendor_
== CpuInfo::Vendor::Amd
)
967 result
.features_
.insert(CpuInfo::Feature::X86_Amd
);
969 detectX86Features(&result
.brandString_
, &result
.family_
, &result
.model_
,
970 &result
.stepping_
, &result
.features_
);
971 result
.logicalProcessors_
= detectX86LogicalProcessors();
976 if (c_architecture
== Architecture::Arm
)
978 result
.vendor_
= CpuInfo::Vendor::Arm
;
980 else if (c_architecture
== Architecture::PowerPC
)
982 result
.vendor_
= CpuInfo::Vendor::Ibm
;
985 #if defined __aarch64__ || ( defined _M_ARM && _M_ARM >= 8 )
986 result
.features_
.insert(Feature::Arm_Neon
); // ARMv8 always has Neon
987 result
.features_
.insert(Feature::Arm_NeonAsimd
); // ARMv8 always has Neon-asimd
991 result
.vendor_
= CpuInfo::Vendor::Oracle
;
994 // On Linux we might be able to find information in /proc/cpuinfo. If vendor or brand
995 // is set to a known value this routine will not overwrite it.
996 detectProcCpuInfo(&result
.vendor_
, &result
.brandString_
, &result
.family_
,
997 &result
.model_
, &result
.stepping_
, &result
.features_
);
1000 if (!result
.logicalProcessors_
.empty())
1002 result
.supportLevel_
= CpuInfo::SupportLevel::LogicalProcessorInfo
;
1004 else if (!result
.features_
.empty())
1006 result
.supportLevel_
= CpuInfo::SupportLevel::Features
;
1008 else if (result
.vendor_
!= CpuInfo::Vendor::Unknown
1009 || result
.brandString_
!= "Unknown CPU brand")
1011 result
.supportLevel_
= CpuInfo::SupportLevel::Name
;
1015 result
.supportLevel_
= CpuInfo::SupportLevel::None
;
1023 : vendor_(CpuInfo::Vendor::Unknown
), brandString_("Unknown CPU brand"),
1024 family_(0), model_(0), stepping_(0)
1029 const std::map
<CpuInfo::Vendor
, std::string
>
1030 CpuInfo::s_vendorStrings_
=
1032 { CpuInfo::Vendor::Unknown
, "Unknown vendor" },
1033 { CpuInfo::Vendor::Intel
, "Intel" },
1034 { CpuInfo::Vendor::Amd
, "AMD" },
1035 { CpuInfo::Vendor::Fujitsu
, "Fujitsu" },
1036 { CpuInfo::Vendor::Ibm
, "IBM" },
1037 { CpuInfo::Vendor::Arm
, "ARM" },
1038 { CpuInfo::Vendor::Oracle
, "Oracle" },
1042 const std::map
<CpuInfo::Feature
, std::string
>
1043 CpuInfo::s_featureStrings_
=
1045 { CpuInfo::Feature::X86_Aes
, "aes" },
1046 { CpuInfo::Feature::X86_Amd
, "amd" },
1047 { CpuInfo::Feature::X86_Apic
, "apic" },
1048 { CpuInfo::Feature::X86_Avx
, "avx" },
1049 { CpuInfo::Feature::X86_Avx2
, "avx2" },
1050 { CpuInfo::Feature::X86_Avx512F
, "avx512f" },
1051 { CpuInfo::Feature::X86_Avx512PF
, "avx512pf" },
1052 { CpuInfo::Feature::X86_Avx512ER
, "avx512er" },
1053 { CpuInfo::Feature::X86_Avx512CD
, "avx512cd" },
1054 { CpuInfo::Feature::X86_Avx512BW
, "avx512bw" },
1055 { CpuInfo::Feature::X86_Avx512VL
, "avx512vl" },
1056 { CpuInfo::Feature::X86_Clfsh
, "clfsh" },
1057 { CpuInfo::Feature::X86_Cmov
, "cmov" },
1058 { CpuInfo::Feature::X86_Cx8
, "cx8" },
1059 { CpuInfo::Feature::X86_Cx16
, "cx16" },
1060 { CpuInfo::Feature::X86_F16C
, "f16c" },
1061 { CpuInfo::Feature::X86_Fma
, "fma" },
1062 { CpuInfo::Feature::X86_Fma4
, "fma4" },
1063 { CpuInfo::Feature::X86_Hle
, "hle" },
1064 { CpuInfo::Feature::X86_Htt
, "htt" },
1065 { CpuInfo::Feature::X86_Intel
, "intel" },
1066 { CpuInfo::Feature::X86_Lahf
, "lahf" },
1067 { CpuInfo::Feature::X86_MisalignSse
, "misalignsse" },
1068 { CpuInfo::Feature::X86_Mmx
, "mmx" },
1069 { CpuInfo::Feature::X86_Msr
, "msr" },
1070 { CpuInfo::Feature::X86_NonstopTsc
, "nonstop_tsc" },
1071 { CpuInfo::Feature::X86_Pcid
, "pcid" },
1072 { CpuInfo::Feature::X86_Pclmuldq
, "pclmuldq" },
1073 { CpuInfo::Feature::X86_Pdcm
, "pdcm" },
1074 { CpuInfo::Feature::X86_PDPE1GB
, "pdpe1gb" },
1075 { CpuInfo::Feature::X86_Popcnt
, "popcnt" },
1076 { CpuInfo::Feature::X86_Pse
, "pse" },
1077 { CpuInfo::Feature::X86_Rdrnd
, "rdrnd" },
1078 { CpuInfo::Feature::X86_Rdtscp
, "rdtscp" },
1079 { CpuInfo::Feature::X86_Rtm
, "rtm" },
1080 { CpuInfo::Feature::X86_Sha
, "sha" },
1081 { CpuInfo::Feature::X86_Sse2
, "sse2" },
1082 { CpuInfo::Feature::X86_Sse3
, "sse3" },
1083 { CpuInfo::Feature::X86_Sse4A
, "sse4a" },
1084 { CpuInfo::Feature::X86_Sse4_1
, "sse4.1" },
1085 { CpuInfo::Feature::X86_Sse4_2
, "sse4.2" },
1086 { CpuInfo::Feature::X86_Ssse3
, "ssse3" },
1087 { CpuInfo::Feature::X86_Tdt
, "tdt" },
1088 { CpuInfo::Feature::X86_X2Apic
, "x2apic" },
1089 { CpuInfo::Feature::X86_Xop
, "xop" },
1090 { CpuInfo::Feature::Arm_Neon
, "neon" },
1091 { CpuInfo::Feature::Arm_NeonAsimd
, "neon_asimd" },
1092 { CpuInfo::Feature::Ibm_Qpx
, "qpx" },
1093 { CpuInfo::Feature::Ibm_Vmx
, "vmx" },
1094 { CpuInfo::Feature::Ibm_Vsx
, "vsx" },
1095 { CpuInfo::Feature::Fujitsu_HpcAce
, "hpc-ace" }
1100 cpuIsX86Nehalem(const CpuInfo
&cpuInfo
)
1102 return (cpuInfo
.vendor() == gmx::CpuInfo::Vendor::Intel
&&
1103 cpuInfo
.family() == 6 &&
1104 (cpuInfo
.model() == 0x2E || cpuInfo
.model() == 0x1A ||
1105 cpuInfo
.model() == 0x1E || cpuInfo
.model() == 0x2F ||
1106 cpuInfo
.model() == 0x2C || cpuInfo
.model() == 0x25) );
1111 #ifdef GMX_CPUINFO_STANDALONE
1113 main(int argc
, char **argv
)
1118 "Usage:\n\n%s [flags]\n\n"
1119 "Available flags:\n"
1120 "-vendor Print CPU vendor.\n"
1121 "-brand Print CPU brand string.\n"
1122 "-family Print CPU family version.\n"
1123 "-model Print CPU model version.\n"
1124 "-stepping Print CPU stepping version.\n"
1125 "-features Print CPU feature flags.\n",
1130 std::string
arg(argv
[1]);
1131 gmx::CpuInfo
cpuInfo(gmx::CpuInfo::detect());
1133 if (arg
== "-vendor")
1135 printf("%s\n", cpuInfo
.vendorString().c_str());
1137 else if (arg
== "-brand")
1139 printf("%s\n", cpuInfo
.brandString().c_str());
1141 else if (arg
== "-family")
1143 printf("%d\n", cpuInfo
.family());
1145 else if (arg
== "-model")
1147 printf("%d\n", cpuInfo
.model());
1149 else if (arg
== "-stepping")
1151 printf("%d\n", cpuInfo
.stepping());
1153 else if (arg
== "-features")
1155 // Separate the feature strings with spaces. Note that in the
1156 // GROMACS cmake code, surrounding whitespace is first
1157 // stripped by the CPU detection routine, and then added back
1158 // in the code for making the SIMD suggestion.
1159 for (auto &f
: cpuInfo
.featureSet() )
1161 printf("%s ", cpuInfo
.featureString(f
).c_str());
1165 else if (arg
== "-topology")
1167 // Undocumented debug option, usually not present in standalone version
1168 for (auto &t
: cpuInfo
.logicalProcessors() )
1170 printf("%3u %3u %3u\n", t
.socketRankInMachine
, t
.coreRankInSocket
, t
.hwThreadRankInCore
);