From c6ce8efc35a5073d096aa2043136d0fc50d639a4 Mon Sep 17 00:00:00 2001 From: Erik Lindahl Date: Mon, 13 Nov 2017 10:48:05 -0700 Subject: [PATCH] Detect AVX-512 FMA units to choose best SIMD Add a test program that times AVX-512 code to detect single vs. dual AVX-512 FMA units. Added CMake code to always compile this file with AVX-512 flags, both at CMake configuration and runtime. Tested to work on both AVX2 and AVX-512 hardware with dual FMAs, and by manually faking single FMA units, but when we get access to hardware with a single AVX-512 FMA unit we need to check that we produce the correct result. Change-Id: I6240e864bc77f95085c5cd3303a84ab581eb3662 --- cmake/gmxDetectAvx512FmaUnits.cmake | 90 ++++++ cmake/gmxDetectSimd.cmake | 13 +- cmake/gmxManageSimd.cmake | 18 ++ cmake/gmxSimdFlags.cmake | 66 ++-- src/config.h.cmakein | 3 + src/gromacs/CMakeLists.txt | 5 + src/gromacs/hardware/CMakeLists.txt | 1 + src/gromacs/hardware/identifyavx512fmaunits.cpp | 383 ++++++++++++++++++++++++ src/gromacs/hardware/identifyavx512fmaunits.h | 57 ++++ src/gromacs/hardware/printhardware.cpp | 39 +-- src/gromacs/simd/support.cpp | 98 ++++-- 11 files changed, 695 insertions(+), 78 deletions(-) create mode 100644 cmake/gmxDetectAvx512FmaUnits.cmake create mode 100644 src/gromacs/hardware/identifyavx512fmaunits.cpp create mode 100644 src/gromacs/hardware/identifyavx512fmaunits.h diff --git a/cmake/gmxDetectAvx512FmaUnits.cmake b/cmake/gmxDetectAvx512FmaUnits.cmake new file mode 100644 index 0000000000..21120c37d5 --- /dev/null +++ b/cmake/gmxDetectAvx512FmaUnits.cmake @@ -0,0 +1,90 @@ +# +# This file is part of the GROMACS molecular simulation package. +# +# Copyright (c) 2017, by the GROMACS development team, led by +# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, +# and including many others, as listed in the AUTHORS file in the +# top-level source directory and at http://www.gromacs.org. +# +# GROMACS is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License +# as published by the Free Software Foundation; either version 2.1 +# of the License, or (at your option) any later version. +# +# GROMACS is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with GROMACS; if not, see +# http://www.gnu.org/licenses, or write to the Free Software Foundation, +# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# If you want to redistribute modifications to GROMACS, please +# consider that scientific software is very special. Version +# control is crucial - bugs must be traceable. We will be happy to +# consider code for inclusion in the official distribution, but +# derived work must not be called official GROMACS. Details are found +# in the README & COPYING files - if they are missing, get the +# official version at http://www.gromacs.org. +# +# To help us fund GROMACS development, we humbly ask that you cite +# the research papers on the package. Check out http://www.gromacs.org. + +include(gmxSimdFlags) + +# gmx_detect_avx_512_fma_units() +# +# Try to detect whether the host has one or two AVX-512 FMA units +# by executing a small program. This will only work on hosts that +# support AVX-512. If successful it sets RESULT to 1 or 2 for the +# number of AVX-512 FMA units, and otherwise -1. +# +function(gmx_detect_avx_512_fma_units RESULT) + if(CMAKE_CROSSCOMPILING) + set(${RESULT} -1 CACHE INTERNAL "Result of test for number of AVX-512 FMA units") + else() + set(AVX_512_FMA_UNIT_DETECTION_BINARY "${PROJECT_BINARY_DIR}/CMakeFiles/GmxDetectAvx512FmaUnits${CMAKE_EXECUTABLE_SUFFIX}") + if(NOT AVX_512_FMA_UNIT_DETECTION_COMPILED) + + # Find flags required for AVX-512 + gmx_find_simd_avx_512_flags(SIMD_AVX_512_C_SUPPORTED SIMD_AVX_512_CXX_SUPPORTED + SIMD_AVX_512_C_FLAGS SIMD_AVX_512_CXX_FLAGS) + + if(${SIMD_AVX_512_CXX_SUPPORTED}) + # Compile the detection program + + set(_compile_definitions "-I${PROJECT_SOURCE_DIR}/src -DGMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE ${SIMD_AVX_512_CXX_FLAGS} ${GMX_STDLIB_CXX_FLAGS}") + try_compile(AVX_512_FMA_UNIT_DETECTION_COMPILED + "${PROJECT_BINARY_DIR}" + "${PROJECT_SOURCE_DIR}/src/gromacs/hardware/identifyavx512fmaunits.cpp" + COMPILE_DEFINITIONS "${_compile_definitions}" + LINK_LIBRARIES "${GMX_STDLIB_LIBRARIES}" + OUTPUT_VARIABLE AVX_512_FMA_UNIT_DETECTION_COMPILED_OUTPUT + COPY_FILE ${AVX_512_FMA_UNIT_DETECTION_BINARY}) + if(NOT AVX_512_FMA_UNIT_DETECTION_COMPILED AND NOT RUN_AVX_512_FMA_UNIT_DETECTION_COMPILATION_QUIETLY) + message(STATUS "Could not identify number of AVX-512 units - detection program did not compile") + endif() + set(RUN_AVX_512_FMA_UNIT_DETECTION_COMPILATION_QUIETLY TRUE CACHE INTERNAL "Keep quiet on any future compilation attempts") + endif() + + if(AVX_512_FMA_UNIT_DETECTION_COMPILED) + # Run the program + if(NOT DEFINED ${RESULT}) + execute_process(COMMAND ${AVX_512_FMA_UNIT_DETECTION_BINARY} + RESULT_VARIABLE RESULT_VAR + OUTPUT_VARIABLE OUTPUT_VAR_TEMP + ERROR_QUIET) + if (RESULT_VAR EQUAL 0) + string(STRIP "${OUTPUT_VAR_TEMP}" OUTPUT_VAR) + set(${RESULT} ${OUTPUT_VAR_TEMP} CACHE INTERNAL "Result of test for number of AVX-512 FMA units") + else() + message(STATUS "Could not identify number of AVX-512 units - detection program did run successfully") + set(${RESULT} -1 CACHE INTERNAL "Result of test for number of AVX-512 FMA units") + endif() + endif() + endif() + endif() + endif() +endfunction() diff --git a/cmake/gmxDetectSimd.cmake b/cmake/gmxDetectSimd.cmake index f1e5821a71..79987d2e8c 100644 --- a/cmake/gmxDetectSimd.cmake +++ b/cmake/gmxDetectSimd.cmake @@ -55,6 +55,8 @@ include(gmxDetectTargetArchitecture) gmx_detect_target_architecture() include(gmxDetectCpu) +include(gmxDetectAvx512FmaUnits) + function(gmx_suggest_simd _suggested_simd) if (NOT SUGGEST_SIMD_QUIETLY) message(STATUS "Detecting best SIMD instructions for this CPU") @@ -77,7 +79,16 @@ function(gmx_suggest_simd _suggested_simd) if(CPU_DETECTION_FEATURES MATCHES " avx512er ") set(OUTPUT_SIMD "AVX_512_KNL") elseif(CPU_DETECTION_FEATURES MATCHES " avx512f ") - set(OUTPUT_SIMD "AVX_512") + gmx_detect_avx_512_fma_units(NUMBER_OF_AVX_512_FMA_UNITS) + if(NUMBER_OF_AVX_512_FMA_UNITS EQUAL 2) + set(OUTPUT_SIMD "AVX_512") + elseif(NUMBER_OF_AVX_512_FMA_UNITS EQUAL 1) + message(STATUS "This host supports AVX-512, but only has 1 AVX-512 FMA unit, so AVX2 will be faster.") + set(OUTPUT_SIMD "AVX2_256") + else() + message(WARNING "Could not run code to detect number of AVX-512 FMA units - assuming 2.") + set(OUTPUT_SIMD "AVX_512") + endif() elseif(CPU_DETECTION_FEATURES MATCHES " avx2 ") if(CPU_DETECTION_FEATURES MATCHES " amd ") set(OUTPUT_SIMD "AVX2_128") diff --git a/cmake/gmxManageSimd.cmake b/cmake/gmxManageSimd.cmake index a0de5bda67..e592197d5c 100644 --- a/cmake/gmxManageSimd.cmake +++ b/cmake/gmxManageSimd.cmake @@ -314,6 +314,24 @@ if (SIMD_CHANGED AND DEFINED SIMD_STATUS_MESSAGE) message(STATUS "${SIMD_STATUS_MESSAGE}") endif() +# While AVX-512 is a more recent SIMD ISA than AVX2, some Intel CPUs only have +# a single AVX-512 FMA unit, but two AVX2 FMA units, and then it is better to +# use AVX2. The only way to test this is to execute a small timing loop. +# To be able to recommend the user whether s/he should try AVX-512 instead of +# AVX2, we need to compile a single file with AVX512 flags. We do this +# automatically, but this option provides a way to turn it off in case it +# breaks something. The actual test source file is built if +# SIMD_AVX_512_CXX_SUPPORTED is set, so it will always be included if we have +# GMX_SIMD=AVX_512. +set(GMX_ENABLE_AVX512_TESTS ON CACHE INTERNAL "Compile AVX512 code to test FMA units, even when not using AVX512 SIMD") +mark_as_advanced(GMX_ENABLE_AVX512_TESTS) + +if(GMX_ENABLE_AVX512_TESTS AND + (GMX_SIMD_ACTIVE STREQUAL "AVX_256" OR GMX_SIMD_ACTIVE STREQUAL "AVX2_256" OR GMX_SIMD_ACTIVE STREQUAL "AVX2_128")) + gmx_find_simd_avx_512_flags(SIMD_AVX_512_C_SUPPORTED SIMD_AVX_512_CXX_SUPPORTED + SIMD_AVX_512_C_FLAGS SIMD_AVX_512_CXX_FLAGS) +endif() + # By default, 32-bit windows cannot pass SIMD (SSE/AVX) arguments in registers, # and even on 64-bit (all platforms) it is only used for a handful of arguments. # The __vectorcall (MSVC, from MSVC2013) or __regcall (ICC) calling conventions diff --git a/cmake/gmxSimdFlags.cmake b/cmake/gmxSimdFlags.cmake index cbde917b54..441c65e693 100644 --- a/cmake/gmxSimdFlags.cmake +++ b/cmake/gmxSimdFlags.cmake @@ -91,13 +91,13 @@ function(gmx_find_simd_sse2_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VARIAB "-msse2" "/arch:SSE2" "-hgnu") if(${SIMD_SSE2_C_FLAGS_RESULT}) - set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_SSE2_C_FLAGS}" PARENT_SCOPE) + set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_SSE2_C_FLAGS}" CACHE INTERNAL "C flags required for SSE2 instructions") endif() if(${SIMD_SSE2_CXX_FLAGS_RESULT}) - set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_SSE2_CXX_FLAGS}" PARENT_SCOPE) + set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_SSE2_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for SSE2 instructions") endif() set(${C_FLAGS_RESULT} ${SIMD_SSE2_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE2 C flags" FORCE) - set(${CXX_FLAGS_RESULT} ${SIMD_SSE2_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE2 CXX flags" FORCE) + set(${CXX_FLAGS_RESULT} ${SIMD_SSE2_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE2 C++ flags" FORCE) endfunction() # SSE4.1 @@ -112,13 +112,13 @@ function(gmx_find_simd_sse4_1_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VARI "-msse4.1" "/arch:SSE4.1" "/arch:SSE2" "-hgnu") if(${SIMD_SSE4_1_C_FLAGS_RESULT}) - set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_SSE4_1_C_FLAGS}" PARENT_SCOPE) + set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_SSE4_1_C_FLAGS}" CACHE INTERNAL "C flags required for SSE4.1 instructions") endif() if(${SIMD_SSE4_1_CXX_FLAGS_RESULT}) - set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_SSE4_1_CXX_FLAGS}" PARENT_SCOPE) + set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_SSE4_1_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for SSE4.1 instructions") endif() set(${C_FLAGS_RESULT} ${SIMD_SSE4_1_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE4.1 C flags" FORCE) - set(${CXX_FLAGS_RESULT} ${SIMD_SSE4_1_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE4.1 CXX flags" FORCE) + set(${CXX_FLAGS_RESULT} ${SIMD_SSE4_1_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE4.1 C++ flags" FORCE) endfunction() # AVX, but using only 128-bit instructions and FMA (AMD XOP processors) @@ -202,13 +202,13 @@ function(gmx_find_simd_avx_128_fma_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS endif() if(${SIMD_AVX_128_FMA_C_FLAGS_RESULT}) - set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_GENERIC_C_FLAGS} ${SIMD_AVX_AMD_FMA_C_FLAGS} ${SIMD_AVX_XOP_C_FLAGS}" PARENT_SCOPE) + set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_GENERIC_C_FLAGS} ${SIMD_AVX_AMD_FMA_C_FLAGS} ${SIMD_AVX_XOP_C_FLAGS}" CACHE INTERNAL "C flags required for 128-bit AVX with AMD FMA instructions") endif() if(${SIMD_AVX_128_FMA_CXX_FLAGS_RESULT}) - set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_GENERIC_CXX_FLAGS} ${SIMD_AVX_AMD_FMA_CXX_FLAGS} ${SIMD_AVX_XOP_CXX_FLAGS}" PARENT_SCOPE) + set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_GENERIC_CXX_FLAGS} ${SIMD_AVX_AMD_FMA_CXX_FLAGS} ${SIMD_AVX_XOP_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for 128-bit AVX with AMD FMA instructions") endif() set(${C_FLAGS_RESULT} ${SIMD_AVX_128_FMA_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for 128-bit AVX with AMD FMA C flags" FORCE) - set(${CXX_FLAGS_RESULT} ${SIMD_AVX_128_FMA_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for 128-bit AVX with AMD FMA CXX flags" FORCE) + set(${CXX_FLAGS_RESULT} ${SIMD_AVX_128_FMA_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for 128-bit AVX with AMD FMA C++ flags" FORCE) endfunction() @@ -223,13 +223,13 @@ function(gmx_find_simd_avx_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VARIABL "-mavx" "/arch:AVX" "-hgnu") if(${SIMD_AVX_C_FLAGS_RESULT}) - set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_C_FLAGS}" PARENT_SCOPE) + set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_C_FLAGS}" CACHE INTERNAL "C flags required for AVX instructions") endif() if(${SIMD_AVX_CXX_FLAGS_RESULT}) - set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_CXX_FLAGS}" PARENT_SCOPE) + set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for AVX instructions") endif() set(${C_FLAGS_RESULT} ${SIMD_AVX_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX C flags" FORCE) - set(${CXX_FLAGS_RESULT} ${SIMD_AVX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX CXX flags" FORCE) + set(${CXX_FLAGS_RESULT} ${SIMD_AVX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX C++ flags" FORCE) endfunction() # AVX2 @@ -243,13 +243,13 @@ function(gmx_find_simd_avx2_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VARIAB "-march=core-avx2" "-mavx2" "/arch:AVX" "-hgnu") # no AVX2-specific flag for MSVC yet if(${SIMD_AVX2_C_FLAGS_RESULT}) - set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX2_C_FLAGS}" PARENT_SCOPE) + set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX2_C_FLAGS}" CACHE INTERNAL "C flags required for AVX2 instructions") endif() if(${SIMD_AVX2_CXX_FLAGS_RESULT}) - set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX2_CXX_FLAGS}" PARENT_SCOPE) + set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX2_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for AVX2 instructions") endif() set(${C_FLAGS_RESULT} ${SIMD_AVX2_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX2 C flags" FORCE) - set(${CXX_FLAGS_RESULT} ${SIMD_AVX2_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX2 CXX flags" FORCE) + set(${CXX_FLAGS_RESULT} ${SIMD_AVX2_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX2 C++ flags" FORCE) endfunction() @@ -265,13 +265,13 @@ function(gmx_find_simd_avx_512_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VAR "-xCORE-AVX512 -qopt-zmm-usage=high" "-xCORE-AVX512" "-mavx512f -mfma" "-mavx512f" "/arch:AVX" "-hgnu") # no AVX_512F flags known for MSVC yet. ICC should use ZMM if code anyhow uses ZMM if(${SIMD_AVX_512_C_FLAGS_RESULT}) - set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_512_C_FLAGS}" PARENT_SCOPE) + set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_512_C_FLAGS}" CACHE INTERNAL "C flags required for AVX-512 instructions") endif() if(${SIMD_AVX_512_CXX_FLAGS_RESULT}) - set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_512_CXX_FLAGS}" PARENT_SCOPE) + set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_512_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for AVX-512 instructions") endif() set(${C_FLAGS_RESULT} ${SIMD_AVX_512_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 C flags" FORCE) - set(${CXX_FLAGS_RESULT} ${SIMD_AVX_512_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 CXX flags" FORCE) + set(${CXX_FLAGS_RESULT} ${SIMD_AVX_512_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 C++ flags" FORCE) endfunction() @@ -287,13 +287,13 @@ function(gmx_find_simd_avx_512_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VAR "-xMIC-AVX512" "-mavx512er -mfma" "-mavx512er" "/arch:AVX" "-hgnu") # no AVX_512ER flags known for MSVC yet if(${SIMD_AVX_512_KNL_C_FLAGS_RESULT}) - set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_512_KNL_C_FLAGS}" PARENT_SCOPE) + set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_512_KNL_C_FLAGS}" CACHE INTERNAL "C flags required for AVX-512 for KNL instructions") endif() if(${SIMD_AVX_512_KNL_CXX_FLAGS_RESULT}) - set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_512_KNL_CXX_FLAGS}" PARENT_SCOPE) + set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_512_KNL_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for AVX-512 for KNL instructions") endif() set(${C_FLAGS_RESULT} ${SIMD_AVX_512_KNL_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 for KNL C flags" FORCE) - set(${CXX_FLAGS_RESULT} ${SIMD_AVX_512_KNL_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 for KNL CXX flags" FORCE) + set(${CXX_FLAGS_RESULT} ${SIMD_AVX_512_KNL_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 for KNL C++ flags" FORCE) endfunction() @@ -308,13 +308,13 @@ function(gmx_find_simd_arm_neon_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VA "-mfpu=neon-vfpv4" "-mfpu=neon" "") if(${SIMD_ARM_NEON_C_FLAGS_RESULT}) - set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_ARM_NEON_C_FLAGS}" PARENT_SCOPE) + set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_ARM_NEON_C_FLAGS}" CACHE INTERNAL "C flags required for Arm Neon instructions") endif() if(${SIMD_ARM_NEON_CXX_FLAGS_RESULT}) - set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_ARM_NEON_CXX_FLAGS}" PARENT_SCOPE) + set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_ARM_NEON_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for Arm Neon instructions") endif() set(${C_FLAGS_RESULT} ${SIMD_ARM_NEON_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon C flags" FORCE) - set(${CXX_FLAGS_RESULT} ${SIMD_ARM_NEON_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon CXX flags" FORCE) + set(${CXX_FLAGS_RESULT} ${SIMD_ARM_NEON_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon C++ flags" FORCE) endfunction() # Arm Neon Asimd (64-bit ARM) @@ -328,13 +328,13 @@ function(gmx_find_simd_arm_neon_asimd_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FL "") if(${SIMD_ARM_NEON_ASIMD_C_FLAGS_RESULT}) - set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_ARM_NEON_ASIMD_C_FLAGS}" PARENT_SCOPE) + set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_ARM_NEON_ASIMD_C_FLAGS}" CACHE INTERNAL "C flags required for Arm Neon Asimd instructions") endif() if(${SIMD_ARM_NEON_ASIMD_CXX_FLAGS_RESULT}) - set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_ARM_NEON_ASIMD_CXX_FLAGS}" PARENT_SCOPE) + set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_ARM_NEON_ASIMD_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for Arm Neon Asimd instructions") endif() set(${C_FLAGS_RESULT} ${SIMD_ARM_NEON_ASIMD_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon Asimd C flags" FORCE) - set(${CXX_FLAGS_RESULT} ${SIMD_ARM_NEON_ASIMD_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon Asimd CXX flags" FORCE) + set(${CXX_FLAGS_RESULT} ${SIMD_ARM_NEON_ASIMD_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon Asimd C++ flags" FORCE) endfunction() # IBM VMX (power6) @@ -348,13 +348,13 @@ function(gmx_find_simd_ibm_vmx_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VAR "-maltivec -mabi=altivec" "-qarch=auto -qaltivec") if(${SIMD_IBM_VMX_C_FLAGS_RESULT}) - set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_IBM_VMX_C_FLAGS}" PARENT_SCOPE) + set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_IBM_VMX_C_FLAGS}" CACHE INTERNAL "C flags required for IBM VMX instructions") endif() if(${SIMD_IBM_VMX_CXX_FLAGS_RESULT}) - set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_IBM_VMX_CXX_FLAGS}" PARENT_SCOPE) + set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_IBM_VMX_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for IBM VMX instructions") endif() set(${C_FLAGS_RESULT} ${SIMD_IBM_VMX_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VMX C flags" FORCE) - set(${CXX_FLAGS_RESULT} ${SIMD_IBM_VMX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VMX CXX flags" FORCE) + set(${CXX_FLAGS_RESULT} ${SIMD_IBM_VMX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VMX C++ flags" FORCE) endfunction() # IBM VSX (power7 and later) @@ -368,11 +368,11 @@ function(gmx_find_simd_ibm_vsx_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VAR "-mvsx" "-maltivec -mabi=altivec" "-qarch=auto -qaltivec") if(${SIMD_IBM_VSX_C_FLAGS_RESULT}) - set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_IBM_VSX_C_FLAGS}" PARENT_SCOPE) + set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_IBM_VSX_C_FLAGS}" CACHE INTERNAL "C flags required for IBM VSX instructions") endif() if(${SIMD_IBM_VSX_CXX_FLAGS_RESULT}) - set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_IBM_VSX_CXX_FLAGS}" PARENT_SCOPE) + set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_IBM_VSX_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for IBM VSX instructions") endif() set(${C_FLAGS_RESULT} ${SIMD_IBM_VSX_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VSX C flags" FORCE) - set(${CXX_FLAGS_RESULT} ${SIMD_IBM_VSX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VSX CXX flags" FORCE) + set(${CXX_FLAGS_RESULT} ${SIMD_IBM_VSX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VSX C++ flags" FORCE) endfunction() diff --git a/src/config.h.cmakein b/src/config.h.cmakein index b4f977b865..dfdb3f10d7 100644 --- a/src/config.h.cmakein +++ b/src/config.h.cmakein @@ -134,6 +134,9 @@ /* Target mantissa accuracy for SIMD double precision math */ #define GMX_SIMD_ACCURACY_BITS_DOUBLE @GMX_SIMD_ACCURACY_BITS_DOUBLE@ +/* Enable code that requires AVX-512 instruction support, without GMX_SIMD=AVX_512 */ +#cmakedefine01 SIMD_AVX_512_CXX_SUPPORTED + /* Whether a double-precision configuration may target accuracy equivalent to single precision */ #cmakedefine01 GMX_RELAXED_DOUBLE_PRECISION diff --git a/src/gromacs/CMakeLists.txt b/src/gromacs/CMakeLists.txt index cf48a183b7..37d2e029d4 100644 --- a/src/gromacs/CMakeLists.txt +++ b/src/gromacs/CMakeLists.txt @@ -40,6 +40,7 @@ endif() set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES) set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS) +set_property(GLOBAL PROPERTY GMX_AVX_512_SOURCE) function (_gmx_add_files_to_property PROPERTY) foreach (_file ${ARGN}) @@ -191,6 +192,10 @@ if (HAS_NO_UNUSED) endif() set_source_files_properties(selection/scanner.cpp PROPERTIES COMPILE_FLAGS "${_scanner_cpp_compiler_flags}") +if(SIMD_AVX_512_CXX_SUPPORTED) + set_source_files_properties(hardware/identifyavx512fmaunits.cpp PROPERTIES COMPILE_FLAGS "${SIMD_AVX_512_CXX_FLAGS}") +endif() + gmx_setup_tng_for_libgromacs() target_link_libraries(libgromacs diff --git a/src/gromacs/hardware/CMakeLists.txt b/src/gromacs/hardware/CMakeLists.txt index c758f54374..6bc268c766 100644 --- a/src/gromacs/hardware/CMakeLists.txt +++ b/src/gromacs/hardware/CMakeLists.txt @@ -38,6 +38,7 @@ gmx_add_libgromacs_sources( gpu_hw_info.cpp hardwaretopology.cpp printhardware.cpp + identifyavx512fmaunits.cpp ) if (BUILD_TESTING) diff --git a/src/gromacs/hardware/identifyavx512fmaunits.cpp b/src/gromacs/hardware/identifyavx512fmaunits.cpp new file mode 100644 index 0000000000..93f1c308c9 --- /dev/null +++ b/src/gromacs/hardware/identifyavx512fmaunits.cpp @@ -0,0 +1,383 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Implements a routine to check the number of AVX512 fma units + * + * Just as the CpuInfo code, we need to be able to compile this file in stand-alone mode + * to set the SIMD acceleration and similar things during CMake configuration. + */ + +#ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE +#include "gmxpre.h" +#endif + +#include "identifyavx512fmaunits.h" + +#ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE +#define SIMD_AVX_512_CXX_SUPPORTED 1 +#else +#include "config.h" +#endif + +#if SIMD_AVX_512_CXX_SUPPORTED +#include + +#ifdef _MSC_VER +#include +#endif +#endif // SIMD_AVX_512_CXX_SUPPORTED + +#include +#include + +#include +#include + +#ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE +#include "gromacs/hardware/cpuinfo.h" +#endif + +namespace gmx +{ + +namespace +{ + +#if SIMD_AVX_512_CXX_SUPPORTED +// Use a local routine to read the timestep counter just on x86 to avoid dependence +// on the Gromacs cycle counter module. +uint64_t rdtscp(void) +{ +#ifdef MSC_VER + unsigned int ui; + return static_cast(__rdtscp(&ui)); +#else + uint32_t low; + uint32_t high; + + __asm__ __volatile__("rdtscp" : "=a" (low), "=d" (high) :: "ecx" ); + return (static_cast(high) << 32) | low; +#endif +} + +/*\ brief Loop over mixed FMA and shuffle AVX512 instructions + * + * This function executes a meaningless loop that includes both + * FMA and shuffle instructions from the AVX512 instruction set. + * We need a bit of complex logic to make sure it cannot be + * optimized away by the compiler. + * + * \param loopCount Number of iterations. Each iteration will + * execute 12 FMA and 12 shuffle instructions. + * \param seed A double-precision number between 0 and 1. + * To be really certain the loop is not optimized + * away, you should use some timing-related + * function to create this seed at runtime. + * \return Meaningless floating-point number. Make sure you + * add this number to some variable and conditionally + * issue a print statement e.g. if it is negative + * (which should not happen), again to make sure the loop + * cannot be optimized away. + */ +double +executeFmaAndShuffleLoop(int loopCount, + double seed) +{ + // Make sure all variables are different to avoid gcc optimizing them away + __m512d d0 = _mm512_set1_pd(1.0-0.01*seed); + __m512d d1 = _mm512_set1_pd(1.0-0.02*seed); + __m512d d2 = _mm512_set1_pd(1.0-0.03*seed); + __m512d d3 = _mm512_set1_pd(1.0-0.04*seed); + __m512d d4 = _mm512_set1_pd(1.0-0.05*seed); + __m512d d5 = _mm512_set1_pd(1.0-0.06*seed); + __m512d d6 = _mm512_set1_pd(1.0-0.07*seed); + __m512d d7 = _mm512_set1_pd(1.0-0.08*seed); + __m512d d8 = _mm512_set1_pd(1.0-0.09*seed); + __m512d d9 = _mm512_set1_pd(1.0-0.10*seed); + __m512d d10 = _mm512_set1_pd(1.0-0.11*seed); + __m512d d11 = _mm512_set1_pd(1.0-0.12*seed); + __m512d eps = _mm512_set1_pd(1e-6); + __m512i i0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m512i i1 = _mm512_set_epi32(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + __m512i i2 = _mm512_set_epi32(1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2); + __m512i i3 = _mm512_set_epi32(2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3); + __m512i i4 = _mm512_set_epi32(3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4); + __m512i i5 = _mm512_set_epi32(4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5); + __m512i i6 = _mm512_set_epi32(5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6); + __m512i i7 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + __m512i i8 = _mm512_set_epi32(8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9); + __m512i i9 = _mm512_set_epi32(9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10); + __m512i i10 = _mm512_set_epi32(10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11); + __m512i i11 = _mm512_set_epi32(11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12); + __m512i idx = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13); + __mmask16 mask = static_cast(0xffff); + + for (int i = 0; i < loopCount; i++) + { + d0 = _mm512_fmadd_pd(d0, d0, eps); + d1 = _mm512_fmadd_pd(d1, d1, eps); + d2 = _mm512_fmadd_pd(d2, d2, eps); + d3 = _mm512_fmadd_pd(d3, d3, eps); + d4 = _mm512_fmadd_pd(d4, d4, eps); + d5 = _mm512_fmadd_pd(d5, d5, eps); + d6 = _mm512_fmadd_pd(d6, d6, eps); + d7 = _mm512_fmadd_pd(d7, d7, eps); + d8 = _mm512_fmadd_pd(d8, d8, eps); + d9 = _mm512_fmadd_pd(d9, d9, eps); + d10 = _mm512_fmadd_pd(d10, d10, eps); + d11 = _mm512_fmadd_pd(d11, d11, eps); + // plain permutevar is not yet available in gcc-6.4 + i0 = _mm512_maskz_permutexvar_epi32(mask, idx, i0); + i1 = _mm512_maskz_permutexvar_epi32(mask, idx, i1); + i2 = _mm512_maskz_permutexvar_epi32(mask, idx, i2); + i3 = _mm512_maskz_permutexvar_epi32(mask, idx, i3); + i4 = _mm512_maskz_permutexvar_epi32(mask, idx, i4); + i5 = _mm512_maskz_permutexvar_epi32(mask, idx, i5); + i6 = _mm512_maskz_permutexvar_epi32(mask, idx, i6); + i7 = _mm512_maskz_permutexvar_epi32(mask, idx, i7); + i8 = _mm512_maskz_permutexvar_epi32(mask, idx, i8); + i9 = _mm512_maskz_permutexvar_epi32(mask, idx, i9); + i10 = _mm512_maskz_permutexvar_epi32(mask, idx, i10); + i11 = _mm512_maskz_permutexvar_epi32(mask, idx, i11); + } + + // Make sure we use all variables in the loop to return a result + i0 = _mm512_add_epi32(i0, i1); + i2 = _mm512_add_epi32(i2, i3); + i4 = _mm512_add_epi32(i4, i5); + i6 = _mm512_add_epi32(i6, i7); + i8 = _mm512_add_epi32(i8, i9); + i10 = _mm512_add_epi32(i10, i11); + i0 = _mm512_add_epi32(i0, i2); + i4 = _mm512_add_epi32(i4, i6); + i8 = _mm512_add_epi32(i8, i10); + i0 = _mm512_add_epi32(i0, i4); + i0 = _mm512_add_epi32(i0, i8); + + d0 = _mm512_fmadd_pd(d0, d1, d2); + d3 = _mm512_fmadd_pd(d3, d4, d5); + d6 = _mm512_fmadd_pd(d6, d7, d8); + d9 = _mm512_fmadd_pd(d9, d10, d11); + d0 = _mm512_add_pd(d0, d3); + d6 = _mm512_add_pd(d6, d9); + d0 = _mm512_add_pd(d0, d6); + + double data[8]; + int idata[16]; + _mm512_storeu_pd(data, d0); + _mm512_storeu_si512(idata, i0); + + double d = 0; + + for (int i = 0; i < 8; i++) + { + d += data[i] * idata[2*i] * idata[2*i+1]; + } + + return d; +} + +/*\ brief Loop over FMA AVX512 instructions + * + * This function executes a meaningless loop that includes only + * FMA instructions from the AVX512 instruction set. + * We need a bit of complex logic to make sure it cannot be + * optimized away by the compiler. + * + * \param loopCount Number of iterations. Each iteration will + * execute 12 FMA instructions. + * \param seed A double-precision number between 0 and 1. + * To be really certain the loop is not optimized + * away, you should use some timing-related + * function to create this seed at runtime. + * \return Meaningless floating-point number. Make sure you + * add this number to some variable and conditionally + * issue a print statement e.g. if it is negative + * (which should not happen), again to make sure the loop + * cannot be optimized away. + */ +double +executeFmaOnlyLoop(int loopCount, + double seed) +{ + // Make sure all variables are different to avoid gcc optimizing them away + __m512d d0 = _mm512_set1_pd(1.0-0.01*seed); + __m512d d1 = _mm512_set1_pd(1.0-0.02*seed); + __m512d d2 = _mm512_set1_pd(1.0-0.03*seed); + __m512d d3 = _mm512_set1_pd(1.0-0.04*seed); + __m512d d4 = _mm512_set1_pd(1.0-0.05*seed); + __m512d d5 = _mm512_set1_pd(1.0-0.06*seed); + __m512d d6 = _mm512_set1_pd(1.0-0.07*seed); + __m512d d7 = _mm512_set1_pd(1.0-0.08*seed); + __m512d d8 = _mm512_set1_pd(1.0-0.09*seed); + __m512d d9 = _mm512_set1_pd(1.0-0.10*seed); + __m512d d10 = _mm512_set1_pd(1.0-0.11*seed); + __m512d d11 = _mm512_set1_pd(1.0-0.12*seed); + __m512d eps = _mm512_set1_pd(1e-6); + + for (int i = 0; i < loopCount; i++) + { + d0 = _mm512_fmadd_pd(d0, d0, eps); + d1 = _mm512_fmadd_pd(d1, d1, eps); + d2 = _mm512_fmadd_pd(d2, d2, eps); + d3 = _mm512_fmadd_pd(d3, d3, eps); + d4 = _mm512_fmadd_pd(d4, d4, eps); + d5 = _mm512_fmadd_pd(d5, d5, eps); + d6 = _mm512_fmadd_pd(d6, d6, eps); + d7 = _mm512_fmadd_pd(d7, d7, eps); + d8 = _mm512_fmadd_pd(d8, d8, eps); + d9 = _mm512_fmadd_pd(d9, d9, eps); + d10 = _mm512_fmadd_pd(d10, d10, eps); + d11 = _mm512_fmadd_pd(d11, d11, eps); + } + + // Make sure we use all variables in the loop to return a result + d0 = _mm512_fmadd_pd(d0, d1, d2); + d3 = _mm512_fmadd_pd(d3, d4, d5); + d6 = _mm512_fmadd_pd(d6, d7, d8); + d9 = _mm512_fmadd_pd(d9, d10, d11); + d0 = _mm512_add_pd(d0, d3); + d6 = _mm512_add_pd(d6, d9); + d0 = _mm512_add_pd(d0, d6); + + double data[8]; + + _mm512_storeu_pd(data, d0); + + double d = 0; + + for (int i = 0; i < 8; i++) + { + d += data[i]; + } + return d; +} + +int +checkDualAvx512FmaUnits() +{ + uint64_t timeFmaAndShuf = 1e9; // Large value + uint64_t timeFmaOnly = 1e9; // Large value + double dummy; + double seed = (rdtscp() & 0xff) / 256.0; // Create an unpredictable small number between 0 and 1 + + // Make sure the CPU is in AVX512 mode by executing a fairly long loop + dummy = executeFmaOnlyLoop(100000, seed); + + // Execute the mixed FMA/shuffle loop three times + for (int i = 0; i < 3; i++) + { + uint64_t start = rdtscp(); + dummy += executeFmaAndShuffleLoop(1000, seed); + uint64_t res = rdtscp() - start; + timeFmaAndShuf = std::min(timeFmaAndShuf, res); + } + + // Execute the FMA-only loop three times + for (int i = 0; i < 3; i++) + { + uint64_t start = rdtscp(); + dummy += executeFmaOnlyLoop(1000, seed); + uint64_t res = rdtscp() - start; + timeFmaOnly = std::min(timeFmaOnly, res); + } + + // Dummy can never be negative, but by using it in the + // conditional it cannot be optimized away. + return (timeFmaAndShuf > 1.5 * timeFmaOnly || dummy < 0); +} + + +#endif // SIMD_AVX_512_CXX_SUPPORTED + +/*! \brief Mutex to guard the execution of the timing test + * + * We only execute the test once, and return the saved result + * on subsequent calls. + */ +std::mutex initMutex; + +} // namespace anonymous + +int +identifyAvx512FmaUnits() +{ + static bool initialized = false; + static int result = false; + + if (!initialized) + { + std::lock_guard lock(initMutex); + + if (!initialized) + { + // For the standalone test binary we assume it will + // only be executed on AVX512 hardware, but for the + // library version we check the hardware support. +#ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE + bool haveAvx512Hardware = true; +#else + bool haveAvx512Hardware = CpuInfo::detect().feature(CpuInfo::Feature::X86_Avx512F); +#endif + + if (haveAvx512Hardware) + { +#if SIMD_AVX_512_CXX_SUPPORTED + result = checkDualAvx512FmaUnits() ? 2 : 1; +#else + result = -1; // Cannot run the tests +#endif + } + else + { + result = 0; // Not AVX-512 hardware + } + initialized = true; + } + } + return result; +} + +} // namespace gmx + +#ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE +int +main() +{ + printf("%d\n", gmx::identifyAvx512FmaUnits()); + return 0; +} +#endif diff --git a/src/gromacs/hardware/identifyavx512fmaunits.h b/src/gromacs/hardware/identifyavx512fmaunits.h new file mode 100644 index 0000000000..bcd6f13679 --- /dev/null +++ b/src/gromacs/hardware/identifyavx512fmaunits.h @@ -0,0 +1,57 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \libinternal \file + * \brief Defines a routine to check the number of AVX512 fma units + * + * \author Erik Lindahl + * \inlibraryapi + * \ingroup module_hardware + */ + +namespace gmx +{ + +/*! \brief Test whether machine has dual AVX512 FMA units + * + * \return 1 or 2 for the number of AVX512 FMA units if AVX512 + * support is present, 0 if we know the hardware does + * not have AVX512 support, or -1 if the test cannot + * run because the compiler lacked AVX512 support. + */ +int +identifyAvx512FmaUnits(); + +} // namespace gmx diff --git a/src/gromacs/hardware/printhardware.cpp b/src/gromacs/hardware/printhardware.cpp index d3b432d424..17b120765b 100644 --- a/src/gromacs/hardware/printhardware.cpp +++ b/src/gromacs/hardware/printhardware.cpp @@ -47,6 +47,7 @@ #include "gromacs/hardware/cpuinfo.h" #include "gromacs/hardware/hardwaretopology.h" #include "gromacs/hardware/hw_info.h" +#include "gromacs/hardware/identifyavx512fmaunits.h" #include "gromacs/mdtypes/commrec.h" #include "gromacs/simd/support.h" #include "gromacs/utility/basedefinitions.h" @@ -227,19 +228,24 @@ static std::string detected_hardware_string(const gmx_hw_info_t *hwinfo, s += gmx::formatString("\n"); } - s += gmx::formatString(" SIMD instructions most likely to fit this hardware: %s", - gmx::simdString(static_cast(hwinfo->simd_suggest_min)).c_str()); - - if (hwinfo->simd_suggest_max > hwinfo->simd_suggest_min) + if (cpuInfo.feature(gmx::CpuInfo::Feature::X86_Avx512F)) { - s += gmx::formatString(" - %s", gmx::simdString(static_cast(hwinfo->simd_suggest_max)).c_str()); + int avx512fmaunits = gmx::identifyAvx512FmaUnits(); + s += gmx::formatString(" Number of AVX-512 FMA units:"); + if (avx512fmaunits > 0) + { + s += gmx::formatString(" %d", avx512fmaunits); + if (avx512fmaunits == 1) + { + s += gmx::formatString(" (AVX2 is faster w/o 2 AVX-512 FMA units)"); + } + } + else + { + s += gmx::formatString(" Cannot run AVX-512 detection - assuming 2"); + } + s += gmx::formatString("\n"); } - s += gmx::formatString("\n"); - - s += gmx::formatString(" SIMD instructions selected at GROMACS compile time: %s\n", - gmx::simdString(gmx::simdCompiled()).c_str()); - - s += gmx::formatString("\n"); s += gmx::formatString(" Hardware topology: "); switch (hwTop.supportLevel()) @@ -367,14 +373,9 @@ void gmx_print_detected_hardware(FILE *fplog, const t_commrec *cr, fprintf(fplog, "%s\n", detected.c_str()); } - if (MULTIMASTER(cr)) - { - std::string detected; - - detected = detected_hardware_string(hwinfo, FALSE); - - fprintf(stderr, "%s\n", detected.c_str()); - } + // Do not spam stderr with all our internal information unless + // there was something that actually went wrong; general information + // belongs in the logfile. /* Check the compiled SIMD instruction set against that of the node * with the lowest SIMD level support (skip if SIMD detection did not work) diff --git a/src/gromacs/simd/support.cpp b/src/gromacs/simd/support.cpp index 3ba4f74d7a..459987ba10 100644 --- a/src/gromacs/simd/support.cpp +++ b/src/gromacs/simd/support.cpp @@ -55,6 +55,8 @@ #include #include "gromacs/hardware/cpuinfo.h" +#include "gromacs/hardware/identifyavx512fmaunits.h" +#include "gromacs/utility/stringutil.h" namespace gmx { @@ -105,7 +107,8 @@ simdSuggested(const CpuInfo &c) } else if (c.feature(CpuInfo::Feature::X86_Avx512F)) { - suggested = SimdType::X86_Avx512; + // If we could not identify the number of AVX512 FMA units we assume 2 + suggested = ( identifyAvx512FmaUnits() == 1 ) ? SimdType::X86_Avx2 : SimdType::X86_Avx512; } else if (c.feature(CpuInfo::Feature::X86_Avx2)) { @@ -236,38 +239,83 @@ simdCheck(gmx::SimdType wanted, FILE * log, bool warnToStdErr) { - SimdType compiled = simdCompiled(); + SimdType compiled = simdCompiled(); - // Normally it is close to catastrophic if the compiled SIMD type is larger than - // the supported one, but AVX128Fma is an exception: AMD CPUs will (strongly) prefer - // AVX128Fma, but they will work fine with AVX too. Thus, make an exception for this. - if (compiled > wanted && !(compiled == SimdType::X86_Avx && wanted == SimdType::X86_Avx128Fma)) + gmx::TextLineWrapper wrapper; + std::string logMsg; + std::string warnMsg; + + wrapper.settings().setLineLength(78); + + if (compiled == SimdType::X86_Avx2 && wanted == SimdType::X86_Avx512) { - fprintf(stderr, "Warning: SIMD instructions newer than hardware. Program will likely crash.\n" - "SIMD instructions most likely to fit this hardware: %s\n" - "SIMD instructions selected at GROMACS compile time: %s\n\n", - simdString(wanted).c_str(), - simdString(compiled).c_str()); + logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n" + "SIMD instructions selected at compile time: %s\n" + "This program was compiled for different hardware than you are running on, " + "which could influence performance. This build might have been configured on " + "a login node with only a single AVX-512 FMA unit (in which case AVX2 is faster), " + "while the node you are running on has dual AVX-512 FMA units.", + simdString(wanted).c_str(), simdString(compiled).c_str())); + warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).", + simdString(compiled).c_str(), simdString(wanted).c_str())); + } + else if (compiled == SimdType::X86_Avx512 && wanted == SimdType::X86_Avx2 && identifyAvx512FmaUnits() == 1) + { + // The reason for explicitly checking the number of FMA units above is to avoid triggering + // this conditional if the AVX2 SIMD was requested by some other node in a heterogeneous MPI run. + logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n" + "SIMD instructions selected at compile time: %s\n" + "This program was compiled for different hardware than you are running on, " + "which could influence performance." + "This host supports AVX-512, but since it only has 1 AVX-512" + "FMA unit, it would be faster to use AVX2 instead.", + simdString(wanted).c_str(), simdString(compiled).c_str())); + warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).", + simdString(compiled).c_str(), simdString(wanted).c_str())); + } + else if (compiled == SimdType::X86_Avx2 && wanted == SimdType::X86_Avx2_128) + { + logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n" + "SIMD instructions selected at compile time: %s\n" + "This program was compiled for different hardware than you are running on, " + "which could influence performance." + "Ryzen/Threadripper CPUs support 256-bit AVX2, but 128-bit is faster.", + simdString(wanted).c_str(), simdString(compiled).c_str())); + warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).", + simdString(compiled).c_str(), simdString(wanted).c_str())); + } + else if (compiled > wanted && !(compiled == SimdType::X86_Avx && wanted == SimdType::X86_Avx128Fma)) + { + // Normally it is close to catastrophic if the compiled SIMD type is larger than + // the supported one, but AVX128Fma is an exception: AMD CPUs will (strongly) prefer + // AVX128Fma, but they will work fine with AVX too. Thus, make an exception for this. + logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n" + "SIMD instructions selected at compile time: %s\n" + "Compiled SIMD newer than requested; program might crash.", + simdString(wanted).c_str(), simdString(compiled).c_str())); + warnMsg = logMsg; } else if (wanted != compiled) { // This warning will also occur if compiled is X86_Avx and wanted is X86_Avx128Fma + logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n" + "SIMD instructions selected at compile time: %s\n" + "This program was compiled for different hardware than you are running on, " + "which could influence performance.", + simdString(wanted).c_str(), simdString(compiled).c_str())); + warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).", + simdString(compiled).c_str(), simdString(wanted).c_str())); + } - if (log != nullptr) - { - fprintf(log, "\nBinary not matching hardware - you might be losing performance.\n" - "SIMD instructions most likely to fit this hardware: %s\n" - "SIMD instructions selected at GROMACS compile time: %s\n\n", - simdString(wanted).c_str(), - simdString(compiled).c_str()); - } - if (warnToStdErr) - { - fprintf(stderr, "Compiled SIMD instructions: %s, GROMACS could use %s on this machine, which is better.\n\n", - simdString(compiled).c_str(), - simdString(wanted).c_str()); - } + if (log != nullptr) + { + fprintf(log, "%s\n", logMsg.c_str()); + } + if (warnToStdErr) + { + fprintf(stderr, "%s\n", warnMsg.c_str()); } + return (wanted == compiled); } -- 2.11.4.GIT