From c6ce8efc35a5073d096aa2043136d0fc50d639a4 Mon Sep 17 00:00:00 2001
From: Erik Lindahl <erik@kth.se>
Date: Mon, 13 Nov 2017 10:48:05 -0700
Subject: [PATCH] Detect AVX-512 FMA units to choose best SIMD

Add a test program that times AVX-512 code to
detect single vs. dual AVX-512 FMA units.
Added CMake code to always compile this file
with AVX-512 flags, both at CMake configuration
and runtime.
Tested to work on both AVX2 and AVX-512 hardware
with dual FMAs, and by manually faking single
FMA units, but when we get access to hardware
with a single AVX-512 FMA unit we need to check
that we produce the correct result.

Change-Id: I6240e864bc77f95085c5cd3303a84ab581eb3662
---
 cmake/gmxDetectAvx512FmaUnits.cmake             |  90 ++++++
 cmake/gmxDetectSimd.cmake                       |  13 +-
 cmake/gmxManageSimd.cmake                       |  18 ++
 cmake/gmxSimdFlags.cmake                        |  66 ++--
 src/config.h.cmakein                            |   3 +
 src/gromacs/CMakeLists.txt                      |   5 +
 src/gromacs/hardware/CMakeLists.txt             |   1 +
 src/gromacs/hardware/identifyavx512fmaunits.cpp | 383 ++++++++++++++++++++++++
 src/gromacs/hardware/identifyavx512fmaunits.h   |  57 ++++
 src/gromacs/hardware/printhardware.cpp          |  39 +--
 src/gromacs/simd/support.cpp                    |  98 ++++--
 11 files changed, 695 insertions(+), 78 deletions(-)
 create mode 100644 cmake/gmxDetectAvx512FmaUnits.cmake
 create mode 100644 src/gromacs/hardware/identifyavx512fmaunits.cpp
 create mode 100644 src/gromacs/hardware/identifyavx512fmaunits.h

diff --git a/cmake/gmxDetectAvx512FmaUnits.cmake b/cmake/gmxDetectAvx512FmaUnits.cmake
new file mode 100644
index 0000000000..21120c37d5
--- /dev/null
+++ b/cmake/gmxDetectAvx512FmaUnits.cmake
@@ -0,0 +1,90 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2017, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+include(gmxSimdFlags)
+
+# gmx_detect_avx_512_fma_units()
+#
+# Try to detect whether the host has one or two AVX-512 FMA units
+# by executing a small program. This will only work on hosts that
+# support AVX-512. If successful it sets RESULT to 1 or 2 for the
+# number of AVX-512 FMA units, and otherwise -1.
+#
+function(gmx_detect_avx_512_fma_units RESULT)
+    if(CMAKE_CROSSCOMPILING)
+        set(${RESULT} -1 CACHE INTERNAL "Result of test for number of AVX-512 FMA units")
+    else()
+        set(AVX_512_FMA_UNIT_DETECTION_BINARY "${PROJECT_BINARY_DIR}/CMakeFiles/GmxDetectAvx512FmaUnits${CMAKE_EXECUTABLE_SUFFIX}")
+        if(NOT AVX_512_FMA_UNIT_DETECTION_COMPILED)
+
+            # Find flags required for AVX-512
+            gmx_find_simd_avx_512_flags(SIMD_AVX_512_C_SUPPORTED SIMD_AVX_512_CXX_SUPPORTED
+                                        SIMD_AVX_512_C_FLAGS SIMD_AVX_512_CXX_FLAGS)
+
+            if(${SIMD_AVX_512_CXX_SUPPORTED})
+                # Compile the detection program
+
+                set(_compile_definitions "-I${PROJECT_SOURCE_DIR}/src -DGMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE ${SIMD_AVX_512_CXX_FLAGS} ${GMX_STDLIB_CXX_FLAGS}")
+                try_compile(AVX_512_FMA_UNIT_DETECTION_COMPILED
+                    "${PROJECT_BINARY_DIR}"
+                    "${PROJECT_SOURCE_DIR}/src/gromacs/hardware/identifyavx512fmaunits.cpp"
+                    COMPILE_DEFINITIONS "${_compile_definitions}"
+                    LINK_LIBRARIES "${GMX_STDLIB_LIBRARIES}"
+                    OUTPUT_VARIABLE AVX_512_FMA_UNIT_DETECTION_COMPILED_OUTPUT
+                    COPY_FILE ${AVX_512_FMA_UNIT_DETECTION_BINARY})
+                if(NOT AVX_512_FMA_UNIT_DETECTION_COMPILED AND NOT RUN_AVX_512_FMA_UNIT_DETECTION_COMPILATION_QUIETLY)
+                  message(STATUS "Could not identify number of AVX-512 units - detection program did not compile")
+                endif()
+                set(RUN_AVX_512_FMA_UNIT_DETECTION_COMPILATION_QUIETLY TRUE CACHE INTERNAL "Keep quiet on any future compilation attempts")
+            endif()
+
+            if(AVX_512_FMA_UNIT_DETECTION_COMPILED)
+                # Run the program
+                if(NOT DEFINED ${RESULT})
+                    execute_process(COMMAND ${AVX_512_FMA_UNIT_DETECTION_BINARY}
+                        RESULT_VARIABLE RESULT_VAR
+                        OUTPUT_VARIABLE OUTPUT_VAR_TEMP
+                        ERROR_QUIET)
+                    if (RESULT_VAR EQUAL 0)
+                        string(STRIP "${OUTPUT_VAR_TEMP}" OUTPUT_VAR)
+                        set(${RESULT} ${OUTPUT_VAR_TEMP} CACHE INTERNAL "Result of test for number of AVX-512 FMA units")
+                    else()
+                        message(STATUS "Could not identify number of AVX-512 units - detection program did run successfully")
+                        set(${RESULT} -1 CACHE INTERNAL "Result of test for number of AVX-512 FMA units")
+                    endif()
+                endif()
+            endif()
+        endif()
+      endif()
+endfunction()
diff --git a/cmake/gmxDetectSimd.cmake b/cmake/gmxDetectSimd.cmake
index f1e5821a71..79987d2e8c 100644
--- a/cmake/gmxDetectSimd.cmake
+++ b/cmake/gmxDetectSimd.cmake
@@ -55,6 +55,8 @@ include(gmxDetectTargetArchitecture)
 gmx_detect_target_architecture()
 
 include(gmxDetectCpu)
+include(gmxDetectAvx512FmaUnits)
+
 function(gmx_suggest_simd _suggested_simd)
     if (NOT SUGGEST_SIMD_QUIETLY)
         message(STATUS "Detecting best SIMD instructions for this CPU")
@@ -77,7 +79,16 @@ function(gmx_suggest_simd _suggested_simd)
             if(CPU_DETECTION_FEATURES MATCHES " avx512er ")
                 set(OUTPUT_SIMD "AVX_512_KNL")
             elseif(CPU_DETECTION_FEATURES MATCHES " avx512f ")
-                set(OUTPUT_SIMD "AVX_512")
+                gmx_detect_avx_512_fma_units(NUMBER_OF_AVX_512_FMA_UNITS)
+                if(NUMBER_OF_AVX_512_FMA_UNITS EQUAL 2)
+                    set(OUTPUT_SIMD "AVX_512")
+                elseif(NUMBER_OF_AVX_512_FMA_UNITS EQUAL 1)
+                    message(STATUS "This host supports AVX-512, but only has 1 AVX-512 FMA unit, so AVX2 will be faster.")
+                    set(OUTPUT_SIMD "AVX2_256")
+                else()
+                    message(WARNING "Could not run code to detect number of AVX-512 FMA units - assuming 2.")
+                    set(OUTPUT_SIMD "AVX_512")
+                endif()
             elseif(CPU_DETECTION_FEATURES MATCHES " avx2 ")
                 if(CPU_DETECTION_FEATURES MATCHES " amd ")
                     set(OUTPUT_SIMD "AVX2_128")
diff --git a/cmake/gmxManageSimd.cmake b/cmake/gmxManageSimd.cmake
index a0de5bda67..e592197d5c 100644
--- a/cmake/gmxManageSimd.cmake
+++ b/cmake/gmxManageSimd.cmake
@@ -314,6 +314,24 @@ if (SIMD_CHANGED AND DEFINED SIMD_STATUS_MESSAGE)
     message(STATUS "${SIMD_STATUS_MESSAGE}")
 endif()
 
+# While AVX-512 is a more recent SIMD ISA than AVX2, some Intel CPUs only have
+# a single AVX-512 FMA unit, but two AVX2 FMA units, and then it is better to
+# use AVX2. The only way to test this is to execute a small timing loop.
+# To be able to recommend the user whether s/he should try AVX-512 instead of
+# AVX2, we need to compile a single file with AVX512 flags. We do this
+# automatically, but this option provides a way to turn it off in case it
+# breaks something. The actual test source file is built if
+# SIMD_AVX_512_CXX_SUPPORTED is set, so it will always be included if we have
+# GMX_SIMD=AVX_512.
+set(GMX_ENABLE_AVX512_TESTS ON CACHE INTERNAL "Compile AVX512 code to test FMA units, even when not using AVX512 SIMD")
+mark_as_advanced(GMX_ENABLE_AVX512_TESTS)
+
+if(GMX_ENABLE_AVX512_TESTS AND
+    (GMX_SIMD_ACTIVE STREQUAL "AVX_256" OR GMX_SIMD_ACTIVE STREQUAL "AVX2_256" OR GMX_SIMD_ACTIVE STREQUAL "AVX2_128"))
+    gmx_find_simd_avx_512_flags(SIMD_AVX_512_C_SUPPORTED SIMD_AVX_512_CXX_SUPPORTED
+                                SIMD_AVX_512_C_FLAGS SIMD_AVX_512_CXX_FLAGS)
+endif()
+
 # By default, 32-bit windows cannot pass SIMD (SSE/AVX) arguments in registers,
 # and even on 64-bit (all platforms) it is only used for a handful of arguments.
 # The __vectorcall (MSVC, from MSVC2013) or __regcall (ICC) calling conventions
diff --git a/cmake/gmxSimdFlags.cmake b/cmake/gmxSimdFlags.cmake
index cbde917b54..441c65e693 100644
--- a/cmake/gmxSimdFlags.cmake
+++ b/cmake/gmxSimdFlags.cmake
@@ -91,13 +91,13 @@ function(gmx_find_simd_sse2_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VARIAB
         "-msse2" "/arch:SSE2" "-hgnu")
 
     if(${SIMD_SSE2_C_FLAGS_RESULT})
-        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_SSE2_C_FLAGS}" PARENT_SCOPE)
+        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_SSE2_C_FLAGS}" CACHE INTERNAL "C flags required for SSE2 instructions")
     endif()
     if(${SIMD_SSE2_CXX_FLAGS_RESULT})
-        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_SSE2_CXX_FLAGS}" PARENT_SCOPE)
+        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_SSE2_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for SSE2 instructions")
     endif()
     set(${C_FLAGS_RESULT} ${SIMD_SSE2_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE2 C flags" FORCE)
-    set(${CXX_FLAGS_RESULT} ${SIMD_SSE2_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE2 CXX flags" FORCE)
+    set(${CXX_FLAGS_RESULT} ${SIMD_SSE2_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE2 C++ flags" FORCE)
 endfunction()
 
 # SSE4.1
@@ -112,13 +112,13 @@ function(gmx_find_simd_sse4_1_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VARI
         "-msse4.1" "/arch:SSE4.1" "/arch:SSE2" "-hgnu")
 
     if(${SIMD_SSE4_1_C_FLAGS_RESULT})
-        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_SSE4_1_C_FLAGS}" PARENT_SCOPE)
+        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_SSE4_1_C_FLAGS}" CACHE INTERNAL "C flags required for SSE4.1 instructions")
     endif()
     if(${SIMD_SSE4_1_CXX_FLAGS_RESULT})
-        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_SSE4_1_CXX_FLAGS}" PARENT_SCOPE)
+        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_SSE4_1_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for SSE4.1 instructions")
     endif()
     set(${C_FLAGS_RESULT} ${SIMD_SSE4_1_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE4.1 C flags" FORCE)
-    set(${CXX_FLAGS_RESULT} ${SIMD_SSE4_1_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE4.1 CXX flags" FORCE)
+    set(${CXX_FLAGS_RESULT} ${SIMD_SSE4_1_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE4.1 C++ flags" FORCE)
 endfunction()
 
 # AVX, but using only 128-bit instructions and FMA (AMD XOP processors)
@@ -202,13 +202,13 @@ function(gmx_find_simd_avx_128_fma_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS
     endif()
 
     if(${SIMD_AVX_128_FMA_C_FLAGS_RESULT})
-        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_GENERIC_C_FLAGS} ${SIMD_AVX_AMD_FMA_C_FLAGS} ${SIMD_AVX_XOP_C_FLAGS}" PARENT_SCOPE)
+        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_GENERIC_C_FLAGS} ${SIMD_AVX_AMD_FMA_C_FLAGS} ${SIMD_AVX_XOP_C_FLAGS}" CACHE INTERNAL "C flags required for 128-bit AVX with AMD FMA instructions")
     endif()
     if(${SIMD_AVX_128_FMA_CXX_FLAGS_RESULT})
-        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_GENERIC_CXX_FLAGS} ${SIMD_AVX_AMD_FMA_CXX_FLAGS} ${SIMD_AVX_XOP_CXX_FLAGS}" PARENT_SCOPE)
+        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_GENERIC_CXX_FLAGS} ${SIMD_AVX_AMD_FMA_CXX_FLAGS} ${SIMD_AVX_XOP_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for 128-bit AVX with AMD FMA instructions")
     endif()
     set(${C_FLAGS_RESULT} ${SIMD_AVX_128_FMA_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for 128-bit AVX with AMD FMA C flags" FORCE)
-    set(${CXX_FLAGS_RESULT} ${SIMD_AVX_128_FMA_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for 128-bit AVX with AMD FMA CXX flags" FORCE)
+    set(${CXX_FLAGS_RESULT} ${SIMD_AVX_128_FMA_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for 128-bit AVX with AMD FMA C++ flags" FORCE)
 endfunction()
 
 
@@ -223,13 +223,13 @@ function(gmx_find_simd_avx_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VARIABL
         "-mavx" "/arch:AVX" "-hgnu")
 
     if(${SIMD_AVX_C_FLAGS_RESULT})
-        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_C_FLAGS}" PARENT_SCOPE)
+        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_C_FLAGS}" CACHE INTERNAL "C flags required for AVX instructions")
     endif()
     if(${SIMD_AVX_CXX_FLAGS_RESULT})
-        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_CXX_FLAGS}" PARENT_SCOPE)
+        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for AVX instructions")
     endif()
     set(${C_FLAGS_RESULT} ${SIMD_AVX_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX C flags" FORCE)
-    set(${CXX_FLAGS_RESULT} ${SIMD_AVX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX CXX flags" FORCE)
+    set(${CXX_FLAGS_RESULT} ${SIMD_AVX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX C++ flags" FORCE)
 endfunction()
 
 # AVX2
@@ -243,13 +243,13 @@ function(gmx_find_simd_avx2_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VARIAB
         "-march=core-avx2" "-mavx2" "/arch:AVX" "-hgnu") # no AVX2-specific flag for MSVC yet
 
     if(${SIMD_AVX2_C_FLAGS_RESULT})
-        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX2_C_FLAGS}" PARENT_SCOPE)
+        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX2_C_FLAGS}" CACHE INTERNAL "C flags required for AVX2 instructions")
     endif()
     if(${SIMD_AVX2_CXX_FLAGS_RESULT})
-        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX2_CXX_FLAGS}" PARENT_SCOPE)
+        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX2_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for AVX2 instructions")
     endif()
     set(${C_FLAGS_RESULT} ${SIMD_AVX2_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX2 C flags" FORCE)
-    set(${CXX_FLAGS_RESULT} ${SIMD_AVX2_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX2 CXX flags" FORCE)
+    set(${CXX_FLAGS_RESULT} ${SIMD_AVX2_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX2 C++ flags" FORCE)
 endfunction()
 
 
@@ -265,13 +265,13 @@ function(gmx_find_simd_avx_512_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VAR
         "-xCORE-AVX512 -qopt-zmm-usage=high" "-xCORE-AVX512" "-mavx512f -mfma" "-mavx512f" "/arch:AVX" "-hgnu") # no AVX_512F flags known for MSVC yet. ICC should use ZMM if code anyhow uses ZMM
 
     if(${SIMD_AVX_512_C_FLAGS_RESULT})
-        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_512_C_FLAGS}" PARENT_SCOPE)
+        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_512_C_FLAGS}" CACHE INTERNAL "C flags required for AVX-512 instructions")
     endif()
     if(${SIMD_AVX_512_CXX_FLAGS_RESULT})
-        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_512_CXX_FLAGS}" PARENT_SCOPE)
+        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_512_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for AVX-512 instructions")
     endif()
     set(${C_FLAGS_RESULT} ${SIMD_AVX_512_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 C flags" FORCE)
-    set(${CXX_FLAGS_RESULT} ${SIMD_AVX_512_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 CXX flags" FORCE)
+    set(${CXX_FLAGS_RESULT} ${SIMD_AVX_512_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 C++ flags" FORCE)
 endfunction()
 
 
@@ -287,13 +287,13 @@ function(gmx_find_simd_avx_512_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VAR
         "-xMIC-AVX512" "-mavx512er -mfma" "-mavx512er" "/arch:AVX" "-hgnu") # no AVX_512ER flags known for MSVC yet
 
     if(${SIMD_AVX_512_KNL_C_FLAGS_RESULT})
-        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_512_KNL_C_FLAGS}" PARENT_SCOPE)
+        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_512_KNL_C_FLAGS}" CACHE INTERNAL "C flags required for AVX-512 for KNL instructions")
     endif()
     if(${SIMD_AVX_512_KNL_CXX_FLAGS_RESULT})
-        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_512_KNL_CXX_FLAGS}" PARENT_SCOPE)
+        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_512_KNL_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for AVX-512 for KNL instructions")
     endif()
     set(${C_FLAGS_RESULT} ${SIMD_AVX_512_KNL_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 for KNL C flags" FORCE)
-    set(${CXX_FLAGS_RESULT} ${SIMD_AVX_512_KNL_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 for KNL CXX flags" FORCE)
+    set(${CXX_FLAGS_RESULT} ${SIMD_AVX_512_KNL_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 for KNL C++ flags" FORCE)
 endfunction()
 
 
@@ -308,13 +308,13 @@ function(gmx_find_simd_arm_neon_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VA
         "-mfpu=neon-vfpv4" "-mfpu=neon" "")
 
     if(${SIMD_ARM_NEON_C_FLAGS_RESULT})
-        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_ARM_NEON_C_FLAGS}" PARENT_SCOPE)
+        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_ARM_NEON_C_FLAGS}" CACHE INTERNAL "C flags required for Arm Neon instructions")
     endif()
     if(${SIMD_ARM_NEON_CXX_FLAGS_RESULT})
-        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_ARM_NEON_CXX_FLAGS}" PARENT_SCOPE)
+        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_ARM_NEON_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for Arm Neon instructions")
     endif()
     set(${C_FLAGS_RESULT} ${SIMD_ARM_NEON_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon C flags" FORCE)
-    set(${CXX_FLAGS_RESULT} ${SIMD_ARM_NEON_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon CXX flags" FORCE)
+    set(${CXX_FLAGS_RESULT} ${SIMD_ARM_NEON_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon C++ flags" FORCE)
 endfunction()
 
 # Arm Neon Asimd (64-bit ARM)
@@ -328,13 +328,13 @@ function(gmx_find_simd_arm_neon_asimd_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FL
         "")
 
     if(${SIMD_ARM_NEON_ASIMD_C_FLAGS_RESULT})
-        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_ARM_NEON_ASIMD_C_FLAGS}" PARENT_SCOPE)
+        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_ARM_NEON_ASIMD_C_FLAGS}" CACHE INTERNAL "C flags required for Arm Neon Asimd instructions")
     endif()
     if(${SIMD_ARM_NEON_ASIMD_CXX_FLAGS_RESULT})
-        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_ARM_NEON_ASIMD_CXX_FLAGS}" PARENT_SCOPE)
+        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_ARM_NEON_ASIMD_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for Arm Neon Asimd instructions")
     endif()
     set(${C_FLAGS_RESULT} ${SIMD_ARM_NEON_ASIMD_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon Asimd C flags" FORCE)
-    set(${CXX_FLAGS_RESULT} ${SIMD_ARM_NEON_ASIMD_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon Asimd CXX flags" FORCE)
+    set(${CXX_FLAGS_RESULT} ${SIMD_ARM_NEON_ASIMD_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon Asimd C++ flags" FORCE)
 endfunction()
 
 # IBM VMX (power6)
@@ -348,13 +348,13 @@ function(gmx_find_simd_ibm_vmx_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VAR
         "-maltivec -mabi=altivec" "-qarch=auto -qaltivec")
 
     if(${SIMD_IBM_VMX_C_FLAGS_RESULT})
-        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_IBM_VMX_C_FLAGS}" PARENT_SCOPE)
+        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_IBM_VMX_C_FLAGS}" CACHE INTERNAL "C flags required for IBM VMX instructions")
     endif()
     if(${SIMD_IBM_VMX_CXX_FLAGS_RESULT})
-        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_IBM_VMX_CXX_FLAGS}" PARENT_SCOPE)
+        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_IBM_VMX_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for IBM VMX instructions")
     endif()
     set(${C_FLAGS_RESULT} ${SIMD_IBM_VMX_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VMX C flags" FORCE)
-    set(${CXX_FLAGS_RESULT} ${SIMD_IBM_VMX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VMX CXX flags" FORCE)
+    set(${CXX_FLAGS_RESULT} ${SIMD_IBM_VMX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VMX C++ flags" FORCE)
 endfunction()
 
 # IBM VSX (power7 and later)
@@ -368,11 +368,11 @@ function(gmx_find_simd_ibm_vsx_flags C_FLAGS_RESULT CXX_FLAGS_RESULT C_FLAGS_VAR
         "-mvsx" "-maltivec -mabi=altivec" "-qarch=auto -qaltivec")
 
     if(${SIMD_IBM_VSX_C_FLAGS_RESULT})
-        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_IBM_VSX_C_FLAGS}" PARENT_SCOPE)
+        set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_IBM_VSX_C_FLAGS}" CACHE INTERNAL "C flags required for IBM VSX instructions")
     endif()
     if(${SIMD_IBM_VSX_CXX_FLAGS_RESULT})
-        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_IBM_VSX_CXX_FLAGS}" PARENT_SCOPE)
+        set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_IBM_VSX_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for IBM VSX instructions")
     endif()
     set(${C_FLAGS_RESULT} ${SIMD_IBM_VSX_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VSX C flags" FORCE)
-    set(${CXX_FLAGS_RESULT} ${SIMD_IBM_VSX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VSX CXX flags" FORCE)
+    set(${CXX_FLAGS_RESULT} ${SIMD_IBM_VSX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VSX C++ flags" FORCE)
 endfunction()
diff --git a/src/config.h.cmakein b/src/config.h.cmakein
index b4f977b865..dfdb3f10d7 100644
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -134,6 +134,9 @@
 /* Target mantissa accuracy for SIMD double precision math */
 #define GMX_SIMD_ACCURACY_BITS_DOUBLE @GMX_SIMD_ACCURACY_BITS_DOUBLE@
 
+/* Enable code that requires AVX-512 instruction support, without GMX_SIMD=AVX_512 */
+#cmakedefine01 SIMD_AVX_512_CXX_SUPPORTED
+
 /* Whether a double-precision configuration may target accuracy equivalent to single precision */
 #cmakedefine01 GMX_RELAXED_DOUBLE_PRECISION
 
diff --git a/src/gromacs/CMakeLists.txt b/src/gromacs/CMakeLists.txt
index cf48a183b7..37d2e029d4 100644
--- a/src/gromacs/CMakeLists.txt
+++ b/src/gromacs/CMakeLists.txt
@@ -40,6 +40,7 @@ endif()
 
 set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
 set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
+set_property(GLOBAL PROPERTY GMX_AVX_512_SOURCE)
 
 function (_gmx_add_files_to_property PROPERTY)
     foreach (_file ${ARGN})
@@ -191,6 +192,10 @@ if (HAS_NO_UNUSED)
 endif()
 set_source_files_properties(selection/scanner.cpp PROPERTIES COMPILE_FLAGS "${_scanner_cpp_compiler_flags}")
 
+if(SIMD_AVX_512_CXX_SUPPORTED)
+    set_source_files_properties(hardware/identifyavx512fmaunits.cpp PROPERTIES COMPILE_FLAGS "${SIMD_AVX_512_CXX_FLAGS}")
+endif()
+
 gmx_setup_tng_for_libgromacs()
 
 target_link_libraries(libgromacs
diff --git a/src/gromacs/hardware/CMakeLists.txt b/src/gromacs/hardware/CMakeLists.txt
index c758f54374..6bc268c766 100644
--- a/src/gromacs/hardware/CMakeLists.txt
+++ b/src/gromacs/hardware/CMakeLists.txt
@@ -38,6 +38,7 @@ gmx_add_libgromacs_sources(
     gpu_hw_info.cpp
     hardwaretopology.cpp
     printhardware.cpp
+    identifyavx512fmaunits.cpp
     )
 
 if (BUILD_TESTING)
diff --git a/src/gromacs/hardware/identifyavx512fmaunits.cpp b/src/gromacs/hardware/identifyavx512fmaunits.cpp
new file mode 100644
index 0000000000..93f1c308c9
--- /dev/null
+++ b/src/gromacs/hardware/identifyavx512fmaunits.cpp
@@ -0,0 +1,383 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements a routine to check the number of AVX512 fma units
+ *
+ * Just as the CpuInfo code, we need to be able to compile this file in stand-alone mode
+ * to set the SIMD acceleration and similar things during CMake configuration.
+ */
+
+#ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
+#include "gmxpre.h"
+#endif
+
+#include "identifyavx512fmaunits.h"
+
+#ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
+#define SIMD_AVX_512_CXX_SUPPORTED 1
+#else
+#include "config.h"
+#endif
+
+#if SIMD_AVX_512_CXX_SUPPORTED
+#include <immintrin.h>
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+#endif // SIMD_AVX_512_CXX_SUPPORTED
+
+#include <cstdint>
+#include <cstdio>
+
+#include <algorithm>
+#include <mutex>
+
+#ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
+#include "gromacs/hardware/cpuinfo.h"
+#endif
+
+namespace gmx
+{
+
+namespace
+{
+
+#if SIMD_AVX_512_CXX_SUPPORTED
+// Use a local routine to read the timestep counter just on x86 to avoid dependence
+// on the Gromacs cycle counter module.
+uint64_t rdtscp(void)
+{
+#ifdef MSC_VER
+    unsigned int ui;
+    return static_cast<uint64_t>(__rdtscp(&ui));
+#else
+    uint32_t low;
+    uint32_t high;
+
+    __asm__ __volatile__("rdtscp" : "=a" (low), "=d" (high) :: "ecx" );
+    return (static_cast<uint64_t>(high) << 32) | low;
+#endif
+}
+
+/*\ brief Loop over mixed FMA and shuffle AVX512 instructions
+ *
+ * This function executes a meaningless loop that includes both
+ * FMA and shuffle instructions from the AVX512 instruction set.
+ * We need a bit of complex logic to make sure it cannot be
+ * optimized away by the compiler.
+ *
+ * \param loopCount  Number of iterations. Each iteration will
+ *                   execute 12 FMA and 12 shuffle instructions.
+ * \param seed       A double-precision number between 0 and 1.
+ *                   To be really certain the loop is not optimized
+ *                   away, you should use some timing-related
+ *                   function to create this seed at runtime.
+ * \return Meaningless floating-point number. Make sure you
+ *         add this number to some variable and conditionally
+ *         issue a print statement e.g. if it is negative
+ *         (which should not happen), again to make sure the loop
+ *         cannot be optimized away.
+ */
+double
+executeFmaAndShuffleLoop(int     loopCount,
+                         double  seed)
+{
+    // Make sure all variables are different to avoid gcc optimizing them away
+    __m512d   d0   = _mm512_set1_pd(1.0-0.01*seed);
+    __m512d   d1   = _mm512_set1_pd(1.0-0.02*seed);
+    __m512d   d2   = _mm512_set1_pd(1.0-0.03*seed);
+    __m512d   d3   = _mm512_set1_pd(1.0-0.04*seed);
+    __m512d   d4   = _mm512_set1_pd(1.0-0.05*seed);
+    __m512d   d5   = _mm512_set1_pd(1.0-0.06*seed);
+    __m512d   d6   = _mm512_set1_pd(1.0-0.07*seed);
+    __m512d   d7   = _mm512_set1_pd(1.0-0.08*seed);
+    __m512d   d8   = _mm512_set1_pd(1.0-0.09*seed);
+    __m512d   d9   = _mm512_set1_pd(1.0-0.10*seed);
+    __m512d   d10  = _mm512_set1_pd(1.0-0.11*seed);
+    __m512d   d11  = _mm512_set1_pd(1.0-0.12*seed);
+    __m512d   eps  = _mm512_set1_pd(1e-6);
+    __m512i   i0   = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    __m512i   i1   = _mm512_set_epi32(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    __m512i   i2   = _mm512_set_epi32(1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2);
+    __m512i   i3   = _mm512_set_epi32(2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3);
+    __m512i   i4   = _mm512_set_epi32(3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4);
+    __m512i   i5   = _mm512_set_epi32(4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5);
+    __m512i   i6   = _mm512_set_epi32(5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6);
+    __m512i   i7   = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+    __m512i   i8   = _mm512_set_epi32(8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9);
+    __m512i   i9   = _mm512_set_epi32(9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10);
+    __m512i   i10  = _mm512_set_epi32(10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11);
+    __m512i   i11  = _mm512_set_epi32(11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12);
+    __m512i   idx  = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13);
+    __mmask16 mask = static_cast<uint16_t>(0xffff);
+
+    for (int i = 0; i < loopCount; i++)
+    {
+        d0  = _mm512_fmadd_pd(d0, d0, eps);
+        d1  = _mm512_fmadd_pd(d1, d1, eps);
+        d2  = _mm512_fmadd_pd(d2, d2, eps);
+        d3  = _mm512_fmadd_pd(d3, d3, eps);
+        d4  = _mm512_fmadd_pd(d4, d4, eps);
+        d5  = _mm512_fmadd_pd(d5, d5, eps);
+        d6  = _mm512_fmadd_pd(d6, d6, eps);
+        d7  = _mm512_fmadd_pd(d7, d7, eps);
+        d8  = _mm512_fmadd_pd(d8, d8, eps);
+        d9  = _mm512_fmadd_pd(d9, d9, eps);
+        d10 = _mm512_fmadd_pd(d10, d10, eps);
+        d11 = _mm512_fmadd_pd(d11, d11, eps);
+        // plain permutevar is not yet available in gcc-6.4
+        i0  = _mm512_maskz_permutexvar_epi32(mask, idx, i0);
+        i1  = _mm512_maskz_permutexvar_epi32(mask, idx, i1);
+        i2  = _mm512_maskz_permutexvar_epi32(mask, idx, i2);
+        i3  = _mm512_maskz_permutexvar_epi32(mask, idx, i3);
+        i4  = _mm512_maskz_permutexvar_epi32(mask, idx, i4);
+        i5  = _mm512_maskz_permutexvar_epi32(mask, idx, i5);
+        i6  = _mm512_maskz_permutexvar_epi32(mask, idx, i6);
+        i7  = _mm512_maskz_permutexvar_epi32(mask, idx, i7);
+        i8  = _mm512_maskz_permutexvar_epi32(mask, idx, i8);
+        i9  = _mm512_maskz_permutexvar_epi32(mask, idx, i9);
+        i10 = _mm512_maskz_permutexvar_epi32(mask, idx, i10);
+        i11 = _mm512_maskz_permutexvar_epi32(mask, idx, i11);
+    }
+
+    // Make sure we use all variables in the loop to return a result
+    i0  = _mm512_add_epi32(i0, i1);
+    i2  = _mm512_add_epi32(i2, i3);
+    i4  = _mm512_add_epi32(i4, i5);
+    i6  = _mm512_add_epi32(i6, i7);
+    i8  = _mm512_add_epi32(i8, i9);
+    i10 = _mm512_add_epi32(i10, i11);
+    i0  = _mm512_add_epi32(i0, i2);
+    i4  = _mm512_add_epi32(i4, i6);
+    i8  = _mm512_add_epi32(i8, i10);
+    i0  = _mm512_add_epi32(i0, i4);
+    i0  = _mm512_add_epi32(i0, i8);
+
+    d0 = _mm512_fmadd_pd(d0, d1, d2);
+    d3 = _mm512_fmadd_pd(d3, d4, d5);
+    d6 = _mm512_fmadd_pd(d6, d7, d8);
+    d9 = _mm512_fmadd_pd(d9, d10, d11);
+    d0 = _mm512_add_pd(d0, d3);
+    d6 = _mm512_add_pd(d6, d9);
+    d0 = _mm512_add_pd(d0, d6);
+
+    double data[8];
+    int    idata[16];
+    _mm512_storeu_pd(data, d0);
+    _mm512_storeu_si512(idata, i0);
+
+    double d = 0;
+
+    for (int i = 0; i < 8; i++)
+    {
+        d += data[i] * idata[2*i] * idata[2*i+1];
+    }
+
+    return d;
+}
+
+/*\ brief Loop over FMA AVX512 instructions
+ *
+ * This function executes a meaningless loop that includes only
+ * FMA instructions from the AVX512 instruction set.
+ * We need a bit of complex logic to make sure it cannot be
+ * optimized away by the compiler.
+ *
+ * \param loopCount  Number of iterations. Each iteration will
+ *                   execute 12 FMA instructions.
+ * \param seed       A double-precision number between 0 and 1.
+ *                   To be really certain the loop is not optimized
+ *                   away, you should use some timing-related
+ *                   function to create this seed at runtime.
+ * \return Meaningless floating-point number. Make sure you
+ *         add this number to some variable and conditionally
+ *         issue a print statement e.g. if it is negative
+ *         (which should not happen), again to make sure the loop
+ *         cannot be optimized away.
+ */
+double
+executeFmaOnlyLoop(int    loopCount,
+                   double seed)
+{
+    // Make sure all variables are different to avoid gcc optimizing them away
+    __m512d d0  = _mm512_set1_pd(1.0-0.01*seed);
+    __m512d d1  = _mm512_set1_pd(1.0-0.02*seed);
+    __m512d d2  = _mm512_set1_pd(1.0-0.03*seed);
+    __m512d d3  = _mm512_set1_pd(1.0-0.04*seed);
+    __m512d d4  = _mm512_set1_pd(1.0-0.05*seed);
+    __m512d d5  = _mm512_set1_pd(1.0-0.06*seed);
+    __m512d d6  = _mm512_set1_pd(1.0-0.07*seed);
+    __m512d d7  = _mm512_set1_pd(1.0-0.08*seed);
+    __m512d d8  = _mm512_set1_pd(1.0-0.09*seed);
+    __m512d d9  = _mm512_set1_pd(1.0-0.10*seed);
+    __m512d d10 = _mm512_set1_pd(1.0-0.11*seed);
+    __m512d d11 = _mm512_set1_pd(1.0-0.12*seed);
+    __m512d eps = _mm512_set1_pd(1e-6);
+
+    for (int i = 0; i < loopCount; i++)
+    {
+        d0  = _mm512_fmadd_pd(d0, d0, eps);
+        d1  = _mm512_fmadd_pd(d1, d1, eps);
+        d2  = _mm512_fmadd_pd(d2, d2, eps);
+        d3  = _mm512_fmadd_pd(d3, d3, eps);
+        d4  = _mm512_fmadd_pd(d4, d4, eps);
+        d5  = _mm512_fmadd_pd(d5, d5, eps);
+        d6  = _mm512_fmadd_pd(d6, d6, eps);
+        d7  = _mm512_fmadd_pd(d7, d7, eps);
+        d8  = _mm512_fmadd_pd(d8, d8, eps);
+        d9  = _mm512_fmadd_pd(d9, d9, eps);
+        d10 = _mm512_fmadd_pd(d10, d10, eps);
+        d11 = _mm512_fmadd_pd(d11, d11, eps);
+    }
+
+    // Make sure we use all variables in the loop to return a result
+    d0 = _mm512_fmadd_pd(d0, d1, d2);
+    d3 = _mm512_fmadd_pd(d3, d4, d5);
+    d6 = _mm512_fmadd_pd(d6, d7, d8);
+    d9 = _mm512_fmadd_pd(d9, d10, d11);
+    d0 = _mm512_add_pd(d0, d3);
+    d6 = _mm512_add_pd(d6, d9);
+    d0 = _mm512_add_pd(d0, d6);
+
+    double data[8];
+
+    _mm512_storeu_pd(data, d0);
+
+    double d = 0;
+
+    for (int i = 0; i < 8; i++)
+    {
+        d += data[i];
+    }
+    return d;
+}
+
+int
+checkDualAvx512FmaUnits()
+{
+    uint64_t timeFmaAndShuf = 1e9;             // Large value
+    uint64_t timeFmaOnly    = 1e9;             // Large value
+    double   dummy;
+    double   seed = (rdtscp() & 0xff) / 256.0; // Create an unpredictable small number between 0 and 1
+
+    // Make sure the CPU is in AVX512 mode by executing a fairly long loop
+    dummy = executeFmaOnlyLoop(100000, seed);
+
+    // Execute the mixed FMA/shuffle loop three times
+    for (int i = 0; i < 3; i++)
+    {
+        uint64_t start = rdtscp();
+        dummy += executeFmaAndShuffleLoop(1000, seed);
+        uint64_t res = rdtscp() - start;
+        timeFmaAndShuf = std::min(timeFmaAndShuf, res);
+    }
+
+    // Execute the FMA-only loop three times
+    for (int i = 0; i < 3; i++)
+    {
+        uint64_t start = rdtscp();
+        dummy += executeFmaOnlyLoop(1000, seed);
+        uint64_t res = rdtscp() - start;
+        timeFmaOnly = std::min(timeFmaOnly, res);
+    }
+
+    // Dummy can never be negative, but by using it in the
+    // conditional it cannot be optimized away.
+    return (timeFmaAndShuf > 1.5 * timeFmaOnly || dummy < 0);
+}
+
+
+#endif  // SIMD_AVX_512_CXX_SUPPORTED
+
+/*! \brief Mutex to guard the execution of the timing test
+ *
+ * We only execute the test once, and return the saved result
+ * on subsequent calls.
+ */
+std::mutex initMutex;
+
+}      // namespace anonymous
+
+int
+identifyAvx512FmaUnits()
+{
+    static bool initialized = false;
+    static int  result      = false;
+
+    if (!initialized)
+    {
+        std::lock_guard<std::mutex>  lock(initMutex);
+
+        if (!initialized)
+        {
+            // For the standalone test binary we assume it will
+            // only be executed on AVX512 hardware, but for the
+            // library version we check the hardware support.
+#ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
+            bool haveAvx512Hardware = true;
+#else
+            bool haveAvx512Hardware = CpuInfo::detect().feature(CpuInfo::Feature::X86_Avx512F);
+#endif
+
+            if (haveAvx512Hardware)
+            {
+#if SIMD_AVX_512_CXX_SUPPORTED
+                result = checkDualAvx512FmaUnits() ? 2 : 1;
+#else
+                result = -1; // Cannot run the tests
+#endif
+            }
+            else
+            {
+                result = 0; // Not AVX-512 hardware
+            }
+            initialized = true;
+        }
+    }
+    return result;
+}
+
+} // namespace gmx
+
+#ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
+int
+main()
+{
+    printf("%d\n", gmx::identifyAvx512FmaUnits());
+    return 0;
+}
+#endif
diff --git a/src/gromacs/hardware/identifyavx512fmaunits.h b/src/gromacs/hardware/identifyavx512fmaunits.h
new file mode 100644
index 0000000000..bcd6f13679
--- /dev/null
+++ b/src/gromacs/hardware/identifyavx512fmaunits.h
@@ -0,0 +1,57 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Defines a routine to check the number of AVX512 fma units
+ *
+ * \author Erik Lindahl <erik.lindahl@gmail.com>
+ * \inlibraryapi
+ * \ingroup module_hardware
+ */
+
+namespace gmx
+{
+
+/*! \brief Test whether machine has dual AVX512 FMA units
+ *
+ * \return 1 or 2 for the number of AVX512 FMA units if AVX512
+ *         support is present, 0 if we know the hardware does
+ *         not have AVX512 support, or -1 if the test cannot
+ *         run because the compiler lacked AVX512 support.
+ */
+int
+identifyAvx512FmaUnits();
+
+} // namespace gmx
diff --git a/src/gromacs/hardware/printhardware.cpp b/src/gromacs/hardware/printhardware.cpp
index d3b432d424..17b120765b 100644
--- a/src/gromacs/hardware/printhardware.cpp
+++ b/src/gromacs/hardware/printhardware.cpp
@@ -47,6 +47,7 @@
 #include "gromacs/hardware/cpuinfo.h"
 #include "gromacs/hardware/hardwaretopology.h"
 #include "gromacs/hardware/hw_info.h"
+#include "gromacs/hardware/identifyavx512fmaunits.h"
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/simd/support.h"
 #include "gromacs/utility/basedefinitions.h"
@@ -227,19 +228,24 @@ static std::string detected_hardware_string(const gmx_hw_info_t *hwinfo,
         s += gmx::formatString("\n");
     }
 
-    s += gmx::formatString("    SIMD instructions most likely to fit this hardware: %s",
-                           gmx::simdString(static_cast<gmx::SimdType>(hwinfo->simd_suggest_min)).c_str());
-
-    if (hwinfo->simd_suggest_max > hwinfo->simd_suggest_min)
+    if (cpuInfo.feature(gmx::CpuInfo::Feature::X86_Avx512F))
     {
-        s += gmx::formatString(" - %s", gmx::simdString(static_cast<gmx::SimdType>(hwinfo->simd_suggest_max)).c_str());
+        int avx512fmaunits = gmx::identifyAvx512FmaUnits();
+        s += gmx::formatString("    Number of AVX-512 FMA units:");
+        if (avx512fmaunits > 0)
+        {
+            s += gmx::formatString(" %d", avx512fmaunits);
+            if (avx512fmaunits == 1)
+            {
+                s += gmx::formatString(" (AVX2 is faster w/o 2 AVX-512 FMA units)");
+            }
+        }
+        else
+        {
+            s += gmx::formatString(" Cannot run AVX-512 detection - assuming 2");
+        }
+        s += gmx::formatString("\n");
     }
-    s += gmx::formatString("\n");
-
-    s += gmx::formatString("    SIMD instructions selected at GROMACS compile time: %s\n",
-                           gmx::simdString(gmx::simdCompiled()).c_str());
-
-    s += gmx::formatString("\n");
 
     s += gmx::formatString("  Hardware topology: ");
     switch (hwTop.supportLevel())
@@ -367,14 +373,9 @@ void gmx_print_detected_hardware(FILE *fplog, const t_commrec *cr,
         fprintf(fplog, "%s\n", detected.c_str());
     }
 
-    if (MULTIMASTER(cr))
-    {
-        std::string detected;
-
-        detected = detected_hardware_string(hwinfo, FALSE);
-
-        fprintf(stderr, "%s\n", detected.c_str());
-    }
+    // Do not spam stderr with all our internal information unless
+    // there was something that actually went wrong; general information
+    // belongs in the logfile.
 
     /* Check the compiled SIMD instruction set against that of the node
      * with the lowest SIMD level support (skip if SIMD detection did not work)
diff --git a/src/gromacs/simd/support.cpp b/src/gromacs/simd/support.cpp
index 3ba4f74d7a..459987ba10 100644
--- a/src/gromacs/simd/support.cpp
+++ b/src/gromacs/simd/support.cpp
@@ -55,6 +55,8 @@
 #include <string>
 
 #include "gromacs/hardware/cpuinfo.h"
+#include "gromacs/hardware/identifyavx512fmaunits.h"
+#include "gromacs/utility/stringutil.h"
 
 namespace gmx
 {
@@ -105,7 +107,8 @@ simdSuggested(const CpuInfo &c)
                 }
                 else if (c.feature(CpuInfo::Feature::X86_Avx512F))
                 {
-                    suggested = SimdType::X86_Avx512;
+                    // If we could not identify the number of AVX512 FMA units we assume 2
+                    suggested = ( identifyAvx512FmaUnits() == 1 ) ? SimdType::X86_Avx2 : SimdType::X86_Avx512;
                 }
                 else if (c.feature(CpuInfo::Feature::X86_Avx2))
                 {
@@ -236,38 +239,83 @@ simdCheck(gmx::SimdType    wanted,
           FILE *           log,
           bool             warnToStdErr)
 {
-    SimdType compiled = simdCompiled();
+    SimdType             compiled  = simdCompiled();
 
-    // Normally it is close to catastrophic if the compiled SIMD type is larger than
-    // the supported one, but AVX128Fma is an exception: AMD CPUs will (strongly) prefer
-    // AVX128Fma, but they will work fine with AVX too. Thus, make an exception for this.
-    if (compiled > wanted && !(compiled == SimdType::X86_Avx && wanted == SimdType::X86_Avx128Fma))
+    gmx::TextLineWrapper wrapper;
+    std::string          logMsg;
+    std::string          warnMsg;
+
+    wrapper.settings().setLineLength(78);
+
+    if (compiled == SimdType::X86_Avx2 && wanted == SimdType::X86_Avx512)
     {
-        fprintf(stderr, "Warning: SIMD instructions newer than hardware. Program will likely crash.\n"
-                "SIMD instructions most likely to fit this hardware: %s\n"
-                "SIMD instructions selected at GROMACS compile time: %s\n\n",
-                simdString(wanted).c_str(),
-                simdString(compiled).c_str());
+        logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
+                                                   "SIMD instructions selected at compile time:       %s\n"
+                                                   "This program was compiled for different hardware than you are running on, "
+                                                   "which could influence performance. This build might have been configured on "
+                                                   "a login node with only a single AVX-512 FMA unit (in which case AVX2 is faster), "
+                                                   "while the node you are running on has dual AVX-512 FMA units.",
+                                                   simdString(wanted).c_str(), simdString(compiled).c_str()));
+        warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
+                                                    simdString(compiled).c_str(), simdString(wanted).c_str()));
+    }
+    else if (compiled == SimdType::X86_Avx512 && wanted == SimdType::X86_Avx2 && identifyAvx512FmaUnits() == 1)
+    {
+        // The reason for explicitly checking the number of FMA units above is to avoid triggering
+        // this conditional if the AVX2 SIMD was requested by some other node in a heterogeneous MPI run.
+        logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
+                                                   "SIMD instructions selected at compile time:       %s\n"
+                                                   "This program was compiled for different hardware than you are running on, "
+                                                   "which could influence performance."
+                                                   "This host supports AVX-512, but since it only has 1 AVX-512"
+                                                   "FMA unit, it would be faster to use AVX2 instead.",
+                                                   simdString(wanted).c_str(), simdString(compiled).c_str()));
+        warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
+                                                    simdString(compiled).c_str(), simdString(wanted).c_str()));
+    }
+    else if (compiled == SimdType::X86_Avx2 && wanted == SimdType::X86_Avx2_128)
+    {
+        logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
+                                                   "SIMD instructions selected at compile time:       %s\n"
+                                                   "This program was compiled for different hardware than you are running on, "
+                                                   "which could influence performance."
+                                                   "Ryzen/Threadripper CPUs support 256-bit AVX2, but 128-bit is faster.",
+                                                   simdString(wanted).c_str(), simdString(compiled).c_str()));
+        warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
+                                                    simdString(compiled).c_str(), simdString(wanted).c_str()));
+    }
+    else if (compiled > wanted && !(compiled == SimdType::X86_Avx && wanted == SimdType::X86_Avx128Fma))
+    {
+        // Normally it is close to catastrophic if the compiled SIMD type is larger than
+        // the supported one, but AVX128Fma is an exception: AMD CPUs will (strongly) prefer
+        // AVX128Fma, but they will work fine with AVX too. Thus, make an exception for this.
+        logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
+                                                   "SIMD instructions selected at compile time:       %s\n"
+                                                   "Compiled SIMD newer than requested; program might crash.",
+                                                   simdString(wanted).c_str(), simdString(compiled).c_str()));
+        warnMsg = logMsg;
     }
     else if (wanted != compiled)
     {
         // This warning will also occur if compiled is X86_Avx and wanted is X86_Avx128Fma
+        logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
+                                                   "SIMD instructions selected at compile time:       %s\n"
+                                                   "This program was compiled for different hardware than you are running on, "
+                                                   "which could influence performance.",
+                                                   simdString(wanted).c_str(), simdString(compiled).c_str()));
+        warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
+                                                    simdString(compiled).c_str(), simdString(wanted).c_str()));
+    }
 
-        if (log != nullptr)
-        {
-            fprintf(log, "\nBinary not matching hardware - you might be losing performance.\n"
-                    "SIMD instructions most likely to fit this hardware: %s\n"
-                    "SIMD instructions selected at GROMACS compile time: %s\n\n",
-                    simdString(wanted).c_str(),
-                    simdString(compiled).c_str());
-        }
-        if (warnToStdErr)
-        {
-            fprintf(stderr, "Compiled SIMD instructions: %s, GROMACS could use %s on this machine, which is better.\n\n",
-                    simdString(compiled).c_str(),
-                    simdString(wanted).c_str());
-        }
+    if (log != nullptr)
+    {
+        fprintf(log, "%s\n", logMsg.c_str());
+    }
+    if (warnToStdErr)
+    {
+        fprintf(stderr, "%s\n", warnMsg.c_str());
     }
+
     return (wanted == compiled);
 }
 
-- 
2.11.4.GIT