From a3a2713014e2cf10e1511a21e5d21012f9d35f46 Mon Sep 17 00:00:00 2001 From: Erik Lindahl Date: Mon, 26 Nov 2012 23:34:31 +0100 Subject: [PATCH] Double precision SSE2 kernels For nonbonded interactions with group cutoff scheme. Change-Id: I2037f94c24126f103fa232e77783775a62821f54 --- src/gmxlib/nonbonded/CMakeLists.txt | 6 +- .../kernelutil_x86_sse2_double.h | 840 +++++ .../make_nb_kernel_sse2_double.py | 489 +++ .../nb_kernel400_sse2_double.c | 329 -- .../nb_kernel400_sse2_double.h | 93 - .../nb_kernel410_sse2_double.c | 377 --- .../nb_kernel410_sse2_double.h | 92 - .../nb_kernel430_sse2_double.c | 473 --- .../nb_kernel430_sse2_double.h | 93 - ...ernel_ElecCSTab_VdwCSTab_GeomP1P1_sse2_double.c | 712 +++++ ...ernel_ElecCSTab_VdwCSTab_GeomW3P1_sse2_double.c | 1190 +++++++ ...ernel_ElecCSTab_VdwCSTab_GeomW3W3_sse2_double.c | 2376 ++++++++++++++ ...ernel_ElecCSTab_VdwCSTab_GeomW4P1_sse2_double.c | 1350 ++++++++ ...ernel_ElecCSTab_VdwCSTab_GeomW4W4_sse2_double.c | 2548 +++++++++++++++ ...b_kernel_ElecCSTab_VdwLJ_GeomP1P1_sse2_double.c | 628 ++++ ...b_kernel_ElecCSTab_VdwLJ_GeomW3P1_sse2_double.c | 1106 +++++++ ...b_kernel_ElecCSTab_VdwLJ_GeomW3W3_sse2_double.c | 2292 ++++++++++++++ ...b_kernel_ElecCSTab_VdwLJ_GeomW4P1_sse2_double.c | 1230 ++++++++ ...b_kernel_ElecCSTab_VdwLJ_GeomW4W4_sse2_double.c | 2428 +++++++++++++++ ...kernel_ElecCSTab_VdwNone_GeomP1P1_sse2_double.c | 557 ++++ ...kernel_ElecCSTab_VdwNone_GeomW3P1_sse2_double.c | 1035 ++++++ ...kernel_ElecCSTab_VdwNone_GeomW3W3_sse2_double.c | 2227 +++++++++++++ ...kernel_ElecCSTab_VdwNone_GeomW4P1_sse2_double.c | 1035 ++++++ ...kernel_ElecCSTab_VdwNone_GeomW4W4_sse2_double.c | 2227 +++++++++++++ ...kernel_ElecCoul_VdwCSTab_GeomP1P1_sse2_double.c | 680 ++++ ...kernel_ElecCoul_VdwCSTab_GeomW3P1_sse2_double.c | 1030 ++++++ ...kernel_ElecCoul_VdwCSTab_GeomW3W3_sse2_double.c | 1832 +++++++++++ ...kernel_ElecCoul_VdwCSTab_GeomW4P1_sse2_double.c | 1158 +++++++ ...kernel_ElecCoul_VdwCSTab_GeomW4W4_sse2_double.c | 1972 
++++++++++++ ...nb_kernel_ElecCoul_VdwLJ_GeomP1P1_sse2_double.c | 546 ++++ ...nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sse2_double.c | 896 ++++++ ...nb_kernel_ElecCoul_VdwLJ_GeomW3W3_sse2_double.c | 1698 ++++++++++ ...nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sse2_double.c | 1024 ++++++ ...nb_kernel_ElecCoul_VdwLJ_GeomW4W4_sse2_double.c | 1838 +++++++++++ ..._kernel_ElecCoul_VdwNone_GeomP1P1_sse2_double.c | 483 +++ ..._kernel_ElecCoul_VdwNone_GeomW3P1_sse2_double.c | 833 +++++ ..._kernel_ElecCoul_VdwNone_GeomW3W3_sse2_double.c | 1641 ++++++++++ ..._kernel_ElecCoul_VdwNone_GeomW4P1_sse2_double.c | 833 +++++ ..._kernel_ElecCoul_VdwNone_GeomW4W4_sse2_double.c | 1641 ++++++++++ ..._kernel_ElecEwSh_VdwLJSh_GeomP1P1_sse2_double.c | 669 ++++ ..._kernel_ElecEwSh_VdwLJSh_GeomW3P1_sse2_double.c | 1193 +++++++ ..._kernel_ElecEwSh_VdwLJSh_GeomW3W3_sse2_double.c | 2517 +++++++++++++++ ..._kernel_ElecEwSh_VdwLJSh_GeomW4P1_sse2_double.c | 1357 ++++++++ ..._kernel_ElecEwSh_VdwLJSh_GeomW4W4_sse2_double.c | 2693 ++++++++++++++++ ..._kernel_ElecEwSh_VdwNone_GeomP1P1_sse2_double.c | 596 ++++ ..._kernel_ElecEwSh_VdwNone_GeomW3P1_sse2_double.c | 1120 +++++++ ..._kernel_ElecEwSh_VdwNone_GeomW3W3_sse2_double.c | 2450 +++++++++++++++ ..._kernel_ElecEwSh_VdwNone_GeomW4P1_sse2_double.c | 1120 +++++++ ..._kernel_ElecEwSh_VdwNone_GeomW4W4_sse2_double.c | 2450 +++++++++++++++ ..._kernel_ElecEwSw_VdwLJSw_GeomP1P1_sse2_double.c | 758 +++++ ..._kernel_ElecEwSw_VdwLJSw_GeomW3P1_sse2_double.c | 1396 +++++++++ ..._kernel_ElecEwSw_VdwLJSw_GeomW3W3_sse2_double.c | 3062 ++++++++++++++++++ ..._kernel_ElecEwSw_VdwLJSw_GeomW4P1_sse2_double.c | 1608 ++++++++++ ..._kernel_ElecEwSw_VdwLJSw_GeomW4W4_sse2_double.c | 3286 ++++++++++++++++++++ ..._kernel_ElecEwSw_VdwNone_GeomP1P1_sse2_double.c | 681 ++++ ..._kernel_ElecEwSw_VdwNone_GeomW3P1_sse2_double.c | 1319 ++++++++ ..._kernel_ElecEwSw_VdwNone_GeomW3W3_sse2_double.c | 2991 ++++++++++++++++++ ..._kernel_ElecEwSw_VdwNone_GeomW4P1_sse2_double.c | 1319 ++++++++ 
..._kernel_ElecEwSw_VdwNone_GeomW4W4_sse2_double.c | 2991 ++++++++++++++++++ ...b_kernel_ElecEw_VdwCSTab_GeomP1P1_sse2_double.c | 737 +++++ ...b_kernel_ElecEw_VdwCSTab_GeomW3P1_sse2_double.c | 1185 +++++++ ...b_kernel_ElecEw_VdwCSTab_GeomW3W3_sse2_double.c | 2281 ++++++++++++++ ...b_kernel_ElecEw_VdwCSTab_GeomW4P1_sse2_double.c | 1321 ++++++++ ...b_kernel_ElecEw_VdwCSTab_GeomW4W4_sse2_double.c | 2429 +++++++++++++++ .../nb_kernel_ElecEw_VdwLJ_GeomP1P1_sse2_double.c | 611 ++++ .../nb_kernel_ElecEw_VdwLJ_GeomW3P1_sse2_double.c | 1059 +++++++ .../nb_kernel_ElecEw_VdwLJ_GeomW3W3_sse2_double.c | 2155 +++++++++++++ .../nb_kernel_ElecEw_VdwLJ_GeomW4P1_sse2_double.c | 1187 +++++++ .../nb_kernel_ElecEw_VdwLJ_GeomW4W4_sse2_double.c | 2295 ++++++++++++++ ...nb_kernel_ElecEw_VdwNone_GeomP1P1_sse2_double.c | 548 ++++ ...nb_kernel_ElecEw_VdwNone_GeomW3P1_sse2_double.c | 996 ++++++ ...nb_kernel_ElecEw_VdwNone_GeomW3W3_sse2_double.c | 2098 +++++++++++++ ...nb_kernel_ElecEw_VdwNone_GeomW4P1_sse2_double.c | 996 ++++++ ...nb_kernel_ElecEw_VdwNone_GeomW4W4_sse2_double.c | 2098 +++++++++++++ ...b_kernel_ElecGB_VdwCSTab_GeomP1P1_sse2_double.c | 823 +++++ .../nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse2_double.c | 713 +++++ ...nb_kernel_ElecGB_VdwNone_GeomP1P1_sse2_double.c | 642 ++++ ...kernel_ElecNone_VdwCSTab_GeomP1P1_sse2_double.c | 633 ++++ ..._kernel_ElecNone_VdwLJSh_GeomP1P1_sse2_double.c | 553 ++++ ..._kernel_ElecNone_VdwLJSw_GeomP1P1_sse2_double.c | 637 ++++ ...nb_kernel_ElecNone_VdwLJ_GeomP1P1_sse2_double.c | 499 +++ ...ernel_ElecRFCut_VdwCSTab_GeomP1P1_sse2_double.c | 734 +++++ ...ernel_ElecRFCut_VdwCSTab_GeomW3P1_sse2_double.c | 1156 +++++++ ...ernel_ElecRFCut_VdwCSTab_GeomW3W3_sse2_double.c | 2174 +++++++++++++ ...ernel_ElecRFCut_VdwCSTab_GeomW4P1_sse2_double.c | 1282 ++++++++ ...ernel_ElecRFCut_VdwCSTab_GeomW4W4_sse2_double.c | 2312 ++++++++++++++ ...kernel_ElecRFCut_VdwLJSh_GeomP1P1_sse2_double.c | 608 ++++ ...kernel_ElecRFCut_VdwLJSh_GeomW3P1_sse2_double.c | 1030 ++++++ 
...kernel_ElecRFCut_VdwLJSh_GeomW3W3_sse2_double.c | 2048 ++++++++++++ ...kernel_ElecRFCut_VdwLJSh_GeomW4P1_sse2_double.c | 1194 +++++++ ...kernel_ElecRFCut_VdwLJSh_GeomW4W4_sse2_double.c | 2224 +++++++++++++ ...kernel_ElecRFCut_VdwLJSw_GeomP1P1_sse2_double.c | 684 ++++ ...kernel_ElecRFCut_VdwLJSw_GeomW3P1_sse2_double.c | 1106 +++++++ ...kernel_ElecRFCut_VdwLJSw_GeomW3W3_sse2_double.c | 2124 +++++++++++++ ...kernel_ElecRFCut_VdwLJSw_GeomW4P1_sse2_double.c | 1274 ++++++++ ...kernel_ElecRFCut_VdwLJSw_GeomW4W4_sse2_double.c | 2304 ++++++++++++++ ...kernel_ElecRFCut_VdwNone_GeomP1P1_sse2_double.c | 535 ++++ ...kernel_ElecRFCut_VdwNone_GeomW3P1_sse2_double.c | 957 ++++++ ...kernel_ElecRFCut_VdwNone_GeomW3W3_sse2_double.c | 1981 ++++++++++++ ...kernel_ElecRFCut_VdwNone_GeomW4P1_sse2_double.c | 957 ++++++ ...kernel_ElecRFCut_VdwNone_GeomW4W4_sse2_double.c | 1981 ++++++++++++ ...b_kernel_ElecRF_VdwCSTab_GeomP1P1_sse2_double.c | 684 ++++ ...b_kernel_ElecRF_VdwCSTab_GeomW3P1_sse2_double.c | 1030 ++++++ ...b_kernel_ElecRF_VdwCSTab_GeomW3W3_sse2_double.c | 1820 +++++++++++ ...b_kernel_ElecRF_VdwCSTab_GeomW4P1_sse2_double.c | 1158 +++++++ ...b_kernel_ElecRF_VdwCSTab_GeomW4W4_sse2_double.c | 1960 ++++++++++++ .../nb_kernel_ElecRF_VdwLJ_GeomP1P1_sse2_double.c | 550 ++++ .../nb_kernel_ElecRF_VdwLJ_GeomW3P1_sse2_double.c | 896 ++++++ .../nb_kernel_ElecRF_VdwLJ_GeomW3W3_sse2_double.c | 1686 ++++++++++ .../nb_kernel_ElecRF_VdwLJ_GeomW4P1_sse2_double.c | 1024 ++++++ .../nb_kernel_ElecRF_VdwLJ_GeomW4W4_sse2_double.c | 1826 +++++++++++ ...nb_kernel_ElecRF_VdwNone_GeomP1P1_sse2_double.c | 487 +++ ...nb_kernel_ElecRF_VdwNone_GeomW3P1_sse2_double.c | 833 +++++ ...nb_kernel_ElecRF_VdwNone_GeomW3W3_sse2_double.c | 1629 ++++++++++ ...nb_kernel_ElecRF_VdwNone_GeomW4P1_sse2_double.c | 833 +++++ ...nb_kernel_ElecRF_VdwNone_GeomW4W4_sse2_double.c | 1629 ++++++++++ .../nb_kernel_sse2_double/nb_kernel_sse2_double.c | 666 ++-- .../nb_kernel_sse2_double/nb_kernel_sse2_double.h | 83 +- 
.../nb_kernel_template_sse2_double.pre | 1053 +++++++ src/gmxlib/nonbonded/nonbonded.c | 18 +- 120 files changed, 152733 insertions(+), 1707 deletions(-) create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h create mode 100755 src/gmxlib/nonbonded/nb_kernel_sse2_double/make_nb_kernel_sse2_double.py delete mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel400_sse2_double.c delete mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel400_sse2_double.h delete mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel410_sse2_double.c delete mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel410_sse2_double.h delete mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel430_sse2_double.c delete mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel430_sse2_double.h create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_sse2_double.c 
create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwNone_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwNone_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwNone_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCoul_VdwLJ_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCoul_VdwLJ_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCoul_VdwLJ_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCoul_VdwNone_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCoul_VdwNone_GeomW3P1_sse2_double.c create mode 100644 
src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCoul_VdwNone_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCoul_VdwNone_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCoul_VdwNone_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwNone_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwNone_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSh_VdwNone_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sse2_double.c 
create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSw_VdwNone_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSw_VdwNone_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEwSw_VdwNone_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwCSTab_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwCSTab_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwCSTab_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJ_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJ_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJ_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJ_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwLJ_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwNone_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwNone_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwNone_GeomW3W3_sse2_double.c create mode 
100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwNone_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecEw_VdwNone_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecGB_VdwCSTab_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecGB_VdwLJ_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecGB_VdwNone_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecNone_VdwCSTab_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecNone_VdwLJSh_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecNone_VdwLJSw_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecNone_VdwLJ_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_sse2_double.c create mode 100644 
src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwNone_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwNone_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRFCut_VdwNone_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRF_VdwCSTab_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRF_VdwCSTab_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRF_VdwCSTab_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRF_VdwLJ_GeomP1P1_sse2_double.c 
create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRF_VdwLJ_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRF_VdwLJ_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRF_VdwLJ_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRF_VdwLJ_GeomW4W4_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRF_VdwNone_GeomP1P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRF_VdwNone_GeomW3P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRF_VdwNone_GeomW3W3_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRF_VdwNone_GeomW4P1_sse2_double.c create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecRF_VdwNone_GeomW4W4_sse2_double.c rewrite src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_sse2_double.c (95%) rewrite src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_sse2_double.h (89%) create mode 100644 src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_template_sse2_double.pre diff --git a/src/gmxlib/nonbonded/CMakeLists.txt b/src/gmxlib/nonbonded/CMakeLists.txt index 1022fb04a0..d24d1beb7e 100644 --- a/src/gmxlib/nonbonded/CMakeLists.txt +++ b/src/gmxlib/nonbonded/CMakeLists.txt @@ -17,8 +17,12 @@ if(GMX_CPU_ACCELERATION STREQUAL "AVX_256" AND NOT GMX_DOUBLE) file(GLOB NONBONDED_AVX_256_SINGLE_SOURCES nb_kernel_avx_256_single/*.c) endif() +if(GMX_CPU_ACCELERATION STREQUAL "SSE2" AND GMX_DOUBLE) + file(GLOB NONBONDED_SSE2_DOUBLE_SOURCES nb_kernel_sse2_double/*.c) +endif() + # These sources will be used in the parent directory's CMakeLists.txt -set(NONBONDED_SOURCES ${NONBONDED_SOURCES} ${NONBONDED_SSE2_SINGLE_SOURCES} ${NONBONDED_SSE4_1_SINGLE_SOURCES} ${NONBONDED_AVX_128_FMA_SINGLE_SOURCES} 
${NONBONDED_AVX_256_SINGLE_SOURCES} PARENT_SCOPE) +set(NONBONDED_SOURCES ${NONBONDED_SOURCES} ${NONBONDED_SSE2_SINGLE_SOURCES} ${NONBONDED_SSE4_1_SINGLE_SOURCES} ${NONBONDED_AVX_128_FMA_SINGLE_SOURCES} ${NONBONDED_AVX_256_SINGLE_SOURCES} ${NONBONDED_SSE2_DOUBLE_SOURCES} PARENT_SCOPE) diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h b/src/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h new file mode 100644 index 0000000000..006439173d --- /dev/null +++ b/src/gmxlib/nonbonded/nb_kernel_sse2_double/kernelutil_x86_sse2_double.h @@ -0,0 +1,840 @@ +/* + * This source code is part of + * + * G R O M A C S + * + * Copyright (c) 2011-2012, The GROMACS Development Team + * + * Gromacs is a library for molecular simulation and trajectory analysis, + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for + * a full list of developers and information, check out http://www.gromacs.org + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) any + * later version. + * As a special exception, you may use this file as part of a free software + * library without restriction. Specifically, if other files instantiate + * templates or use macros or inline functions from this file, or you compile + * this file and link it with other files to produce an executable, this + * file does not by itself cause the resulting executable to be covered by + * the GNU Lesser General Public License. + * + * In plain-speak: do not worry about classes/macros/templates either - only + * changes to the library have to be LGPL, not an application linking with it. + * + * To help fund GROMACS development, we humbly ask that you cite + * the papers people have written on it - you can find them on the website! 
+ */ +#ifndef _kernelutil_x86_sse2_double_h_ +#define _kernelutil_x86_sse2_double_h_ + +#include + +#include "gmx_x86_sse2.h" + +#include + + +/* Normal sum of four xmm registers */ +#define gmx_mm_sum4_pd(t0,t1,t2,t3) _mm_add_pd(_mm_add_pd(t0,t1),_mm_add_pd(t2,t3)) + +static int +gmx_mm_any_lt(__m128d a, __m128d b) +{ + return _mm_movemask_pd(_mm_cmplt_pd(a,b)); +} + +static gmx_inline __m128d +gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz) +{ + return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx), _mm_mul_pd(dy,dy) ), _mm_mul_pd(dz,dz) ); +} + + +/* Load a double value from 1-2 places, merge into xmm register */ +static gmx_inline __m128d +gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA, + const double * gmx_restrict ptrB) +{ + return _mm_unpacklo_pd(_mm_load_sd(ptrA),_mm_load_sd(ptrB)); +} + +static gmx_inline __m128d +gmx_mm_load_1real_pd(const double * gmx_restrict ptrA) +{ + return _mm_load_sd(ptrA); +} + + +static gmx_inline void +gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA, + double * gmx_restrict ptrB, + __m128d xmm1) +{ + __m128d t2; + + t2 = _mm_unpackhi_pd(xmm1,xmm1); + _mm_store_sd(ptrA,xmm1); + _mm_store_sd(ptrB,t2); +} + +static gmx_inline void +gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1) +{ + _mm_store_sd(ptrA,xmm1); +} + + +/* Similar to store, but increments value in memory */ +static gmx_inline void +gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA, + double * gmx_restrict ptrB, __m128d xmm1) +{ + __m128d t1; + + t1 = _mm_unpackhi_pd(xmm1,xmm1); + xmm1 = _mm_add_sd(xmm1,_mm_load_sd(ptrA)); + t1 = _mm_add_sd(t1,_mm_load_sd(ptrB)); + _mm_store_sd(ptrA,xmm1); + _mm_store_sd(ptrB,t1); +} + +static gmx_inline void +gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1) +{ + __m128d tmp; + + tmp = gmx_mm_load_1real_pd(ptrA); + tmp = _mm_add_sd(tmp,xmm1); + gmx_mm_store_1real_pd(ptrA,tmp); +} + + +static gmx_inline void +gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict
p1, + const double * gmx_restrict p2, + __m128d * gmx_restrict c6, + __m128d * gmx_restrict c12) +{ + __m128d t1,t2,t3; + + t1 = _mm_loadu_pd(p1); + t2 = _mm_loadu_pd(p2); + *c6 = _mm_unpacklo_pd(t1,t2); + *c12 = _mm_unpackhi_pd(t1,t2); +} + +static gmx_inline void +gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1, + __m128d * gmx_restrict c6, + __m128d * gmx_restrict c12) +{ + *c6 = _mm_load_sd(p1); + *c12 = _mm_load_sd(p1+1); +} + + + +static gmx_inline void +gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift, + const double * gmx_restrict xyz, + __m128d * gmx_restrict x1, + __m128d * gmx_restrict y1, + __m128d * gmx_restrict z1) +{ + __m128d mem_xy,mem_z,mem_sxy,mem_sz; + + mem_xy = _mm_loadu_pd(xyz); + mem_z = _mm_load_sd(xyz+2); + mem_sxy = _mm_loadu_pd(xyz_shift); + mem_sz = _mm_load_sd(xyz_shift+2); + + mem_xy = _mm_add_pd(mem_xy,mem_sxy); + mem_z = _mm_add_pd(mem_z,mem_sz); + + *x1 = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(0,0)); + *y1 = _mm_shuffle_pd(mem_xy,mem_xy,_MM_SHUFFLE2(1,1)); + *z1 = _mm_shuffle_pd(mem_z,mem_z,_MM_SHUFFLE2(0,0)); +} + + +static gmx_inline void +gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift, + const double * gmx_restrict xyz, + __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1, + __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2, + __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3) +{ + __m128d t1,t2,t3,t4,t5,sxy,sz,szx,syz; + + t1 = _mm_loadu_pd(xyz); + t2 = _mm_loadu_pd(xyz+2); + t3 = _mm_loadu_pd(xyz+4); + t4 = _mm_loadu_pd(xyz+6); + t5 = _mm_load_sd(xyz+8); + + sxy = _mm_loadu_pd(xyz_shift); + sz = _mm_load_sd(xyz_shift+2); + szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0)); + syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1)); + + t1 = _mm_add_pd(t1,sxy); + t2 = _mm_add_pd(t2,szx); + t3 = _mm_add_pd(t3,syz); + t4 = _mm_add_pd(t4,sxy); + t5 = _mm_add_sd(t5,sz); + + *x1 = 
_mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0)); + *y1 = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1)); + *z1 = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0)); + *x2 = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1)); + *y2 = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0)); + *z2 = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1)); + *x3 = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0)); + *y3 = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1)); + *z3 = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0)); +} + + +static gmx_inline void +gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift, + const double * gmx_restrict xyz, + __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1, + __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2, + __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3, + __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4) +{ + __m128d t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz; + + t1 = _mm_loadu_pd(xyz); + t2 = _mm_loadu_pd(xyz+2); + t3 = _mm_loadu_pd(xyz+4); + t4 = _mm_loadu_pd(xyz+6); + t5 = _mm_loadu_pd(xyz+8); + t6 = _mm_loadu_pd(xyz+10); + + sxy = _mm_loadu_pd(xyz_shift); + sz = _mm_load_sd(xyz_shift+2); + szx = _mm_shuffle_pd(sz,sxy,_MM_SHUFFLE2(0,0)); + syz = _mm_shuffle_pd(sxy,sz,_MM_SHUFFLE2(0,1)); + + t1 = _mm_add_pd(t1,sxy); + t2 = _mm_add_pd(t2,szx); + t3 = _mm_add_pd(t3,syz); + t4 = _mm_add_pd(t4,sxy); + t5 = _mm_add_pd(t5,szx); + t6 = _mm_add_pd(t6,syz); + + *x1 = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(0,0)); + *y1 = _mm_shuffle_pd(t1,t1,_MM_SHUFFLE2(1,1)); + *z1 = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(0,0)); + *x2 = _mm_shuffle_pd(t2,t2,_MM_SHUFFLE2(1,1)); + *y2 = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(0,0)); + *z2 = _mm_shuffle_pd(t3,t3,_MM_SHUFFLE2(1,1)); + *x3 = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(0,0)); + *y3 = _mm_shuffle_pd(t4,t4,_MM_SHUFFLE2(1,1)); + *z3 = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(0,0)); + *x4 = _mm_shuffle_pd(t5,t5,_MM_SHUFFLE2(1,1)); + *y4 = 
_mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(0,0)); + *z4 = _mm_shuffle_pd(t6,t6,_MM_SHUFFLE2(1,1)); +} + + + + +static gmx_inline void +gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1, + __m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z) +{ + *x = _mm_load_sd(p1); + *y = _mm_load_sd(p1+1); + *z = _mm_load_sd(p1+2); +} + +static gmx_inline void +gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1, + __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1, + __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2, + __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3) +{ + *x1 = _mm_load_sd(p1); + *y1 = _mm_load_sd(p1+1); + *z1 = _mm_load_sd(p1+2); + *x2 = _mm_load_sd(p1+3); + *y2 = _mm_load_sd(p1+4); + *z2 = _mm_load_sd(p1+5); + *x3 = _mm_load_sd(p1+6); + *y3 = _mm_load_sd(p1+7); + *z3 = _mm_load_sd(p1+8); +} + +static gmx_inline void +gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1, + __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1, + __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2, + __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3, + __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4) +{ + *x1 = _mm_load_sd(p1); + *y1 = _mm_load_sd(p1+1); + *z1 = _mm_load_sd(p1+2); + *x2 = _mm_load_sd(p1+3); + *y2 = _mm_load_sd(p1+4); + *z2 = _mm_load_sd(p1+5); + *x3 = _mm_load_sd(p1+6); + *y3 = _mm_load_sd(p1+7); + *z3 = _mm_load_sd(p1+8); + *x4 = _mm_load_sd(p1+9); + *y4 = _mm_load_sd(p1+10); + *z4 = _mm_load_sd(p1+11); +} + + +static gmx_inline void +gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, + const double * gmx_restrict ptrB, + __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1) +{ + __m128d t1,t2,t3,t4; + t1 = _mm_loadu_pd(ptrA); + t2 = _mm_loadu_pd(ptrB); + t3 
= _mm_load_sd(ptrA+2); + t4 = _mm_load_sd(ptrB+2); + GMX_MM_TRANSPOSE2_PD(t1,t2); + *x1 = t1; + *y1 = t2; + *z1 = _mm_unpacklo_pd(t3,t4); +} + + +static gmx_inline void +gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB, + __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1, + __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2, + __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3) +{ + __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10; + t1 = _mm_loadu_pd(ptrA); + t2 = _mm_loadu_pd(ptrB); + t3 = _mm_loadu_pd(ptrA+2); + t4 = _mm_loadu_pd(ptrB+2); + t5 = _mm_loadu_pd(ptrA+4); + t6 = _mm_loadu_pd(ptrB+4); + t7 = _mm_loadu_pd(ptrA+6); + t8 = _mm_loadu_pd(ptrB+6); + t9 = _mm_load_sd(ptrA+8); + t10 = _mm_load_sd(ptrB+8); + GMX_MM_TRANSPOSE2_PD(t1,t2); + GMX_MM_TRANSPOSE2_PD(t3,t4); + GMX_MM_TRANSPOSE2_PD(t5,t6); + GMX_MM_TRANSPOSE2_PD(t7,t8); + *x1 = t1; + *y1 = t2; + *z1 = t3; + *x2 = t4; + *y2 = t5; + *z2 = t6; + *x3 = t7; + *y3 = t8; + *z3 = _mm_unpacklo_pd(t9,t10); +} + + +static gmx_inline void +gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB, + __m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1, + __m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2, + __m128d * gmx_restrict x3, __m128d * gmx_restrict y3, __m128d * gmx_restrict z3, + __m128d * gmx_restrict x4, __m128d * gmx_restrict y4, __m128d * gmx_restrict z4) +{ + __m128d t1,t2,t3,t4,t5,t6; + t1 = _mm_loadu_pd(ptrA); + t2 = _mm_loadu_pd(ptrB); + t3 = _mm_loadu_pd(ptrA+2); + t4 = _mm_loadu_pd(ptrB+2); + t5 = _mm_loadu_pd(ptrA+4); + t6 = _mm_loadu_pd(ptrB+4); + GMX_MM_TRANSPOSE2_PD(t1,t2); + GMX_MM_TRANSPOSE2_PD(t3,t4); + GMX_MM_TRANSPOSE2_PD(t5,t6); + *x1 = t1; + *y1 = t2; + *z1 = t3; + *x2 = t4; + *y2 = t5; + *z2 = t6; + t1 = _mm_loadu_pd(ptrA+6); + t2 = _mm_loadu_pd(ptrB+6); + t3 
= _mm_loadu_pd(ptrA+8); + t4 = _mm_loadu_pd(ptrB+8); + t5 = _mm_loadu_pd(ptrA+10); + t6 = _mm_loadu_pd(ptrB+10); + GMX_MM_TRANSPOSE2_PD(t1,t2); + GMX_MM_TRANSPOSE2_PD(t3,t4); + GMX_MM_TRANSPOSE2_PD(t5,t6); + *x3 = t1; + *y3 = t2; + *z3 = t3; + *x4 = t4; + *y4 = t5; + *z4 = t6; +} + + +/* Routines to decrement rvec in memory, typically used for j particle force updates */ +static gmx_inline void +gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA, + __m128d xy, __m128d z) +{ + __m128d t1,t2; + + t1 = _mm_loadu_pd(ptrA); + t2 = _mm_load_sd(ptrA+2); + + t1 = _mm_sub_pd(t1,xy); + t2 = _mm_sub_sd(t2,z); + + _mm_storeu_pd(ptrA,t1); + _mm_store_sd(ptrA+2,t2); +} + +static gmx_inline void +gmx_mm_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA, + __m128d xy1, __m128d z1, + __m128d xy2, __m128d z2, + __m128d xy3, __m128d z3) +{ + __m128d t1,t2; + __m128d tA,tB,tC,tD,tE; + + tA = _mm_loadu_pd(ptrA); + tB = _mm_loadu_pd(ptrA+2); + tC = _mm_loadu_pd(ptrA+4); + tD = _mm_loadu_pd(ptrA+6); + tE = _mm_load_sd(ptrA+8); + + /* Subtracts three rvecs from the nine doubles at ptrA. xy1: y1 x1 (high lane, low lane); every zN is read from its low lane only */ + t1 = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 - selector bit 0 must be 0 to take the low lane of z1, as the 4rvec variant does */ + t2 = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */ + /* xy3: y3 x3 */ + + tA = _mm_sub_pd(tA,xy1); + tB = _mm_sub_pd(tB,t1); + tC = _mm_sub_pd(tC,t2); + tD = _mm_sub_pd(tD,xy3); + tE = _mm_sub_sd(tE,z3); + + _mm_storeu_pd(ptrA,tA); + _mm_storeu_pd(ptrA+2,tB); + _mm_storeu_pd(ptrA+4,tC); + _mm_storeu_pd(ptrA+6,tD); + _mm_store_sd(ptrA+8,tE); +} + +static gmx_inline void +gmx_mm_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA, + __m128d xy1, __m128d z1, + __m128d xy2, __m128d z2, + __m128d xy3, __m128d z3, + __m128d xy4, __m128d z4) +{ + __m128d t1,t2,t3,t4; + __m128d tA,tB,tC,tD,tE,tF; + + tA = _mm_loadu_pd(ptrA); + tB = _mm_loadu_pd(ptrA+2); + tC = _mm_loadu_pd(ptrA+4); + tD = _mm_loadu_pd(ptrA+6); + tE = _mm_loadu_pd(ptrA+8); + tF = _mm_loadu_pd(ptrA+10); + + /* xy1: y1 x1 */ + t1 = _mm_shuffle_pd(z1,xy2,_MM_SHUFFLE2(0,0)); /* x2 z1 
*/ + t2 = _mm_shuffle_pd(xy2,z2,_MM_SHUFFLE2(0,1)); /* z2 y2 */ + /* xy3: y3 x3 */ + t3 = _mm_shuffle_pd(z3,xy4,_MM_SHUFFLE2(0,0)); /* x4 z3 */ + t4 = _mm_shuffle_pd(xy4,z4,_MM_SHUFFLE2(0,1)); /* z4 y4 */ + + tA = _mm_sub_pd(tA,xy1); + tB = _mm_sub_pd(tB,t1); + tC = _mm_sub_pd(tC,t2); + tD = _mm_sub_pd(tD,xy3); + tE = _mm_sub_pd(tE,t3); + tF = _mm_sub_pd(tF,t4); + + _mm_storeu_pd(ptrA,tA); + _mm_storeu_pd(ptrA+2,tB); + _mm_storeu_pd(ptrA+4,tC); + _mm_storeu_pd(ptrA+6,tD); + _mm_storeu_pd(ptrA+8,tE); + _mm_storeu_pd(ptrA+10,tF); +} + +static gmx_inline void +gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA, + __m128d x1, __m128d y1, __m128d z1) +{ + __m128d t1,t2,t3; + + t1 = _mm_load_sd(ptrA); + t2 = _mm_load_sd(ptrA+1); + t3 = _mm_load_sd(ptrA+2); + + t1 = _mm_sub_sd(t1,x1); + t2 = _mm_sub_sd(t2,y1); + t3 = _mm_sub_sd(t3,z1); + _mm_store_sd(ptrA,t1); + _mm_store_sd(ptrA+1,t2); + _mm_store_sd(ptrA+2,t3); +} + + +static gmx_inline void +gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA, + __m128d x1, __m128d y1, __m128d z1, + __m128d x2, __m128d y2, __m128d z2, + __m128d x3, __m128d y3, __m128d z3) +{ + __m128d t1,t2,t3,t4,t5; + + t1 = _mm_loadu_pd(ptrA); + t2 = _mm_loadu_pd(ptrA+2); + t3 = _mm_loadu_pd(ptrA+4); + t4 = _mm_loadu_pd(ptrA+6); + t5 = _mm_load_sd(ptrA+8); + + x1 = _mm_unpacklo_pd(x1,y1); + z1 = _mm_unpacklo_pd(z1,x2); + y2 = _mm_unpacklo_pd(y2,z2); + x3 = _mm_unpacklo_pd(x3,y3); + /* nothing to be done for z3 */ + + t1 = _mm_sub_pd(t1,x1); + t2 = _mm_sub_pd(t2,z1); + t3 = _mm_sub_pd(t3,y2); + t4 = _mm_sub_pd(t4,x3); + t5 = _mm_sub_sd(t5,z3); + _mm_storeu_pd(ptrA,t1); + _mm_storeu_pd(ptrA+2,t2); + _mm_storeu_pd(ptrA+4,t3); + _mm_storeu_pd(ptrA+6,t4); + _mm_store_sd(ptrA+8,t5); +} + + +static gmx_inline void +gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA, + __m128d x1, __m128d y1, __m128d z1, + __m128d x2, __m128d y2, __m128d z2, + __m128d x3, __m128d y3, __m128d z3, + __m128d x4, __m128d y4, __m128d 
z4) +{ + __m128d t1,t2,t3,t4,t5,t6; + + t1 = _mm_loadu_pd(ptrA); + t2 = _mm_loadu_pd(ptrA+2); + t3 = _mm_loadu_pd(ptrA+4); + t4 = _mm_loadu_pd(ptrA+6); + t5 = _mm_loadu_pd(ptrA+8); + t6 = _mm_loadu_pd(ptrA+10); + + x1 = _mm_unpacklo_pd(x1,y1); + z1 = _mm_unpacklo_pd(z1,x2); + y2 = _mm_unpacklo_pd(y2,z2); + x3 = _mm_unpacklo_pd(x3,y3); + z3 = _mm_unpacklo_pd(z3,x4); + y4 = _mm_unpacklo_pd(y4,z4); + + _mm_storeu_pd(ptrA, _mm_sub_pd( t1,x1 )); + _mm_storeu_pd(ptrA+2, _mm_sub_pd( t2,z1 )); + _mm_storeu_pd(ptrA+4, _mm_sub_pd( t3,y2 )); + _mm_storeu_pd(ptrA+6, _mm_sub_pd( t4,x3 )); + _mm_storeu_pd(ptrA+8, _mm_sub_pd( t5,z3 )); + _mm_storeu_pd(ptrA+10, _mm_sub_pd( t6,y4 )); +} + +static gmx_inline void +gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, + __m128d x1, __m128d y1, __m128d z1) +{ + __m128d t1,t2,t3,t4,t5,t6,t7; + + t1 = _mm_loadu_pd(ptrA); + t2 = _mm_load_sd(ptrA+2); + t3 = _mm_loadu_pd(ptrB); + t4 = _mm_load_sd(ptrB+2); + + t5 = _mm_unpacklo_pd(x1,y1); + t6 = _mm_unpackhi_pd(x1,y1); + t7 = _mm_unpackhi_pd(z1,z1); + + t1 = _mm_sub_pd(t1,t5); + t2 = _mm_sub_sd(t2,z1); + + t3 = _mm_sub_pd(t3,t6); + t4 = _mm_sub_sd(t4,t7); + + _mm_storeu_pd(ptrA,t1); + _mm_store_sd(ptrA+2,t2); + _mm_storeu_pd(ptrB,t3); + _mm_store_sd(ptrB+2,t4); +} + +static gmx_inline void +gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, + __m128d x1, __m128d y1, __m128d z1, + __m128d x2, __m128d y2, __m128d z2, + __m128d x3, __m128d y3, __m128d z3) +{ + __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10; + __m128d tA,tB,tC,tD,tE,tF,tG,tH,tI; + + t1 = _mm_loadu_pd(ptrA); + t2 = _mm_loadu_pd(ptrA+2); + t3 = _mm_loadu_pd(ptrA+4); + t4 = _mm_loadu_pd(ptrA+6); + t5 = _mm_load_sd(ptrA+8); + t6 = _mm_loadu_pd(ptrB); + t7 = _mm_loadu_pd(ptrB+2); + t8 = _mm_loadu_pd(ptrB+4); + t9 = _mm_loadu_pd(ptrB+6); + t10 = _mm_load_sd(ptrB+8); + + tA = _mm_unpacklo_pd(x1,y1); + tB = _mm_unpackhi_pd(x1,y1); + tC = 
_mm_unpacklo_pd(z1,x2); + tD = _mm_unpackhi_pd(z1,x2); + tE = _mm_unpacklo_pd(y2,z2); + tF = _mm_unpackhi_pd(y2,z2); + tG = _mm_unpacklo_pd(x3,y3); + tH = _mm_unpackhi_pd(x3,y3); + tI = _mm_unpackhi_pd(z3,z3); + + t1 = _mm_sub_pd(t1,tA); + t2 = _mm_sub_pd(t2,tC); + t3 = _mm_sub_pd(t3,tE); + t4 = _mm_sub_pd(t4,tG); + t5 = _mm_sub_sd(t5,z3); + + t6 = _mm_sub_pd(t6,tB); + t7 = _mm_sub_pd(t7,tD); + t8 = _mm_sub_pd(t8,tF); + t9 = _mm_sub_pd(t9,tH); + t10 = _mm_sub_sd(t10,tI); + + _mm_storeu_pd(ptrA,t1); + _mm_storeu_pd(ptrA+2,t2); + _mm_storeu_pd(ptrA+4,t3); + _mm_storeu_pd(ptrA+6,t4); + _mm_store_sd(ptrA+8,t5); + _mm_storeu_pd(ptrB,t6); + _mm_storeu_pd(ptrB+2,t7); + _mm_storeu_pd(ptrB+4,t8); + _mm_storeu_pd(ptrB+6,t9); + _mm_store_sd(ptrB+8,t10); +} + + +static gmx_inline void +gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, + __m128d x1, __m128d y1, __m128d z1, + __m128d x2, __m128d y2, __m128d z2, + __m128d x3, __m128d y3, __m128d z3, + __m128d x4, __m128d y4, __m128d z4) +{ + __m128d t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12; + __m128d tA,tB,tC,tD,tE,tF,tG,tH,tI,tJ,tK,tL; + + t1 = _mm_loadu_pd(ptrA); + t2 = _mm_loadu_pd(ptrA+2); + t3 = _mm_loadu_pd(ptrA+4); + t4 = _mm_loadu_pd(ptrA+6); + t5 = _mm_loadu_pd(ptrA+8); + t6 = _mm_loadu_pd(ptrA+10); + t7 = _mm_loadu_pd(ptrB); + t8 = _mm_loadu_pd(ptrB+2); + t9 = _mm_loadu_pd(ptrB+4); + t10 = _mm_loadu_pd(ptrB+6); + t11 = _mm_loadu_pd(ptrB+8); + t12 = _mm_loadu_pd(ptrB+10); + + tA = _mm_unpacklo_pd(x1,y1); + tB = _mm_unpackhi_pd(x1,y1); + tC = _mm_unpacklo_pd(z1,x2); + tD = _mm_unpackhi_pd(z1,x2); + tE = _mm_unpacklo_pd(y2,z2); + tF = _mm_unpackhi_pd(y2,z2); + tG = _mm_unpacklo_pd(x3,y3); + tH = _mm_unpackhi_pd(x3,y3); + tI = _mm_unpacklo_pd(z3,x4); + tJ = _mm_unpackhi_pd(z3,x4); + tK = _mm_unpacklo_pd(y4,z4); + tL = _mm_unpackhi_pd(y4,z4); + + t1 = _mm_sub_pd(t1,tA); + t2 = _mm_sub_pd(t2,tC); + t3 = _mm_sub_pd(t3,tE); + t4 = _mm_sub_pd(t4,tG); + t5 = _mm_sub_pd(t5,tI); + t6 = 
_mm_sub_pd(t6,tK); + + t7 = _mm_sub_pd(t7,tB); + t8 = _mm_sub_pd(t8,tD); + t9 = _mm_sub_pd(t9,tF); + t10 = _mm_sub_pd(t10,tH); + t11 = _mm_sub_pd(t11,tJ); + t12 = _mm_sub_pd(t12,tL); + + _mm_storeu_pd(ptrA, t1); + _mm_storeu_pd(ptrA+2,t2); + _mm_storeu_pd(ptrA+4,t3); + _mm_storeu_pd(ptrA+6,t4); + _mm_storeu_pd(ptrA+8,t5); + _mm_storeu_pd(ptrA+10,t6); + _mm_storeu_pd(ptrB, t7); + _mm_storeu_pd(ptrB+2,t8); + _mm_storeu_pd(ptrB+4,t9); + _mm_storeu_pd(ptrB+6,t10); + _mm_storeu_pd(ptrB+8,t11); + _mm_storeu_pd(ptrB+10,t12); +} + + + + + +static gmx_inline void +gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1, + double * gmx_restrict fptr, + double * gmx_restrict fshiftptr) +{ + __m128d t1,t2,t3; + + /* transpose data */ + t1 = fix1; + fix1 = _mm_unpacklo_pd(fix1,fiy1); /* y0 x0 */ + fiy1 = _mm_unpackhi_pd(t1,fiy1); /* y1 x1 */ + + fix1 = _mm_add_pd(fix1,fiy1); + fiz1 = _mm_add_sd( fiz1, _mm_unpackhi_pd(fiz1,fiz1 )); + + _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); + _mm_store_sd( fptr+2, _mm_add_sd( _mm_load_sd(fptr+2), fiz1 )); + + _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); + _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); +} + +static gmx_inline void +gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1, + __m128d fix2, __m128d fiy2, __m128d fiz2, + __m128d fix3, __m128d fiy3, __m128d fiz3, + double * gmx_restrict fptr, + double * gmx_restrict fshiftptr) +{ + __m128d t1,t2; + + /* transpose data */ + GMX_MM_TRANSPOSE2_PD(fix1,fiy1); + GMX_MM_TRANSPOSE2_PD(fiz1,fix2); + GMX_MM_TRANSPOSE2_PD(fiy2,fiz2); + t1 = fix3; + fix3 = _mm_unpacklo_pd(fix3,fiy3); /* y0 x0 */ + fiy3 = _mm_unpackhi_pd(t1,fiy3); /* y1 x1 */ + + fix1 = _mm_add_pd(fix1,fiy1); + fiz1 = _mm_add_pd(fiz1,fix2); + fiy2 = _mm_add_pd(fiy2,fiz2); + + fix3 = _mm_add_pd(fix3,fiy3); + fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3,fiz3)); + + _mm_storeu_pd( fptr, _mm_add_pd( 
_mm_loadu_pd(fptr), fix1 )); + _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); + _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); + _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); + _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); + + fix1 = _mm_add_pd(fix1,fix3); + t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1)); + fix1 = _mm_add_pd(fix1,t1); /* x and y sums */ + + t2 = _mm_shuffle_pd(fiy2,fiy2,_MM_SHUFFLE2(1,1)); + fiz1 = _mm_add_sd(fiz1,fiz3); + fiz1 = _mm_add_sd(fiz1,t2); /* z sum */ + + _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); + _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); +} + + +static gmx_inline void +gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1, + __m128d fix2, __m128d fiy2, __m128d fiz2, + __m128d fix3, __m128d fiy3, __m128d fiz3, + __m128d fix4, __m128d fiy4, __m128d fiz4, + double * gmx_restrict fptr, + double * gmx_restrict fshiftptr) +{ + __m128d t1,t2; + + /* transpose data */ + GMX_MM_TRANSPOSE2_PD(fix1,fiy1); + GMX_MM_TRANSPOSE2_PD(fiz1,fix2); + GMX_MM_TRANSPOSE2_PD(fiy2,fiz2); + GMX_MM_TRANSPOSE2_PD(fix3,fiy3); + GMX_MM_TRANSPOSE2_PD(fiz3,fix4); + GMX_MM_TRANSPOSE2_PD(fiy4,fiz4); + + fix1 = _mm_add_pd(fix1,fiy1); + fiz1 = _mm_add_pd(fiz1,fix2); + fiy2 = _mm_add_pd(fiy2,fiz2); + fix3 = _mm_add_pd(fix3,fiy3); + fiz3 = _mm_add_pd(fiz3,fix4); + fiy4 = _mm_add_pd(fiy4,fiz4); + + _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); + _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); + _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); + _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); + _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 )); + _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); + + t1 = _mm_shuffle_pd(fiz1,fiy2,_MM_SHUFFLE2(0,1)); + fix1 = _mm_add_pd(fix1,t1); + t2 = 
_mm_shuffle_pd(fiz3,fiy4,_MM_SHUFFLE2(0,1)); + fix3 = _mm_add_pd(fix3,t2); + fix1 = _mm_add_pd(fix1,fix3); /* x and y sums */ + + fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2,fiy2)); + fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4,fiy4)); + fiz1 = _mm_add_sd(fiz1,fiz3); /* z sum */ + + _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); + _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); +} + + + +static gmx_inline void +gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA) +{ + pot1 = _mm_add_pd(pot1, _mm_unpackhi_pd(pot1,pot1)); + _mm_store_sd(ptrA,_mm_add_sd(pot1,_mm_load_sd(ptrA))); +} + +static gmx_inline void +gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA, + __m128d pot2, double * gmx_restrict ptrB) +{ + GMX_MM_TRANSPOSE2_PD(pot1,pot2); + pot1 = _mm_add_pd(pot1,pot2); + pot2 = _mm_unpackhi_pd(pot1,pot1); + + _mm_store_sd(ptrA,_mm_add_sd(pot1,_mm_load_sd(ptrA))); + _mm_store_sd(ptrB,_mm_add_sd(pot2,_mm_load_sd(ptrB))); +} + + +#endif /* _kernelutil_x86_sse2_double_h_ */ diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_double/make_nb_kernel_sse2_double.py b/src/gmxlib/nonbonded/nb_kernel_sse2_double/make_nb_kernel_sse2_double.py new file mode 100755 index 0000000000..98541ff404 --- /dev/null +++ b/src/gmxlib/nonbonded/nb_kernel_sse2_double/make_nb_kernel_sse2_double.py @@ -0,0 +1,489 @@ +#!/usr/bin/python + +import sys +import os +sys.path.append ( "../preprocessor" ) +from gmxpreprocess import gmxpreprocess + +# "The happiest programs are programs that write other programs." +# +# +# This script controls the generation of Gromacs nonbonded kernels. +# +# We no longer generate kernels on-the-fly, so this file is not run +# during a Gromacs compile - only when we need to update the kernels (=rarely). +# +# To maximize performance, each combination of interactions in Gromacs +# has a separate nonbonded kernel without conditionals in the code. 
+# To avoid writing hundreds of different routines for each architecture, +# we instead use a custom preprocessor so we can encode the conditionals +# and expand for-loops (e.g, for water-water interactions) +# from a general kernel template. While that file will contain quite a +# few preprocessor directives, it is still an order of magnitude easier +# to maintain than ~200 different kernels (not to mention it avoids bugs). +# +# To actually generate the kernels, this program iteratively calls the +# preprocessor with different define settings corresponding to all +# combinations of coulomb/van-der-Waals/geometry options. +# +# A main goal in the design was to make this new generator _general_. For +# this reason we have used a lot of different fields to identify a particular +# kernel and interaction. Basically, each kernel will have a name like +# +# nbkernel_ElecXX_VdwYY_GeomZZ_VF_QQ() +# +# Where XX/YY/ZZ/VF are strings to identify what the kernel computes. +# +# Elec/Vdw describe the type of interaction for electrostatics and van der Waals. +# The geometry settings correspond e.g. to water-water or water-particle kernels, +# and finally the VF setting is V,F,or VF depending on whether we calculate +# only the potential, only the force, or both of them. The final string (QQ) +# is the architecture/language/optimization of the kernel. +# +Arch = 'sse2_double' + +# Explanation of the 'properties': +# +# It is cheap to compute r^2, and the kernels require various other functions of r for +# different kinds of interaction. Depending on the needs of the kernel and the available +# processor instructions, this will be done in different ways. +# +# 'rinv' means we need 1/r, which is calculated as 1/sqrt(r^2). +# 'rinvsq' means we need 1/(r*r). This is calculated as rinv*rinv if we already did rinv, otherwise 1/r^2. 
+# 'r' is similarly calculated as r^2*rinv when needed +# 'table' means the interaction is tabulated, in which case we will calculate a table index before the interaction +# 'shift' means the interaction will be modified by a constant to make it zero at the cutoff. +# 'cutoff' means the interaction is set to 0.0 outside the cutoff +# + +FileHeader = \ +'/*\n' \ +' * Note: this file was generated by the Gromacs '+Arch+' kernel generator.\n' \ +' *\n' \ +' * This source code is part of\n' \ +' *\n' \ +' * G R O M A C S\n' \ +' *\n' \ +' * Copyright (c) 2001-2012, The GROMACS Development Team\n' \ +' *\n' \ +' * Gromacs is a library for molecular simulation and trajectory analysis,\n' \ +' * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for\n' \ +' * a full list of developers and information, check out http://www.gromacs.org\n' \ +' *\n' \ +' * This program is free software; you can redistribute it and/or modify it under\n' \ +' * the terms of the GNU Lesser General Public License as published by the Free\n' \ +' * Software Foundation; either version 2 of the License, or (at your option) any\n' \ +' * later version.\n' \ +' *\n' \ +' * To help fund GROMACS development, we humbly ask that you cite\n' \ +' * the papers people have written on it - you can find them on the website.\n' \ +' */\n' + +############################################### +# ELECTROSTATICS +# Interactions and flags for them +############################################### +ElectrostaticsList = { + 'None' : [], + 'Coulomb' : ['rinv','rinvsq'], + 'ReactionField' : ['rinv','rinvsq'], + 'GeneralizedBorn' : ['rinv','r'], + 'CubicSplineTable' : ['rinv','r','table'], + 'Ewald' : ['rinv','rinvsq','r'], +} + + +############################################### +# VAN DER WAALS +# Interactions and flags for them +############################################### +VdwList = { + 'None' : [], + 'LennardJones' : ['rinvsq'], +# 'Buckingham' : ['rinv','rinvsq','r'], # Disabled for sse2 to 
reduce number of kernels and simply the template + 'CubicSplineTable' : ['rinv','r','table'], +} + + +############################################### +# MODIFIERS +# Different ways to adjust/modify interactions to conserve energy +############################################### +ModifierList = { + 'None' : [], + 'ExactCutoff' : ['exactcutoff'], # Zero the interaction outside the cutoff, used for reaction-field-zero + 'PotentialShift' : ['shift','exactcutoff'], + 'PotentialSwitch' : ['rinv','r','switch','exactcutoff'] +} + + +############################################### +# GEOMETRY COMBINATIONS +############################################### +GeometryNameList = [ + [ 'Particle' , 'Particle' ], + [ 'Water3' , 'Particle' ], + [ 'Water3' , 'Water3' ], + [ 'Water4' , 'Particle' ], + [ 'Water4' , 'Water4' ] +] + + +############################################### +# POTENTIAL / FORCE +############################################### +VFList = [ + 'PotentialAndForce', +# 'Potential', # Not used yet + 'Force' +] + + +############################################### +# GEOMETRY PROPERTIES +############################################### +# Dictionaries with lists telling which interactions are present +# 1,2,3 means particles 1,2,3 (but not 0) have electrostatics! 
+GeometryElectrostatics = { + 'Particle' : [ 0 ], + 'Particle2' : [ 0 , 1 ], + 'Particle3' : [ 0 , 1 , 2 ], + 'Particle4' : [ 0 , 1 , 2 , 3 ], + 'Water3' : [ 0 , 1 , 2 ], + 'Water4' : [ 1 , 2 , 3 ] +} + +GeometryVdw = { + 'Particle' : [ 0 ], + 'Particle2' : [ 0 , 1 ], + 'Particle3' : [ 0 , 1 , 2 ], + 'Particle4' : [ 0 , 1 , 2 , 3 ], + 'Water3' : [ 0 ], + 'Water4' : [ 0 ] +} + + + + +# Dictionary to abbreviate all strings (mixed from all the lists) +Abbreviation = { + 'None' : 'None', + 'Coulomb' : 'Coul', + 'Ewald' : 'Ew', + 'ReactionField' : 'RF', + 'GeneralizedBorn' : 'GB', + 'CubicSplineTable' : 'CSTab', + 'LennardJones' : 'LJ', + 'Buckingham' : 'Bham', + 'PotentialShift' : 'Sh', + 'PotentialSwitch' : 'Sw', + 'ExactCutoff' : 'Cut', + 'PotentialAndForce' : 'VF', + 'Potential' : 'V', + 'Force' : 'F', + 'Water3' : 'W3', + 'Water4' : 'W4', + 'Particle' : 'P1', + 'Particle2' : 'P2', + 'Particle3' : 'P3', + 'Particle4' : 'P4' +} + + +############################################### +# Functions +############################################### + +# Return a string with the kernel name from current settings +def MakeKernelFileName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom): + ElecStr = 'Elec' + Abbreviation[KernelElec] + if(KernelElecMod!='None'): + ElecStr = ElecStr + Abbreviation[KernelElecMod] + VdwStr = 'Vdw' + Abbreviation[KernelVdw] + if(KernelVdwMod!='None'): + VdwStr = VdwStr + Abbreviation[KernelVdwMod] + GeomStr = 'Geom' + Abbreviation[KernelGeom[0]] + Abbreviation[KernelGeom[1]] + return 'nb_kernel_' + ElecStr + '_' + VdwStr + '_' + GeomStr + '_' + Arch + +def MakeKernelName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF): + ElecStr = 'Elec' + Abbreviation[KernelElec] + if(KernelElecMod!='None'): + ElecStr = ElecStr + Abbreviation[KernelElecMod] + VdwStr = 'Vdw' + Abbreviation[KernelVdw] + if(KernelVdwMod!='None'): + VdwStr = VdwStr + Abbreviation[KernelVdwMod] + GeomStr = 'Geom' + Abbreviation[KernelGeom[0]] + 
Abbreviation[KernelGeom[1]] + VFStr = Abbreviation[KernelVF] + return 'nb_kernel_' + ElecStr + '_' + VdwStr + '_' + GeomStr + '_' + VFStr + '_' + Arch + +# Return a string with a declaration to use for the kernel; +# this will be a sequence of string combinations as well as the actual function name +# Dont worry about field widths - that is just pretty-printing for the header! +def MakeKernelDecl(KernelName,KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelOther,KernelVF): + KernelStr = '\"'+KernelName+'\"' + ArchStr = '\"'+Arch+'\"' + ElecStr = '\"'+KernelElec+'\"' + ElecModStr = '\"'+KernelElecMod+'\"' + VdwStr = '\"'+KernelVdw+'\"' + VdwModStr = '\"'+KernelVdwMod+'\"' + GeomStr = '\"'+KernelGeom[0]+KernelGeom[1]+'\"' + OtherStr = '\"'+KernelOther+'\"' + VFStr = '\"'+KernelVF+'\"' + + ThisSpec = ArchStr+', '+ElecStr+', '+ElecModStr+', '+VdwStr+', '+VdwModStr+', '+GeomStr+', '+OtherStr+', '+VFStr + ThisDecl = ' { '+KernelName+', '+KernelStr+', '+ThisSpec+' }' + return ThisDecl + + +# Returns 1 if this kernel should be created, 0 if we should skip it +# This routine is not critical - it is not the end of the world if we create more kernels, +# but since the number is pretty large we save both space and compile-time by reducing it a bit. +def KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF): + + # No need for kernels without interactions + if(KernelElec=='None' and KernelVdw=='None'): + return 0 + + # No need for modifiers without interactions + if((KernelElec=='None' and KernelElecMod!='None') or (KernelVdw=='None' and KernelVdwMod!='None')): + return 0 + + # No need for LJ-only water optimization, or water optimization with implicit solvent. 
+ if('Water' in KernelGeom[0] and (KernelElec=='None' or 'GeneralizedBorn' in KernelElec)): + return 0 + + # Non-matching table settings are pointless + if( ('Table' in KernelElec) and ('Table' in KernelVdw) and KernelElec!=KernelVdw ): + return 0 + + # Try to reduce the number of different switch/shift options to get a reasonable number of kernels + # For electrostatics, reaction-field can use 'exactcutoff', and ewald can use switch or shift. + if(KernelElecMod=='ExactCutoff' and KernelElec!='ReactionField'): + return 0 + if(KernelElecMod in ['PotentialShift','PotentialSwitch'] and KernelElec!='Ewald'): + return 0 + # For Vdw, we support switch and shift for Lennard-Jones/Buckingham + if((KernelVdwMod=='ExactCutoff') or + (KernelVdwMod in ['PotentialShift','PotentialSwitch'] and KernelVdw not in ['LennardJones','Buckingham'])): + return 0 + + # Choose either switch or shift and don't mix them... + if((KernelElecMod=='PotentialShift' and KernelVdwMod=='PotentialSwitch') or + (KernelElecMod=='PotentialSwitch' and KernelVdwMod=='PotentialShift')): + return 0 + + # Don't use a Vdw kernel with a modifier if the electrostatics one does not have one + if(KernelElec!='None' and KernelElecMod=='None' and KernelVdwMod!='None'): + return 0 + + # Don't use an electrostatics kernel with a modifier if the vdw one does not have one, + # unless the electrostatics one is reaction-field with exact cutoff. + if(KernelVdw!='None' and KernelVdwMod=='None' and KernelElecMod!='None'): + if(KernelElec=='ReactionField' and KernelVdw!='CubicSplineTable'): + return 0 + elif(KernelElec!='ReactionField'): + return 0 + + return 1 + + + +# +# The preprocessor will automatically expand the interactions for water and other +# geometries inside the kernel, but to get this right we need to setup a couple +# of defines - we do them in a separate routine to keep the main loop clean. 
+# +# While this routine might look a bit complex it is actually quite straightforward, +# and the best news is that you won't have to modify _anything_ for a new geometry +# as long as you correctly define its Electrostatics/Vdw geometry in the lists above! +# +def SetDefines(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF,defines): + # What is the _name_ for the i/j group geometry? + igeometry = KernelGeom[0] + jgeometry = KernelGeom[1] + # define so we can access it in the source when the preprocessor runs + defines['GEOMETRY_I'] = igeometry + defines['GEOMETRY_J'] = jgeometry + + # For the i/j groups, extract a python list with the indices of the sites that have electrostatics + # For SPC/TIP3p (Water3) this will be [0,1,2], while TIP4p (Water4, no elec on first site) will be [1,2,3] + ielec = GeometryElectrostatics[igeometry] + jelec = GeometryElectrostatics[jgeometry] + # Zero out the corresponding lists in case we don't do Elec + if(KernelElec=='None'): + ielec = [] + jelec = [] + + # Extract similar site-index lists for Vdw interactions (example for SPC: [0], i.e. only the first site) + iVdw = GeometryVdw[igeometry] + jVdw = GeometryVdw[jgeometry] + + # Zero out the corresponding lists in case we don't do Vdw + if(KernelVdw=='None'): + iVdw = [] + jVdw = [] + + # iany[] and jany[] contain the particles actually used (for interactions) in this kernel + iany = list(set(ielec+iVdw)) # convert to+from set to make elements unique + jany = list(set(jelec+jVdw)) + + defines['PARTICLES_ELEC_I'] = ielec + defines['PARTICLES_ELEC_J'] = jelec + defines['PARTICLES_VDW_I'] = iVdw + defines['PARTICLES_VDW_J'] = jVdw + defines['PARTICLES_I'] = iany + defines['PARTICLES_J'] = jany + + # elecij,Vdwij are lists of [i,j] particle pairs for which the corresponding interaction is done + # (and anyij again corresponds to either electrostatics or Vdw) + elecij = [] + Vdwij = [] + anyij = [] + + for i in ielec: + for j in jelec: + elecij.append([i,j]) + + for i in iVdw: + for j in jVdw: + Vdwij.append([i,j]) + 
+ for i in iany: + for j in jany: + if [i,j] in elecij or [i,j] in Vdwij: + anyij.append([i,j]) + + defines['PAIRS_IJ'] = anyij + + # Make a 2D list of the distance properties to calculate for each i,j + ni = max(iany)+1 + nj = max(jany)+1 + # Each element properties[i][j] starts out as an empty list + properties = [ [ [] for j in range(0,nj) ] for i in range (0,ni) ] + # Add properties to each i,j pair + for i in range(0,ni): + for j in range(0,nj): + if [i,j] in elecij: + properties[i][j] = properties[i][j] + ['electrostatics'] + ElectrostaticsList[KernelElec] + ModifierList[KernelElecMod] + if [i,j] in Vdwij: + properties[i][j] = properties[i][j] + ['vdw'] + VdwList[KernelVdw] + ModifierList[KernelVdwMod] + # Add rinv if we need r + if 'r' in properties[i][j]: + properties[i][j] = properties[i][j] + ['rinv'] + # Add rsq if we need rinv or rinvsq + if 'rinv' in properties[i][j] or 'rinvsq' in properties[i][j]: + properties[i][j] = properties[i][j] + ['rsq'] + + defines['INTERACTION_FLAGS'] = properties + + + +def PrintStatistics(ratio): + ratio = 100.0*ratio + print '\rGenerating %s nonbonded kernels... 
%5.1f%%' % (Arch,ratio), + sys.stdout.flush() + + + +defines = {} +kerneldecl = [] + +cnt = 0.0 +nelec = len(ElectrostaticsList) +nVdw = len(VdwList) +nmod = len(ModifierList) +ngeom = len(GeometryNameList) + +ntot = nelec*nmod*nVdw*nmod*ngeom + +numKernels = 0 + +fpdecl = open('nb_kernel_' + Arch + '.c','w') +fpdecl.write( FileHeader ) +fpdecl.write( '#ifndef nb_kernel_' + Arch + '_h\n' ) +fpdecl.write( '#define nb_kernel_' + Arch + '_h\n\n' ) +fpdecl.write( '#include "../nb_kernel.h"\n\n' ) + +for KernelElec in ElectrostaticsList: + defines['KERNEL_ELEC'] = KernelElec + + for KernelElecMod in ModifierList: + defines['KERNEL_MOD_ELEC'] = KernelElecMod + + for KernelVdw in VdwList: + defines['KERNEL_VDW'] = KernelVdw + + for KernelVdwMod in ModifierList: + defines['KERNEL_MOD_VDW'] = KernelVdwMod + + for KernelGeom in GeometryNameList: + + cnt += 1 + KernelFilename = MakeKernelFileName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom) + '.c' + fpkernel = open(KernelFilename,'w') + defines['INCLUDE_HEADER'] = 1 # Include header first time in new file + DoHeader = 1 + + for KernelVF in VFList: + + KernelName = MakeKernelName(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF) + + defines['KERNEL_NAME'] = KernelName + defines['KERNEL_VF'] = KernelVF + + # Check if this is a valid/sane/usable combination + if not KeepKernel(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF): + continue; + + # The overall kernel settings determine what the _kernel_ calculates, but for the water + # kernels this does not mean that every pairwise interaction has e.g. Vdw interactions. + # This routine sets defines of what to calculate for each pair of particles in those cases. 
+ SetDefines(KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelVF,defines) + + if(DoHeader==1): + fpkernel.write( FileHeader ) + + gmxpreprocess('nb_kernel_template_' + Arch + '.pre', KernelName+'.tmp' , defines, force=1,contentType='C') + numKernels = numKernels + 1 + + defines['INCLUDE_HEADER'] = 0 # Header has been included once now + DoHeader=0 + + # Append temp file contents to the common kernelfile + fptmp = open(KernelName+'.tmp','r') + fpkernel.writelines(fptmp.readlines()) + fptmp.close() + os.remove(KernelName+'.tmp') + + # Add a declaration for this kernel + fpdecl.write('nb_kernel_t ' + KernelName + ';\n'); + + # Add declaration to the buffer + KernelOther='' + kerneldecl.append(MakeKernelDecl(KernelName,KernelElec,KernelElecMod,KernelVdw,KernelVdwMod,KernelGeom,KernelOther,KernelVF)) + + filesize = fpkernel.tell() + fpkernel.close() + if(filesize==0): + os.remove(KernelFilename) + + PrintStatistics(cnt/ntot) + pass + pass + pass + pass +pass + +# Write out the list of settings and corresponding kernels to the declaration file +fpdecl.write( '\n\n' ) +fpdecl.write( 'nb_kernel_info_t\n' ) +fpdecl.write( 'kernellist_'+Arch+'[] =\n' ) +fpdecl.write( '{\n' ) +for decl in kerneldecl[0:-1]: + fpdecl.write( decl + ',\n' ) +fpdecl.write( kerneldecl[-1] + '\n' ) +fpdecl.write( '};\n\n' ) +fpdecl.write( 'int\n' ) +fpdecl.write( 'kernellist_'+Arch+'_size = sizeof(kernellist_'+Arch+')/sizeof(kernellist_'+Arch+'[0]);\n\n') +fpdecl.write( '#endif\n') +fpdecl.close() diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel400_sse2_double.c b/src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel400_sse2_double.c deleted file mode 100644 index 7954df51ae..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel400_sse2_double.c +++ /dev/null @@ -1,329 +0,0 @@ -/* - * Copyright (c) Erik Lindahl, David van der Spoel 2003 - * - * This file is generated automatically at compile time - * by the program mknb in the Gromacs distribution. 
- * - * Options used when generation this file: - * Language: c - * Precision: double - * Threads: no - * Software invsqrt: yes - * Prefetch forces: no - * Comments: no - */ -#ifdef HAVE_CONFIG_H -#include -#endif -#include -#include - -#include - -#include - -/* get gmx_gbdata_t */ -#include "../nb_kernel.h" - - -void nb_kernel400_sse2_double(int * p_nri, - int * iinr, - int * jindex, - int * jjnr, - int * shift, - double * shiftvec, - double * fshift, - int * gid, - double * pos, - double * faction, - double * charge, - double * p_facel, - double * p_krf, - double * p_crf, - double * vc, - int * type, - int * p_ntype, - double * vdwparam, - double * vvdw, - double * p_tabscale, - double * VFtab, - double * invsqrta, - double * dvda, - double * p_gbtabscale, - double * GBtab, - int * p_nthreads, - int * count, - void * mtx, - int * outeriter, - int * inneriter, - double * work) -{ - int nri,nthreads; - int n,ii,is3,ii3,k,nj0,nj1,ggid; - double shX,shY,shZ; - int jnrA,jnrB; - int j3A,j3B; - gmx_gbdata_t *gbdata; - double * gpol; - - __m128d iq,qq,jq,isai; - __m128d ix,iy,iz; - __m128d jx,jy,jz; - __m128d dx,dy,dz; - __m128d vctot,vgbtot,dvdasum,gbfactor; - __m128d fix,fiy,fiz,tx,ty,tz,rsq; - __m128d rinv,isaj,isaprod; - __m128d vcoul,fscal,gbscale; - __m128d rinvsq,r,rtab; - __m128d eps,Y,F,G,H; - __m128d vgb,fijGB,dvdatmp; - __m128d facel,gbtabscale,dvdaj; - __m128i n0, nnn; - - const __m128d neg = _mm_set1_pd(-1.0); - const __m128d zero = _mm_set1_pd(0.0); - const __m128d minushalf = _mm_set1_pd(-0.5); - const __m128d two = _mm_set1_pd(2.0); - - gbdata = (gmx_gbdata_t *)work; - gpol = gbdata->gpol; - - nri = *p_nri; - - gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent))); - gbtabscale = _mm_load1_pd(p_gbtabscale); - facel = _mm_load1_pd(p_facel); - - nj1 = 0; - jnrA = jnrB = 0; - j3A = j3B = 0; - jx = _mm_setzero_pd(); - jy = _mm_setzero_pd(); - jz = _mm_setzero_pd(); - - for(n=0;nCoulomb interaction: Generalized-Born
- * VdW interaction: No
- * Water optimization: No
- * Forces calculated: Yes
- * - * \note All level1 and level2 nonbonded kernels use the same - * call sequence. Parameters are documented in nb_kernel.h - */ -void -nb_kernel400_sse2_double (int * nri, int iinr[], int jindex[], - int jjnr[], int shift[], double shiftvec[], - double fshift[], int gid[], double pos[], - double faction[], double charge[], double * facel, - double * krf, double * crf, double Vc[], - int type[], int * ntype, double vdwparam[], - double Vvdw[], double * tabscale, double VFtab[], - double invsqrta[], double dvda[], double * gbtabscale, - double GBtab[], int * nthreads, int * count, - void * mtx, int * outeriter,int * inneriter, - double * work); - - - - -/*! \brief Nonbonded kernel 400 without forces, optimized for sse. - * - * \internal - * - * Coulomb interaction: Generalized-Born
- * VdW interaction: No
- * Water optimization: No
- * Forces calculated: No
- * - * \note All level1 and level2 nonbonded kernels use the same - * call sequence. Parameters are documented in nb_kernel.h - */ -void -nb_kernel400nf_sse2_double(int * nri, int iinr[], int jindex[], - int jjnr[], int shift[], double shiftvec[], - double fshift[], int gid[], double pos[], - double faction[], double charge[], double * facel, - double * krf, double * crf, double Vc[], - int type[], int * ntype, double vdwparam[], - double Vvdw[], double * tabscale, double VFtab[], - double invsqrta[], double dvda[], double * gbtabscale, - double GBtab[], int * nthreads, int * count, - void * mtx, int * outeriter,int * inneriter, - double * work); - - - -#ifdef __cplusplus -} -#endif - - -#endif /* _NB_KERNEL400_SSE2_DOUBLE_H_ */ diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel410_sse2_double.c b/src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel410_sse2_double.c deleted file mode 100644 index 780c941b54..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel410_sse2_double.c +++ /dev/null @@ -1,377 +0,0 @@ -/* - * Copyright (c) Erik Lindahl, David van der Spoel 2003 - * - * This file is generated automatically at compile time - * by the program mknb in the Gromacs distribution. 
- * - * Options used when generation this file: - * Language: c - * Precision: double - * Threads: no - * Software invsqrt: yes - * Prefetch forces: no - * Comments: no - */ -#ifdef HAVE_CONFIG_H -#include -#endif -#include -#include - -#include - -#include - -/* get gmx_gbdata_t */ -#include "../nb_kernel.h" - - - -void nb_kernel410_sse2_double(int * p_nri, - int * iinr, - int * jindex, - int * jjnr, - int * shift, - double * shiftvec, - double * fshift, - int * gid, - double * pos, - double * faction, - double * charge, - double * p_facel, - double * p_krf, - double * p_crf, - double * vc, - int * type, - int * p_ntype, - double * vdwparam, - double * vvdw, - double * p_tabscale, - double * VFtab, - double * invsqrta, - double * dvda, - double * p_gbtabscale, - double * GBtab, - int * p_nthreads, - int * count, - void * mtx, - int * outeriter, - int * inneriter, - double * work) -{ - int nri,ntype,nthreads; - int n,ii,is3,ii3,k,nj0,nj1,ggid; - double shX,shY,shZ; - int offset,nti; - int jnrA,jnrB; - int j3A,j3B; - int tjA,tjB; - gmx_gbdata_t *gbdata; - double * gpol; - - __m128d iq,qq,jq,isai; - __m128d ix,iy,iz; - __m128d jx,jy,jz; - __m128d dx,dy,dz; - __m128d vctot,vvdwtot,vgbtot,dvdasum,gbfactor; - __m128d fix,fiy,fiz,tx,ty,tz,rsq; - __m128d rinv,isaj,isaprod; - __m128d vcoul,fscal,gbscale,c6,c12; - __m128d rinvsq,r,rtab; - __m128d eps,Y,F,G,H; - __m128d vgb,fijGB,dvdatmp; - __m128d rinvsix,vvdw6,vvdw12; - __m128d facel,gbtabscale,dvdaj; - __m128i n0, nnn; - - const __m128d neg = _mm_set1_pd(-1.0); - const __m128d zero = _mm_set1_pd(0.0); - const __m128d minushalf = _mm_set1_pd(-0.5); - const __m128d two = _mm_set1_pd(2.0); - const __m128d six = _mm_set1_pd(6.0); - const __m128d twelve = _mm_set1_pd(12.0); - - gbdata = (gmx_gbdata_t *)work; - gpol = gbdata->gpol; - - nri = *p_nri; - ntype = *p_ntype; - - gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent))); - gbtabscale = _mm_load1_pd(p_gbtabscale); - facel = 
_mm_load1_pd(p_facel); - - nj1 = 0; - jnrA = jnrB = 0; - j3A = j3B = 0; - jx = _mm_setzero_pd(); - jy = _mm_setzero_pd(); - jz = _mm_setzero_pd(); - c6 = _mm_setzero_pd(); - c12 = _mm_setzero_pd(); - - for(n=0;nCoulomb interaction: Generalized-Born
- * VdW interaction: Lennard-Jones
- * Water optimization: No
- * Forces calculated: Yes
- * - * \note All level1 and level2 nonbonded kernels use the same - * call sequence. Parameters are documented in nb_kernel.h - */ -void -nb_kernel410_sse2_double (int * nri, int iinr[], int jindex[], - int jjnr[], int shift[], double shiftvec[], - double fshift[], int gid[], double pos[], - double faction[], double charge[], double * facel, - double * krf, double * crf, double Vc[], - int type[], int * ntype, double vdwparam[], - double Vvdw[], double * tabscale, double VFtab[], - double invsqrta[], double dvda[], double * gbtabscale, - double GBtab[], int * nthreads, int * count, - void * mtx, int * outeriter,int * inneriter, - double * work); - - - -/*! \brief Nonbonded kernel 410 without forces, optimized for sse. - * - * \internal - * - * Coulomb interaction: Generalized-Born
- * VdW interaction: Lennard-Jones
- * Water optimization: No
- * Forces calculated: No
- * - * \note All level1 and level2 nonbonded kernels use the same - * call sequence. Parameters are documented in nb_kernel.h - */ -void -nb_kernel410nf_sse2_double(int * nri, int iinr[], int jindex[], - int jjnr[], int shift[], double shiftvec[], - double fshift[], int gid[], double pos[], - double faction[], double charge[], double * facel, - double * krf, double * crf, double Vc[], - int type[], int * ntype, double vdwparam[], - double Vvdw[], double * tabscale, double VFtab[], - double invsqrta[], double dvda[], double * gbtabscale, - double GBtab[], int * nthreads, int * count, - void * mtx, int * outeriter,int * inneriter, - double * work); - - -#ifdef __cplusplus -} -#endif - - - -#endif /* _NB_KERNEL410_SSE2_DOUBLE_H_ */ diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel430_sse2_double.c b/src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel430_sse2_double.c deleted file mode 100644 index 85c0832aef..0000000000 --- a/src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel430_sse2_double.c +++ /dev/null @@ -1,473 +0,0 @@ -/* - * Copyright (c) Erik Lindahl, David van der Spoel 2003 - * - * This file is generated automatically at compile time - * by the program mknb in the Gromacs distribution. 
- * - * Options used when generation this file: - * Language: c - * Precision: double - * Threads: no - * Software invsqrt: yes - * Prefetch forces: no - * Comments: no - */ -#ifdef HAVE_CONFIG_H -#include -#endif -#include -#include - -#include - -#include - -/* get gmx_gbdata_t */ -#include "../nb_kernel.h" - - -void nb_kernel430_sse2_double(int * p_nri, - int * iinr, - int * jindex, - int * jjnr, - int * shift, - double * shiftvec, - double * fshift, - int * gid, - double * pos, - double * faction, - double * charge, - double * p_facel, - double * p_krf, - double * p_crf, - double * vc, - int * type, - int * p_ntype, - double * vdwparam, - double * vvdw, - double * p_tabscale, - double * VFtab, - double * invsqrta, - double * dvda, - double * p_gbtabscale, - double * GBtab, - int * p_nthreads, - int * count, - void * mtx, - int * outeriter, - int * inneriter, - double * work) -{ - int nri,ntype,nthreads; - int n,ii,is3,ii3,k,nj0,nj1,ggid; - double shX,shY,shZ; - int offset,nti; - int jnrA,jnrB; - int j3A,j3B; - int tjA,tjB; - gmx_gbdata_t *gbdata; - double * gpol; - - __m128d iq,qq,jq,isai; - __m128d ix,iy,iz; - __m128d jx,jy,jz; - __m128d dx,dy,dz; - __m128d vctot,vvdwtot,vgbtot,dvdasum,gbfactor; - __m128d fix,fiy,fiz,tx,ty,tz,rsq; - __m128d rinv,isaj,isaprod; - __m128d vcoul,fscal,gbscale,c6,c12; - __m128d rinvsq,r,rtab; - __m128d eps,Y,F,G,H; - __m128d VV,FF,Fp; - __m128d vgb,fijGB,dvdatmp; - __m128d rinvsix,vvdw6,vvdw12,vvdwtmp; - __m128d facel,gbtabscale,dvdaj; - __m128d fijD,fijR; - __m128d xmm1,tabscale,eps2; - __m128i n0, nnn; - - - const __m128d neg = _mm_set1_pd(-1.0); - const __m128d zero = _mm_set1_pd(0.0); - const __m128d minushalf = _mm_set1_pd(-0.5); - const __m128d two = _mm_set1_pd(2.0); - - gbdata = (gmx_gbdata_t *)work; - gpol = gbdata->gpol; - - nri = *p_nri; - ntype = *p_ntype; - - gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent))); - gbtabscale = _mm_load1_pd(p_gbtabscale); - facel = 
_mm_load1_pd(p_facel); - tabscale = _mm_load1_pd(p_tabscale); - - nj1 = 0; - jnrA = jnrB = 0; - j3A = j3B = 0; - jx = _mm_setzero_pd(); - jy = _mm_setzero_pd(); - jz = _mm_setzero_pd(); - c6 = _mm_setzero_pd(); - c12 = _mm_setzero_pd(); - - for(n=0;nCoulomb interaction: Generalized-Born
- * VdW interaction: Tabulated
- * Water optimization: No
- * Forces calculated: Yes
- * - * \note All level1 and level2 nonbonded kernels use the same - * call sequence. Parameters are documented in nb_kernel.h - */ -void -nb_kernel430_sse2_double (int * nri, int iinr[], int jindex[], - int jjnr[], int shift[], double shiftvec[], - double fshift[], int gid[], double pos[], - double faction[], double charge[], double * facel, - double * krf, double * crf, double Vc[], - int type[], int * ntype, double vdwparam[], - double Vvdw[], double * tabscale, double VFtab[], - double invsqrta[], double dvda[], double * gbtabscale, - double GBtab[], int * nthreads, int * count, - void * mtx, int * outeriter,int * inneriter, - double * work); - - - - -/*! \brief Nonbonded kernel 430 without forces, optimized for sse. - * - * \internal - * - * Coulomb interaction: Generalized-Born
- * VdW interaction: Tabulated
- * Water optimization: No
- * Forces calculated: No
- * - * \note All level1 and level2 nonbonded kernels use the same - * call sequence. Parameters are documented in nb_kernel.h - */ -void -nb_kernel430nf_sse2_double(int * nri, int iinr[], int jindex[], - int jjnr[], int shift[], double shiftvec[], - double fshift[], int gid[], double pos[], - double faction[], double charge[], double * facel, - double * krf, double * crf, double Vc[], - int type[], int * ntype, double vdwparam[], - double Vvdw[], double * tabscale, double VFtab[], - double invsqrta[], double dvda[], double * gbtabscale, - double GBtab[], int * nthreads, int * count, - void * mtx, int * outeriter,int * inneriter, - double * work); - - -#ifdef __cplusplus -} -#endif - - - -#endif /* _NB_KERNEL430_SSE2_DOUBLE_H_ */ diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_sse2_double.c b/src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_sse2_double.c new file mode 100644 index 0000000000..2d5eb1e3b8 --- /dev/null +++ b/src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_sse2_double.c @@ -0,0 +1,712 @@ +/* + * Note: this file was generated by the Gromacs sse2_double kernel generator. + * + * This source code is part of + * + * G R O M A C S + * + * Copyright (c) 2001-2012, The GROMACS Development Team + * + * Gromacs is a library for molecular simulation and trajectory analysis, + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for + * a full list of developers and information, check out http://www.gromacs.org + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) any + * later version. + * + * To help fund GROMACS development, we humbly ask that you cite + * the papers people have written on it - you can find them on the website. 
+ */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: CubicSplineTable + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - 
j_index_start; + + /* Outer loop uses 9 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*73); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: CubicSplineTable + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include 
"kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: CubicSplineTable + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i 
vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*162); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: CubicSplineTable + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, 
+ rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr =
nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: CubicSplineTable + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + 
x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*417); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_double + * Electrostatics interaction: 
CubicSplineTable + * VdW interaction: CubicSplineTable + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d
dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = 
_mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: CubicSplineTable + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec_vdw->data; + vftabscale = 
_mm_set1_pd(kernel_data->table_elec_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*188); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: CubicSplineTable + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec_vdw->data; + vftabscale = 
_mm_set1_pd(kernel_data->table_elec_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: CubicSplineTable + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real 
*vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner 
flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*446); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: CubicSplineTable + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d 
dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = 
_mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - 
j_index_start; + + /* Outer loop uses 9 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*56); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include 
"kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + 
__m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*145); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + 
rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = 
nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + 
x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*400); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_double + * Electrostatics interaction: CubicSplineTable 
+ * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d
dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = 
_mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec->data; + vftabscale =
_mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*164); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec->data; + vftabscale =
_mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real
*vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops 
*/ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*422); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d
dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = 
_mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCSTab_VdwNone_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 8 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*43); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW
interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + vftab = kernel_data->table_elec->data; + vftabscale =
_mm_set1_pd(kernel_data->table_elec->scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: None + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over
neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*132); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: None + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over
neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: None + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + 
facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + qq00 = _mm_mul_pd(iq0,jq0); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*387); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: None + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. 
+ * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = 
nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + qq00 = _mm_mul_pd(iq0,jq0); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: None + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 
for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = 
_mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*132); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: None + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over 
neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: None + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + 
facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*387); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_double + * Electrostatics interaction: CubicSplineTable + * VdW interaction: None + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. 
+ * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = 
nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: CubicSplineTable + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * 
just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over 
neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 9 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*63); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: CubicSplineTable + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include 
"kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: CubicSplineTable + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + 
__m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*122); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: CubicSplineTable + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + 
rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = 
nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: CubicSplineTable + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + 
x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*287); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_double + * Electrostatics interaction: Coulomb + * VdW 
interaction: CubicSplineTable + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; 
+ __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = 
jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: CubicSplineTable + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = 
_mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*143); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: CubicSplineTable + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = 
_mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: CubicSplineTable + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real 
*vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ 
+ + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*311); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: CubicSplineTable + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d 
dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = 
_mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCoul_VdwLJ_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 9 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*40); +} +/* + * 
Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = 
nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCoul_VdwLJ_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + 
j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*99); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + 
j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + 
shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*264); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * 
gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d 
rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded 
kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + 
__m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*119); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ 
+ /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = 
_mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( 
_mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*287); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_double + * Electrostatics 
interaction: Coulomb + * VdW interaction: LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d 
dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + 
/* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCoul_VdwNone_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 8 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*28); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * 
gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: 
nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: None + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCoul_VdwNone_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + 
shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*87); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: None + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + 
+/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: None + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCoul_VdwNone_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + 
__m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + qq00 = _mm_mul_pd(iq0,jq0); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*252); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_double + * 
Electrostatics interaction: Coulomb + * VdW interaction: None + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d 
dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + qq00 = _mm_mul_pd(iq0,jq0); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: None + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse2_double + (t_nblist 
* gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = 
_mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*87); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: None + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + 
+/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_double + * Electrostatics interaction: Coulomb + * VdW interaction: None + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecCoul_VdwNone_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + 
__m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*252); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_double + * 
Electrostatics interaction: Coulomb + * VdW interaction: None + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d 
dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_VF_sse2_double + 
(t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; +
vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 9 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*64); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = 
_mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific 
parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*159); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific 
parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = 
xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ 
+ inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*432); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = 
xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include
"gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d
rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations 
*/ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*182); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; 
+ ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the two positions in the SIMD register.
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real 
*ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 
0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*458); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real 
*ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; 
+ inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSh_VdwNone_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 8 flops */ + } + + /* 
Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*46); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; 
+ jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSh_VdwNone_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an 
arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*141); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an 
arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel 
= _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + qq00 = _mm_mul_pd(iq0,jq0); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*414); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * 
gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real 
*ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + qq00 = _mm_mul_pd(iq0,jq0); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: 
nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSh_VdwNone_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = 
xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*141); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to 
particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = 
nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel 
= _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*414); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * 
gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real 
*ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: 
nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = 
_mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 9 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + 
inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*83); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = 
gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: 
nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d 
ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = 
_mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*216); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = 
_mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, 
+ t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; 
+ __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = 
fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*603); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. 
+ * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d 
rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = 
rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different + * jnr indices whose data are put in the two positions of a __m128d SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = 
mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters: the i-water charges for sites 1-3 (pre-scaled by facel) and the VdW offset for site 0 are read here, once, from the first i entry iinr[0] */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters: d is the distance between rswitch and rcutoff, and the swV/swF constants are divided by d^3, d^4 and d^5. NOTE(review): nothing visible here guards against d_scalar == 0 */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Give these a defined value up front to avoid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - 
j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops: 26 per outer and 257 per inner iteration, booked under eNR_NBKERNEL_ELEC_VDW_W4_VF */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*257); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_double + * 
Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d 
ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = 
_mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different + * jnr indices whose data are put in the two positions of a __m128d SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real 
*ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters: i and j waters both take charges (sites 1-3) and VdW type (site 0) from the first i entry iinr[0], so the qq, c6 and c12 constants are formed here, once */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 
= _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters: d is the distance between rswitch and rcutoff, and the swV/swF constants are divided by d^3, d^4 and d^5. NOTE(review): nothing visible here guards against d_scalar == 0 */ 
+ d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Give these a defined value up front to avoid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops: 26 per outer and 647 per inner iteration, booked under eNR_NBKERNEL_ELEC_VDW_W4W4_VF */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*647); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real 
*ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ 
+ d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSw_VdwNone_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different + * jnr indices whose data are put in the two positions of a __m128d SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters: d is the distance between rswitch and rcutoff, and the swV/swF constants are divided by d^3, d^4 and d^5. 
NOTE(review): nothing visible here guards against d_scalar == 0 */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 
15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Give these a defined value up front to avoid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 8 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops: 8 per outer and 65 per inner iteration, booked under eNR_NBKERNEL_ELEC_VF */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*65); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 
15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + /* 
When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*198); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. 
+ * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = 
nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or 
outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( 
_mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + qq00 = _mm_mul_pd(iq0,jq0); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = 
_mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*585); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = 
nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + qq00 = _mm_mul_pd(iq0,jq0); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include 
+ +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + 
__m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over 
neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*198); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + /* 
When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. 
for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + 
nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; 
+ inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*585); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = 
nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include 
+ +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: CubicSplineTable + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEw_VdwCSTab_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 
0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 9 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*75); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: CubicSplineTable + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; 
+ j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: CubicSplineTable + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = 
_mm_set1_pd(kernel_data->table_vdw->scale); + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*160); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: CubicSplineTable + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = 
_mm_set1_pd(kernel_data->table_vdw->scale); + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: CubicSplineTable + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( 
_mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + 
+ /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*403); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: CubicSplineTable + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d 
dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = 
_mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: CubicSplineTable + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + 
vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*182); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: CubicSplineTable + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. 
for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = 
fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: CubicSplineTable + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real 
*vftab; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; 
iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*428); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: CubicSplineTable + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real 
*vftab; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include 
+ +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEw_VdwLJ_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* 
Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 9 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*53); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" 
+#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d 
one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*138); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: Force + 
*/ +void +nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two 
= _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. 
for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( 
_mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ 
+ + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*381); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d 
dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = 
_mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = 
fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*158); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; 
+ ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real 
*ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment 
number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*404); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d 
dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = 
_mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEw_VdwNone_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 8 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*41); +} +/* + * Gromacs nonbonded kernel: 
nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = 
_mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEw_VdwNone_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 
0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*126); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; 
+ + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel 
= _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + qq00 = _mm_mul_pd(iq0,jq0); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*369); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for 
non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + 
nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + qq00 = _mm_mul_pd(iq0,jq0); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEw_VdwNone_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to 
particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = 
nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*126); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; 
+ + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel 
= _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*369); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_double + * Electrostatics interaction: Ewald + * VdW interaction: None + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for 
non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + 
nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_double + * Electrostatics interaction: GeneralizedBorn + * VdW interaction: CubicSplineTable + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* 
Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i gbitab; + __m128d vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp; + __m128d minushalf = _mm_set1_pd(-0.5); + real *invsqrta,*dvda,*gbtab; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = 
mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + invsqrta = fr->invsqrta; + dvda = fr->dvda; + gbtabscale = _mm_set1_pd(fr->gbtab.scale); + gbtab = fr->gbtab.data; + gbinvepsdiff = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0)); + gmx_mm_update_1pot_pd(dvdasum,dvda+inr); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 10 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*92); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_double + * Electrostatics interaction: GeneralizedBorn + * VdW interaction: CubicSplineTable + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i gbitab; + __m128d vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp; + __m128d minushalf = _mm_set1_pd(-0.5); + real *invsqrta,*dvda,*gbtab; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + invsqrta = fr->invsqrta; + dvda = fr->dvda; + gbtabscale = _mm_set1_pd(fr->gbtab.scale); + gbtab = fr->gbtab.data; + gbinvepsdiff = _mm_set1_pd((1.0/fr->epsilon_r) - 
(1.0/fr->gb_epsilon_solvent)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_double + * Electrostatics interaction: GeneralizedBorn + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecGB_VdwLJ_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i gbitab; + __m128d vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp; + __m128d minushalf = _mm_set1_pd(-0.5); + real *invsqrta,*dvda,*gbtab; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + invsqrta = fr->invsqrta; + dvda = fr->dvda; + gbtabscale = _mm_set1_pd(fr->gbtab.scale); + gbtab = fr->gbtab.data; + gbinvepsdiff = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + 
j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0)); + gmx_mm_update_1pot_pd(dvdasum,dvda+inr); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 10 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*71); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_double + * Electrostatics interaction: GeneralizedBorn + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i gbitab; + __m128d vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp; + __m128d minushalf = _mm_set1_pd(-0.5); + real *invsqrta,*dvda,*gbtab; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + invsqrta = fr->invsqrta; + dvda = fr->dvda; + gbtabscale = _mm_set1_pd(fr->gbtab.scale); + gbtab = fr->gbtab.data; + gbinvepsdiff = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + 
j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_double + * Electrostatics interaction: GeneralizedBorn + * VdW interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecGB_VdwNone_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i gbitab; + __m128d vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp; + __m128d minushalf = _mm_set1_pd(-0.5); + real *invsqrta,*dvda,*gbtab; + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + invsqrta = fr->invsqrta; + dvda = fr->dvda; + gbtabscale = _mm_set1_pd(fr->gbtab.scale); + gbtab = fr->gbtab.data; + gbinvepsdiff = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid); + dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai0,isai0)); + 
gmx_mm_update_1pot_pd(dvdasum,dvda+inr); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 9 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*58); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_double + * Electrostatics interaction: GeneralizedBorn + * VdW interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128i gbitab; + __m128d vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp; + __m128d minushalf = _mm_set1_pd(-0.5); + real *invsqrta,*dvda,*gbtab; + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + + invsqrta = fr->invsqrta; + dvda = fr->dvda; + gbtabscale = _mm_set1_pd(fr->gbtab.scale); + gbtab = fr->gbtab.data; + gbinvepsdiff = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include 
"kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_double + * Electrostatics interaction: None + * VdW interaction: CubicSplineTable + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecNone_VdwCSTab_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = 
nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 7 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*56); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_double + * Electrostatics interaction: None + * VdW interaction: CubicSplineTable + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_double + * Electrostatics 
interaction: None + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecNone_VdwLJSh_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + 
rcutoff_scalar = fr->rvdw; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 7 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*41); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_double + * Electrostatics interaction: None + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + rcutoff_scalar = fr->rvdw; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_double + * Electrostatics interaction: None + * VdW 
interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecNone_VdwLJSw_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + nvdwtype = 
fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + rcutoff_scalar = fr->rvdw; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rvdw_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 7 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*59); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_double + * Electrostatics interaction: None + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. 
+ * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + rcutoff_scalar = fr->rvdw; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rvdw_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( 
-6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_double + * Electrostatics interaction: None + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecNone_VdwLJ_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 7 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*32); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_double + * Electrostatics interaction: None + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: 
Force + */ +void +nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + 
inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings 
*/ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 9 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*72); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings 
*/ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); 
+ + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*147); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); 
+ + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + 
x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* 
Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*360); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + 
x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include 
"kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int 
*vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + 
outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*167); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + 
__m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include 
"kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d 
dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = 
_mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*383); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. 
+ * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth 
= _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = 
_mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over 
neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 9 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*54); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over 
neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = 
_mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*129); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = 
_mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + 
shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + 
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*342); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d 
dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = 
_mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = 
_mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*152); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = 
_mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( 
_mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + 
gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*368); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( 
_mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include 
"types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( 
_mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rvdw_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 9 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + 
inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*70); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + 
__m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rvdw_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void 
+nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = 
xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rvdw_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment 
number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*145); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different + * jnr indices (jnrA,jnrB) corresponding to data put in the two positions of a __m128d SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real 
*vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rvdw_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler 
warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different + * jnr indices (jnrA,jnrB) corresponding to data put in the two positions of a __m128d SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + 
nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rvdw_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = 
_mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Zero-initialize the j indices and coordinate offsets to avoid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*358); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different + * jnr indices (jnrA,jnrB) corresponding to data put in the two positions of a __m128d SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + 
nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rvdw_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = 
_mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Zero-initialize the j indices and coordinate offsets to avoid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different + * jnr indices (jnrA,jnrB) corresponding to data put in the two positions of a __m128d SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = 
mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rvdw_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*170); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void 
+nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different + * jnr indices (jnrA,jnrB) corresponding to data put in the two positions of a __m128d SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( 
_mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rvdw_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" 
+#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different + * jnr indices (jnrA,jnrB) corresponding to data put in the two positions of a __m128d SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d 
dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + 
vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rvdw_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*386); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: 
LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different + * jnr indices (jnrA,jnrB) corresponding to data put in the two positions of a __m128d SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d 
dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = 
_mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + rswitch_scalar = fr->rvdw_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* Setup switch parameters */ + d_scalar = rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwNone_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 
for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 8 flops */ + } 
+ + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*36); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + 
fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwNone_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; 
+ j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*111); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; 
+ j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwNone_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = 
_mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + qq00 = _mm_mul_pd(iq0,jq0); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*324); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to 
particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + 
jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + qq00 = _mm_mul_pd(iq0,jq0); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * 
gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = 
_mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*111); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; 
+ j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRFCut_VdwNone_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = 
_mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*324); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to 
particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + 
jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRF_VdwCSTab_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + 
t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + 
vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 9 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*67); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include 
"types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRF_VdwCSTab_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d 
one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*134); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW 
interaction: CubicSplineTable + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( 
_mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, 
or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + 
real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 
flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*323); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + 
x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Water4-Particle + * Calculate force/pot: 
PotentialAndForce + */ +void +nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d 
dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*155); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec 
* gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = 
_mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. 
+ * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth 
= _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + 
gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*347); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: CubicSplineTable + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real 
*vftab; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * 
Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRF_VdwLJ_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = 
nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 9 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*44); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_double + * 
Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = 
_mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*111); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, 
or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + 
/* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + 
shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 20 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*300); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void 
+nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d 
velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + vdwjidx0A = 2*vdwtype[inr+0]; + qq00 = _mm_mul_pd(iq0,jq0); + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include 
"../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real 
*charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*131); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water4-Particle + * Calculate 
force/pot: Force + */ +void +nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d 
one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. 
for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d 
dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 26 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + 
inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*323); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: LennardJones + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + 
__m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + vdwioffset0 = 2*nvdwtype*vdwtype[inr+0]; + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + vdwjidx0A = 2*vdwtype[inr+0]; + c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]); + c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]); + qq11 = 
_mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRF_VdwNone_GeomP1P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 8 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*32); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Particle-Particle + * Calculate force/pot: Force + */ +void 
+nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx 
+#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water3-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += 
j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*99); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water3-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include 
"vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water3-Water3 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d 
dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + qq00 = _mm_mul_pd(iq0,jq0); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number 
of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*288); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water3-Water3 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset0; + __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; + __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01; + __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + 
__m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0])); + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + + jq0 = _mm_set1_pd(charge[inr+0]); + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + qq00 = _mm_mul_pd(iq0,jq0); + qq01 = _mm_mul_pd(iq0,jq1); + qq02 = _mm_mul_pd(iq0,jq2); + qq10 = _mm_mul_pd(iq1,jq0); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq20 = _mm_mul_pd(iq2,jq0); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include 
"kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water4-Particle + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRF_VdwNone_GeomW4P1_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = 
nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*99); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water4-Particle + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. 
+ */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx0A,vdwjidx0B; + __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; + __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; + __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; + __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include 
"vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" + +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water4-Water4 + * Calculate force/pot: PotentialAndForce + */ +void +nb_kernel_ElecRF_VdwNone_GeomW4W4_VF_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + __m128d 
dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses 19 flops */ + } + + /* Increment number 
of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*288); +} +/* + * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_double + * Electrostatics interaction: ReactionField + * VdW interaction: None + * Geometry: Water4-Water4 + * Calculate force/pot: Force + */ +void +nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_double + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters. + * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + int vdwioffset1; + __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; + int vdwioffset2; + __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; + int vdwioffset3; + __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; + int vdwjidx1A,vdwjidx1B; + __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1; + int vdwjidx2A,vdwjidx2B; + __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2; + int vdwjidx3A,vdwjidx3B; + __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3; + __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11; + __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12; + __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13; + __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21; + 
__m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22; + __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23; + __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31; + __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32; + __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33; + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1])); + iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2])); + iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3])); + + jq1 = _mm_set1_pd(charge[inr+1]); + jq2 = _mm_set1_pd(charge[inr+2]); + jq3 = _mm_set1_pd(charge[inr+3]); + qq11 = _mm_mul_pd(iq1,jq1); + qq12 = _mm_mul_pd(iq1,jq2); + qq13 = _mm_mul_pd(iq1,jq3); + qq21 = _mm_mul_pd(iq2,jq1); + qq22 = _mm_mul_pd(iq2,jq2); + qq23 = _mm_mul_pd(iq2,jq3); + qq31 = _mm_mul_pd(iq3,jq1); + qq32 = _mm_mul_pd(iq3,jq2); + qq33 = _mm_mul_pd(iq3,jq3); + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidx -#endif - -/* Must come directly after config.h */ -#ifdef GMX_THREAD_SHM_FDECOMP -#include -#endif - -#include -#include - -#include "nb_kernel_sse2_double.h" - -/* 
Include double precision SSE intrinsics kernel headers in local directory */ -#include "nb_kernel400_sse2_double.h" -#include "nb_kernel410_sse2_double.h" -#include "nb_kernel430_sse2_double.h" - -#include -#include - -#include "../nb_kernel.h" -#include "nb_kernel_sse2_double.h" - -static nb_kernel_t * -kernellist_sse2_double[eNR_NBKERNEL_NR] = -{nb_kernel400_sse2_double, - nb_kernel410_sse2_double, - nb_kernel430_sse2_double -}; - - -/* Return 0 if SSE support is present, or - * non-zero on failure. - */ -int -nb_kernel_sse2_double_test(FILE * log) -{ - unsigned int level; - unsigned int _eax,_ebx,_ecx,_edx; - int status; - int CPUInfo[4]; - - if(NULL != log) - { - fprintf(log,"Checking CPU SSE2 support... "); - } - - level = 1; -#ifdef _MSC_VER - __cpuid(CPUInfo,1); - - _eax=CPUInfo[0]; - _ebx=CPUInfo[1]; - _ecx=CPUInfo[2]; - _edx=CPUInfo[3]; - -#elif defined(__x86_64__) - /* GCC 64-bit inline asm */ - __asm__ ("push %%rbx\n\tcpuid\n\tpop %%rbx\n" \ - : "=a" (_eax), "=S" (_ebx), "=c" (_ecx), "=d" (_edx) \ - : "0" (level)); -#elif defined(__i386__) - __asm__ ("push %%ebx\n\tcpuid\n\tpop %%ebx\n" \ - : "=a" (_eax), "=S" (_ebx), "=c" (_ecx), "=d" (_edx) \ - : "0" (level)); -#else - if(NULL != log) - { - fprintf(log,"Don't know how to call cpuid() on this system!\n"); - } - _eax=_ebx=_ecx=_edx=0; -#endif - - /* Features: - * - * SSE Bit 25 of edx should be set - * SSE2 Bit 26 of edx should be set - * SSE3 Bit 0 of ecx should be set - * SSE4.1 Bit 19 of ecx should be set - */ - status = (_edx & (1 << 26)) != 0; - - if(NULL != log) - { - fprintf(log,"%s present.", (status==0) ? 
"not" : ""); - } - - /* Return SSE2 status */ - return status; -} - - - - -void -nb_kernel_setup_sse2_double(FILE *log,nb_kernel_t **list) -{ - int i; - nb_kernel_t *p; - - if(nb_kernel_sse2_double_test(log) == 0) - { - return; - } - - for(i=0;i - -#include - -#include "../nb_kernel.h" - -#ifdef __cplusplus -extern "C" { -#endif -#if 0 -} -#endif - - -void -nb_kernel_setup_sse2_double(FILE *log,nb_kernel_t **list); - -#ifdef __cplusplus -} -#endif - -#endif /* _NB_KERNEL_SSE2_DOUBLE_H_ */ +/* + * Note: this file was generated by the Gromacs c kernel generator. + * + * This source code is part of + * + * G R O M A C S + * + * Copyright (c) 2001-2012, The GROMACS Development Team + * + * Gromacs is a library for molecular simulation and trajectory analysis, + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for + * a full list of developers and information, check out http://www.gromacs.org + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) any + * later version. + * + * To help fund GROMACS development, we humbly ask that you cite + * the papers people have written on it - you can find them on the website. 
+ */ +#ifndef nb_kernel_sse2_double_h +#define nb_kernel_sse2_double_h + +#include "../nb_kernel.h" + + +/* List of kernels for this architecture with metadata about them */ +extern nb_kernel_info_t +kernellist_sse2_double[]; + +/* Length of kernellist_sse2_double */ +extern int +kernellist_sse2_double_size; + +#endif diff --git a/src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_template_sse2_double.pre b/src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_template_sse2_double.pre new file mode 100644 index 0000000000..4bf5952cba --- /dev/null +++ b/src/gmxlib/nonbonded/nb_kernel_sse2_double/nb_kernel_template_sse2_double.pre @@ -0,0 +1,1053 @@ +/* #if 0 */ +#error This file must be processed with the Gromacs pre-preprocessor +/* #endif */ +/* #if INCLUDE_HEADER */ +#ifdef HAVE_CONFIG_H +#include +#endif + +#include + +#include "../nb_kernel.h" +#include "types/simple.h" +#include "vec.h" +#include "nrnb.h" + +#include "gmx_math_x86_sse2_double.h" +#include "kernelutil_x86_sse2_double.h" +/* #endif */ + +/* ## List of variables set by the generating script: */ +/* ## */ +/* ## Settings that apply to the entire kernel: */ +/* ## KERNEL_ELEC: String, choice for electrostatic interactions */ +/* ## KERNEL_VDW: String, choice for van der Waals interactions */ +/* ## KERNEL_NAME: String, name of this kernel */ +/* ## KERNEL_VF: String telling if we calculate potential, force, or both */ +/* ## GEOMETRY_I/GEOMETRY_J: String, name of each geometry, e.g. 'Water3' or '1Particle' */ +/* ## */ +/* ## Settings that apply to particles in the outer (I) or inner (J) loops: */ +/* ## PARTICLES_I[]/ Arrays with lists of i/j particles to use in kernel. It is */ +/* ## PARTICLES_J[]: just [0] for particle geometry, but can be longer for water */ +/* ## PARTICLES_ELEC_I[]/ Arrays with lists of i/j particles that have electrostatics */ +/* ## PARTICLES_ELEC_J[]: interactions that should be calculated in this kernel.
*/ +/* ## PARTICLES_VDW_I[]/ Arrays with the list of i/j particles that have VdW */ +/* ## PARTICLES_VDW_J[]: interactions that should be calculated in this kernel. */ +/* ## */ +/* ## Settings for pairs of interactions (e.g. 2nd i particle against 1st j particle) */ +/* ## PAIRS_IJ[]: Array with (i,j) tuples of pairs for which interactions */ +/* ## should be calculated in this kernel. Zero-charge particles */ +/* ## do not have interactions with particles without vdw, and */ +/* ## Vdw-only interactions are not evaluated in a no-vdw-kernel. */ +/* ## INTERACTION_FLAGS[][]: 2D matrix, dimension e.g. 3*3 for water-water interactions. */ +/* ## For each i-j pair, the element [I][J] is a list of strings */ +/* ## defining properties/flags of this interaction. Examples */ +/* ## include 'electrostatics'/'vdw' if that type of interaction */ +/* ## should be evaluated, 'rsq'/'rinv'/'rinvsq' if those values */ +/* ## are needed, and 'exactcutoff' or 'shift','switch' to */ +/* ## decide if the force/potential should be modified. This way */ +/* ## we only calculate values absolutely needed for each case. */ + +/* ## Calculate the size and offset for (merged/interleaved) table data */ + +/* + * Gromacs nonbonded kernel: {KERNEL_NAME} + * Electrostatics interaction: {KERNEL_ELEC} + * VdW interaction: {KERNEL_VDW} + * Geometry: {GEOMETRY_I}-{GEOMETRY_J} + * Calculate force/pot: {KERNEL_VF} + */ +void +{KERNEL_NAME} + (t_nblist * gmx_restrict nlist, + rvec * gmx_restrict xx, + rvec * gmx_restrict ff, + t_forcerec * gmx_restrict fr, + t_mdatoms * gmx_restrict mdatoms, + nb_kernel_data_t * gmx_restrict kernel_data, + t_nrnb * gmx_restrict nrnb) +{ + /* ## Not all variables are used for all kernels, but any optimizing compiler fixes that, */ + /* ## so there is no point in going to extremes to exclude variables that are not needed. */ + /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or + * just 0 for non-waters.
+ * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different + * jnr indices corresponding to data put in the four positions in the SIMD register. + */ + int i_shift_offset,i_coord_offset,outeriter,inneriter; + int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx; + int jnrA,jnrB; + int j_coord_offsetA,j_coord_offsetB; + int *iinr,*jindex,*jjnr,*shiftidx,*gid; + real rcutoff_scalar; + real *shiftvec,*fshift,*x,*f; + __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; + /* #for I in PARTICLES_I */ + int vdwioffset{I}; + __m128d ix{I},iy{I},iz{I},fix{I},fiy{I},fiz{I},iq{I},isai{I}; + /* #endfor */ + /* #for J in PARTICLES_J */ + int vdwjidx{J}A,vdwjidx{J}B; + __m128d jx{J},jy{J},jz{J},fjx{J},fjy{J},fjz{J},jq{J},isaj{J}; + /* #endfor */ + /* #for I,J in PAIRS_IJ */ + __m128d dx{I}{J},dy{I}{J},dz{I}{J},rsq{I}{J},rinv{I}{J},rinvsq{I}{J},r{I}{J},qq{I}{J},c6_{I}{J},c12_{I}{J}; + /* #endfor */ + /* #if KERNEL_ELEC != 'None' */ + __m128d velec,felec,velecsum,facel,crf,krf,krf2; + real *charge; + /* #endif */ + /* #if 'GeneralizedBorn' in KERNEL_ELEC */ + __m128i gbitab; + __m128d vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp; + __m128d minushalf = _mm_set1_pd(-0.5); + real *invsqrta,*dvda,*gbtab; + /* #endif */ + /* #if KERNEL_VDW != 'None' */ + int nvdwtype; + __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; + int *vdwtype; + real *vdwparam; + __m128d one_sixth = _mm_set1_pd(1.0/6.0); + __m128d one_twelfth = _mm_set1_pd(1.0/12.0); + /* #endif */ + /* #if 'Table' in KERNEL_ELEC or 'GeneralizedBorn' in KERNEL_ELEC or 'Table' in KERNEL_VDW */ + __m128i vfitab; + __m128i ifour = _mm_set1_epi32(4); + __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; + real *vftab; + /* #endif */ + /* #if 'Ewald' in KERNEL_ELEC */ + __m128i ewitab; + __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; + real *ewtab; + /* #endif */ + /* #if 
'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */ + __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; + real rswitch_scalar,d_scalar; + /* #endif */ + __m128d dummy_mask,cutoff_mask; + __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); + __m128d one = _mm_set1_pd(1.0); + __m128d two = _mm_set1_pd(2.0); + x = xx[0]; + f = ff[0]; + + nri = nlist->nri; + iinr = nlist->iinr; + jindex = nlist->jindex; + jjnr = nlist->jjnr; + shiftidx = nlist->shift; + gid = nlist->gid; + shiftvec = fr->shift_vec[0]; + fshift = fr->fshift[0]; + /* #if KERNEL_ELEC != 'None' */ + facel = _mm_set1_pd(fr->epsfac); + charge = mdatoms->chargeA; + /* #if 'ReactionField' in KERNEL_ELEC */ + krf = _mm_set1_pd(fr->ic->k_rf); + krf2 = _mm_set1_pd(fr->ic->k_rf*2.0); + crf = _mm_set1_pd(fr->ic->c_rf); + /* #endif */ + /* #endif */ + /* #if KERNEL_VDW != 'None' */ + nvdwtype = fr->ntype; + vdwparam = fr->nbfp; + vdwtype = mdatoms->typeA; + /* #endif */ + + /* #if 'Table' in KERNEL_ELEC and 'Table' in KERNEL_VDW */ + vftab = kernel_data->table_elec_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale); + /* #elif 'Table' in KERNEL_ELEC */ + vftab = kernel_data->table_elec->data; + vftabscale = _mm_set1_pd(kernel_data->table_elec->scale); + /* #elif 'Table' in KERNEL_VDW */ + vftab = kernel_data->table_vdw->data; + vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale); + /* #endif */ + + /* #if 'Ewald' in KERNEL_ELEC */ + sh_ewald = _mm_set1_pd(fr->ic->sh_ewald); + /* #if KERNEL_VF=='Force' and KERNEL_MOD_ELEC!='PotentialSwitch' */ + ewtab = fr->ic->tabq_coul_F; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + /* #else */ + ewtab = fr->ic->tabq_coul_FDV0; + ewtabscale = _mm_set1_pd(fr->ic->tabq_scale); + ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale); + /* #endif */ + /* #endif */ + + /* #if KERNEL_ELEC=='GeneralizedBorn' */ + invsqrta = fr->invsqrta; + 
dvda = fr->dvda; + gbtabscale = _mm_set1_pd(fr->gbtab.scale); + gbtab = fr->gbtab.data; + gbinvepsdiff = _mm_set1_pd((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent)); + /* #endif */ + + /* #if 'Water' in GEOMETRY_I */ + /* Setup water-specific parameters */ + inr = nlist->iinr[0]; + /* #for I in PARTICLES_ELEC_I */ + iq{I} = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+{I}])); + /* #endfor */ + /* #for I in PARTICLES_VDW_I */ + vdwioffset{I} = 2*nvdwtype*vdwtype[inr+{I}]; + /* #endfor */ + /* #endif */ + + /* #if 'Water' in GEOMETRY_J */ + /* #for J in PARTICLES_ELEC_J */ + jq{J} = _mm_set1_pd(charge[inr+{J}]); + /* #endfor */ + /* #for J in PARTICLES_VDW_J */ + vdwjidx{J}A = 2*vdwtype[inr+{J}]; + /* #endfor */ + /* #for I,J in PAIRS_IJ */ + /* #if 'electrostatics' in INTERACTION_FLAGS[I][J] */ + qq{I}{J} = _mm_mul_pd(iq{I},jq{J}); + /* #endif */ + /* #if 'vdw' in INTERACTION_FLAGS[I][J] */ + c6_{I}{J} = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A]); + c12_{I}{J} = _mm_set1_pd(vdwparam[vdwioffset{I}+vdwjidx{J}A+1]); + /* #endif */ + /* #endfor */ + /* #endif */ + + /* #if KERNEL_MOD_ELEC!='None' or KERNEL_MOD_VDW!='None' */ + /* #if KERNEL_ELEC!='None' */ + /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ + rcutoff_scalar = fr->rcoulomb; + /* #else */ + rcutoff_scalar = fr->rvdw; + /* #endif */ + rcutoff = _mm_set1_pd(rcutoff_scalar); + rcutoff2 = _mm_mul_pd(rcutoff,rcutoff); + /* #endif */ + + /* #if KERNEL_MOD_VDW=='PotentialShift' */ + sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6); + rvdw = _mm_set1_pd(fr->rvdw); + /* #endif */ + + /* #if 'PotentialSwitch' in [KERNEL_MOD_ELEC,KERNEL_MOD_VDW] */ + /* #if KERNEL_MOD_ELEC=='PotentialSwitch' */ + rswitch_scalar = fr->rcoulomb_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* #else */ + rswitch_scalar = fr->rvdw_switch; + rswitch = _mm_set1_pd(rswitch_scalar); + /* #endif */ + /* Setup switch parameters */ + d_scalar = 
rcutoff_scalar-rswitch_scalar; + d = _mm_set1_pd(d_scalar); + swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar)); + swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + /* #if 'Force' in KERNEL_VF */ + swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar)); + swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar)); + swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar)); + /* #endif */ + /* #endif */ + + /* Avoid stupid compiler warnings */ + jnrA = jnrB = 0; + j_coord_offsetA = 0; + j_coord_offsetB = 0; + + /* ## Keep track of the floating point operations we issue for reporting! */ + /* #define OUTERFLOPS 0 */ + outeriter = 0; + inneriter = 0; + + /* Start outer loop over neighborlists */ + for(iidx=0; iidxenergygrp_elec+ggid); + /* #define OUTERFLOPS OUTERFLOPS+1 */ + /* #endif */ + /* #if 'GeneralizedBorn' in KERNEL_ELEC */ + gmx_mm_update_1pot_pd(vgbsum,kernel_data->energygrp_polarization+ggid); + /* #define OUTERFLOPS OUTERFLOPS+1 */ + /* #endif */ + /* #if KERNEL_VDW != 'None' */ + gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid); + /* #define OUTERFLOPS OUTERFLOPS+1 */ + /* #endif */ + /* #endif */ + /* #if 'GeneralizedBorn' in KERNEL_ELEC and 'Force' in KERNEL_VF */ + dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai{I},isai{I})); + gmx_mm_update_1pot_pd(dvdasum,dvda+inr); + /* #endif */ + + /* Increment number of inner iterations */ + inneriter += j_index_end - j_index_start; + + /* Outer loop uses {OUTERFLOPS} flops */ + } + + /* Increment number of outer iterations */ + outeriter += nri; + + /* Update outer/inner flops */ + /* ## NB: This is not important, it just affects the flopcount. However, since our preprocessor is */ + /* ## primitive and replaces aggressively even in strings inside these directives, we need to */ + /* ## assemble the main part of the name (containing KERNEL/ELEC/VDW) directly in the source. 
*/ + /* #if GEOMETRY_I == 'Water3' */ + /* #define ISUFFIX '_W3' */ + /* #elif GEOMETRY_I == 'Water4' */ + /* #define ISUFFIX '_W4' */ + /* #else */ + /* #define ISUFFIX '' */ + /* #endif */ + /* #if GEOMETRY_J == 'Water3' */ + /* #define JSUFFIX 'W3' */ + /* #elif GEOMETRY_J == 'Water4' */ + /* #define JSUFFIX 'W4' */ + /* #else */ + /* #define JSUFFIX '' */ + /* #endif */ + /* #if 'PotentialAndForce' in KERNEL_VF */ + /* #define VFSUFFIX '_VF' */ + /* #elif 'Potential' in KERNEL_VF */ + /* #define VFSUFFIX '_V' */ + /* #else */ + /* #define VFSUFFIX '_F' */ + /* #endif */ + + /* #if KERNEL_ELEC != 'None' and KERNEL_VDW != 'None' */ + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS}); + /* #elif KERNEL_ELEC != 'None' */ + inc_nrnb(nrnb,eNR_NBKERNEL_ELEC{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS}); + /* #else */ + inc_nrnb(nrnb,eNR_NBKERNEL_VDW{ISUFFIX}{JSUFFIX}{VFSUFFIX},outeriter*{OUTERFLOPS} + inneriter*{INNERFLOPS}); + /* #endif */ +} diff --git a/src/gmxlib/nonbonded/nonbonded.c b/src/gmxlib/nonbonded/nonbonded.c index 7607e0888f..75aae6193e 100644 --- a/src/gmxlib/nonbonded/nonbonded.c +++ b/src/gmxlib/nonbonded/nonbonded.c @@ -84,7 +84,9 @@ #if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE) # include "nb_kernel_avx_256_single/nb_kernel_avx_256_single.h" #endif - +#if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE) +# include "nb_kernel_sse2_double/nb_kernel_sse2_double.h" +#endif #ifdef GMX_THREAD_MPI static tMPI_Thread_mutex_t nonbonded_setup_mutex = TMPI_THREAD_MUTEX_INITIALIZER; @@ -111,6 +113,7 @@ gmx_nonbonded_setup(FILE * fplog, if(!(fr!=NULL && fr->use_cpu_acceleration==FALSE)) { /* Add interaction-specific kernels for different architectures */ + /* Single precision */ #if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE) nb_kernel_list_add_kernels(kernellist_sse2_single,kernellist_sse2_single_size); #endif 
@@ -123,6 +126,10 @@ gmx_nonbonded_setup(FILE * fplog, #if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE) nb_kernel_list_add_kernels(kernellist_avx_256_single,kernellist_avx_256_single_size); #endif + /* Double precision */ +#if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE) + nb_kernel_list_add_kernels(kernellist_sse2_double,kernellist_sse2_double_size); +#endif ; /* empty statement to avoid a completely empty block */ } } @@ -156,6 +163,7 @@ gmx_nonbonded_set_kernel_pointers(FILE *log, t_nblist *nl) } arch_and_padding[] = { + /* Single precision */ #if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE) { "avx_256_single", 8 }, #endif @@ -168,6 +176,14 @@ gmx_nonbonded_set_kernel_pointers(FILE *log, t_nblist *nl) #if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE) { "sse2_single", 4 }, #endif + /* Double precision */ +#if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE) + /* Sic. Double precision SSE2 does not require neighbor list padding, + * since the kernels execute a loop unrolled by a factor of 2, followed by + * a possible single odd-element epilogue. + */ + { "sse2_double", 1 }, +#endif { "c", 1 }, }; int narch = asize(arch_and_padding); -- 2.11.4.GIT