From 26f57216e9dadfb1dba644d60b552667469d183f Mon Sep 17 00:00:00 2001 From: Magnus Lundborg Date: Wed, 29 Nov 2017 13:26:49 +0100 Subject: [PATCH] SIMD accelerated Urey-Bradley Enable SIMD for Urey-Bradley without energies and shift forces. Change-Id: I8bb94ad4be6db41c0b3d8e820ceacc5c1005146e --- src/gromacs/listed-forces/bonded.cpp | 140 ++++++++++++++++++++++++++++ src/gromacs/listed-forces/bonded.h | 11 +++ src/gromacs/listed-forces/listed-forces.cpp | 12 +++ 3 files changed, 163 insertions(+) diff --git a/src/gromacs/listed-forces/bonded.cpp b/src/gromacs/listed-forces/bonded.cpp index f752d8b2c2..0c08d2f861 100644 --- a/src/gromacs/listed-forces/bonded.cpp +++ b/src/gromacs/listed-forces/bonded.cpp @@ -1295,6 +1295,146 @@ real urey_bradley(int nbonds, return vtot; } +#if GMX_SIMD_HAVE_REAL + +/* As urey_bradley, but using SIMD to calculate many potentials at once. + * This routines does not calculate energies and shift forces. + */ +void urey_bradley_noener_simd(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec4 f[], + const t_pbc *pbc, const t_graph gmx_unused *g, + real gmx_unused lambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index) +{ + constexpr int nfa1 = 4; + GMX_ALIGNED(int, GMX_SIMD_REAL_WIDTH) ai[GMX_SIMD_REAL_WIDTH]; + GMX_ALIGNED(int, GMX_SIMD_REAL_WIDTH) aj[GMX_SIMD_REAL_WIDTH]; + GMX_ALIGNED(int, GMX_SIMD_REAL_WIDTH) ak[GMX_SIMD_REAL_WIDTH]; + GMX_ALIGNED(real, GMX_SIMD_REAL_WIDTH) coeff[4*GMX_SIMD_REAL_WIDTH]; + GMX_ALIGNED(real, GMX_SIMD_REAL_WIDTH) pbc_simd[9*GMX_SIMD_REAL_WIDTH]; + + set_pbc_simd(pbc, pbc_simd); + + /* nbonds is the number of angles times nfa1, here we step GMX_SIMD_REAL_WIDTH angles */ + for (int i = 0; i < nbonds; i += GMX_SIMD_REAL_WIDTH*nfa1) + { + /* Collect atoms for GMX_SIMD_REAL_WIDTH angles. + * iu indexes into forceatoms, we should not let iu go beyond nbonds. + */ + int iu = i; + for (int s = 0; s < GMX_SIMD_REAL_WIDTH; s++) + { + const int type = forceatoms[iu]; + ai[s] = forceatoms[iu+1]; + aj[s] = forceatoms[iu+2]; + ak[s] = forceatoms[iu+3]; + + /* At the end fill the arrays with the last atoms and 0 params */ + if (i + s*nfa1 < nbonds) + { + coeff[s] = forceparams[type].u_b.kthetaA; + coeff[GMX_SIMD_REAL_WIDTH+s] = forceparams[type].u_b.thetaA; + coeff[GMX_SIMD_REAL_WIDTH*2+s] = forceparams[type].u_b.kUBA; + coeff[GMX_SIMD_REAL_WIDTH*3+s] = forceparams[type].u_b.r13A; + + if (iu + nfa1 < nbonds) + { + iu += nfa1; + } + } + else + { + coeff[s] = 0; + coeff[GMX_SIMD_REAL_WIDTH+s] = 0; + coeff[GMX_SIMD_REAL_WIDTH*2+s] = 0; + coeff[GMX_SIMD_REAL_WIDTH*3+s] = 0; + } + } + + SimdReal xi_S, yi_S, zi_S; + SimdReal xj_S, yj_S, zj_S; + SimdReal xk_S, yk_S, zk_S; + + /* Store the non PBC corrected distances packed and aligned */ + gatherLoadUTranspose<3>(reinterpret_cast(x), ai, &xi_S, &yi_S, &zi_S); + gatherLoadUTranspose<3>(reinterpret_cast(x), aj, &xj_S, &yj_S, &zj_S); + gatherLoadUTranspose<3>(reinterpret_cast(x), ak, &xk_S, &yk_S, &zk_S); + SimdReal rijx_S = xi_S - xj_S; + SimdReal rijy_S = yi_S - yj_S; + SimdReal rijz_S = zi_S - zj_S; + SimdReal rkjx_S = xk_S - xj_S; + SimdReal rkjy_S = yk_S - yj_S; + SimdReal rkjz_S = zk_S - zj_S; + SimdReal rikx_S = xi_S - xk_S; + SimdReal riky_S = yi_S - yk_S; + SimdReal rikz_S = zi_S - zk_S; + + const SimdReal ktheta_S = load(coeff); + const SimdReal theta0_S = load(coeff+GMX_SIMD_REAL_WIDTH) * DEG2RAD; + const SimdReal kUB_S = load(coeff+2*GMX_SIMD_REAL_WIDTH); + const SimdReal r13_S = load(coeff+3*GMX_SIMD_REAL_WIDTH); + + pbc_correct_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc_simd); + pbc_correct_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc_simd); + pbc_correct_dx_simd(&rikx_S, &riky_S, &rikz_S, pbc_simd); + + const SimdReal rij_rkj_S = iprod(rijx_S, rijy_S, rijz_S, + rkjx_S, rkjy_S, rkjz_S); + + const SimdReal dr2_S = iprod(rikx_S, riky_S, rikz_S, + rikx_S, riky_S, rikz_S); + + const SimdReal nrij2_S = norm2(rijx_S, rijy_S, rijz_S); + const SimdReal nrkj2_S = norm2(rkjx_S, rkjy_S, rkjz_S); + + const SimdReal nrij_1_S = invsqrt(nrij2_S); + const SimdReal nrkj_1_S = invsqrt(nrkj2_S); + const SimdReal invdr2_S = invsqrt(dr2_S); + const SimdReal dr_S = dr2_S*invdr2_S; + + constexpr real min_one_plus_eps = -1.0 + 2.0*GMX_REAL_EPS; // Smallest number > -1 + + /* To allow for 180 degrees, we take the max of cos and -1 + 1bit, + * so we can safely get the 1/sin from 1/sqrt(1 - cos^2). + * This also ensures that rounding errors would cause the argument + * of simdAcos to be < -1. + * Note that we do not take precautions for cos(0)=1, so the bonds + * in an angle should not align at an angle of 0 degrees. + */ + const SimdReal cos_S = max(rij_rkj_S * nrij_1_S * nrkj_1_S, min_one_plus_eps); + + const SimdReal theta_S = acos(cos_S); + const SimdReal invsin_S = invsqrt( 1.0 - cos_S * cos_S ); + const SimdReal st_S = ktheta_S * (theta0_S - theta_S) * invsin_S; + const SimdReal sth_S = st_S * cos_S; + + const SimdReal cik_S = st_S * nrij_1_S * nrkj_1_S; + const SimdReal cii_S = sth_S * nrij_1_S * nrij_1_S; + const SimdReal ckk_S = sth_S * nrkj_1_S * nrkj_1_S; + + const SimdReal sUB_S = kUB_S * (r13_S - dr_S) * invdr2_S; + + const SimdReal f_ikx_S = sUB_S * rikx_S; + const SimdReal f_iky_S = sUB_S * riky_S; + const SimdReal f_ikz_S = sUB_S * rikz_S; + + const SimdReal f_ix_S = fnma(cik_S, rkjx_S, cii_S * rijx_S) + f_ikx_S; + const SimdReal f_iy_S = fnma(cik_S, rkjy_S, cii_S * rijy_S) + f_iky_S; + const SimdReal f_iz_S = fnma(cik_S, rkjz_S, cii_S * rijz_S) + f_ikz_S; + const SimdReal f_kx_S = fnma(cik_S, rijx_S, ckk_S * rkjx_S) - f_ikx_S; + const SimdReal f_ky_S = fnma(cik_S, rijy_S, ckk_S * rkjy_S) - f_iky_S; + const SimdReal f_kz_S = fnma(cik_S, rijz_S, ckk_S * rkjz_S) - f_ikz_S; + + transposeScatterIncrU<4>(reinterpret_cast(f), ai, f_ix_S, f_iy_S, f_iz_S); + transposeScatterDecrU<4>(reinterpret_cast(f), aj, f_ix_S + f_kx_S, f_iy_S + f_ky_S, f_iz_S + f_kz_S); + transposeScatterIncrU<4>(reinterpret_cast(f), ak, f_kx_S, f_ky_S, f_kz_S); + } +} + +#endif // GMX_SIMD_HAVE_REAL + real quartic_angles(int nbonds, const t_iatom forceatoms[], const t_iparams forceparams[], const rvec x[], rvec4 f[], rvec fshift[], diff --git a/src/gromacs/listed-forces/bonded.h b/src/gromacs/listed-forces/bonded.h index 2b5d737296..840c779c84 100644 --- a/src/gromacs/listed-forces/bonded.h +++ b/src/gromacs/listed-forces/bonded.h @@ -136,6 +136,17 @@ void const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, int gmx_unused *global_atom_index); +/* As urey_bradley, but using SIMD to calculate many potentials at once. + * This routines does not calculate energies and shift forces. + */ +void urey_bradley_noener_simd(int nbonds, + const t_iatom forceatoms[], const t_iparams forceparams[], + const rvec x[], rvec4 f[], + const t_pbc *pbc, const t_graph gmx_unused *g, + real gmx_unused lambda, + const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd, + int gmx_unused *global_atom_index); + /* As pdihs_noener(), but using SIMD to calculate many dihedrals at once. */ void pdihs_noener_simd(int nbonds, diff --git a/src/gromacs/listed-forces/listed-forces.cpp b/src/gromacs/listed-forces/listed-forces.cpp index 54ed33c5a8..0de7f7b640 100644 --- a/src/gromacs/listed-forces/listed-forces.cpp +++ b/src/gromacs/listed-forces/listed-forces.cpp @@ -320,6 +320,18 @@ calc_one_bond(int thread, global_atom_index); v = 0; } + + else if (ftype == F_UREY_BRADLEY && bUseSIMD && + !bCalcEnerVir && fr->efep == efepNO) + { + /* No energies, shift forces, dvdl */ + urey_bradley_noener_simd(nbn, idef->il[ftype].iatoms+nb0, + idef->iparams, + x, f, + pbc, g, lambda[efptFTYPE], md, fcd, + global_atom_index); + v = 0; + } #endif else if (ftype == F_PDIHS && !bCalcEnerVir && fr->efep == efepNO) -- 2.11.4.GIT