Improve workload data structures' docs
[gromacs.git] / src / gromacs / ewald / pme_solve.cpp
blob23d181367e0eac9ba05916eec9927ede11ede298
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
38 #include "gmxpre.h"
40 #include "pme_solve.h"
42 #include <cmath>
44 #include "gromacs/fft/parallel_3dfft.h"
45 #include "gromacs/math/units.h"
46 #include "gromacs/math/utilities.h"
47 #include "gromacs/math/vec.h"
48 #include "gromacs/simd/simd.h"
49 #include "gromacs/simd/simd_math.h"
50 #include "gromacs/utility/arrayref.h"
51 #include "gromacs/utility/exceptions.h"
52 #include "gromacs/utility/smalloc.h"
54 #include "pme_internal.h"
56 #if GMX_SIMD_HAVE_REAL
57 /* Turn on arbitrary width SIMD intrinsics for PME solve */
58 # define PME_SIMD_SOLVE
59 #endif
61 using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
63 struct pme_solve_work_t
65 /* work data for solve_pme */
66 int nalloc;
67 real * mhx;
68 real * mhy;
69 real * mhz;
70 real * m2;
71 real * denom;
72 real * tmp1_alloc;
73 real * tmp1;
74 real * tmp2;
75 real * eterm;
76 real * m2inv;
78 real energy_q;
79 matrix vir_q;
80 real energy_lj;
81 matrix vir_lj;
84 #ifdef PME_SIMD_SOLVE
85 constexpr int c_simdWidth = GMX_SIMD_REAL_WIDTH;
86 #else
87 /* We can use any alignment > 0, so we use 4 */
88 constexpr int c_simdWidth = 4;
89 #endif
91 /* Returns the smallest number >= \p that is a multiple of \p factor, \p factor must be a power of 2 */
92 template <unsigned int factor>
93 static size_t roundUpToMultipleOfFactor(size_t number)
95 static_assert(factor > 0 && (factor & (factor - 1)) == 0, "factor should be >0 and a power of 2");
97 /* We need to add a most factor-1 and because factor is a power of 2,
98 * we get the result by masking out the bits corresponding to factor-1.
100 return (number + factor - 1) & ~(factor - 1);
103 /* Allocate an aligned pointer for SIMD operations, including extra elements
104 * at the end for padding.
106 /* TODO: Replace this SIMD reallocator with a general, C++ solution */
107 static void reallocSimdAlignedAndPadded(real **ptr, int unpaddedNumElements)
109 sfree_aligned(*ptr);
110 snew_aligned(*ptr, roundUpToMultipleOfFactor<c_simdWidth>(unpaddedNumElements), c_simdWidth*sizeof(real));
113 static void realloc_work(struct pme_solve_work_t *work, int nkx)
115 if (nkx > work->nalloc)
117 work->nalloc = nkx;
118 srenew(work->mhx, work->nalloc);
119 srenew(work->mhy, work->nalloc);
120 srenew(work->mhz, work->nalloc);
121 srenew(work->m2, work->nalloc);
122 reallocSimdAlignedAndPadded(&work->denom, work->nalloc);
123 reallocSimdAlignedAndPadded(&work->tmp1, work->nalloc);
124 reallocSimdAlignedAndPadded(&work->tmp2, work->nalloc);
125 reallocSimdAlignedAndPadded(&work->eterm, work->nalloc);
126 srenew(work->m2inv, work->nalloc);
128 /* Init all allocated elements of denom to 1 to avoid 1/0 exceptions
129 * of simd padded elements.
131 for (size_t i = 0; i < roundUpToMultipleOfFactor<c_simdWidth>(work->nalloc ); i++)
133 work->denom[i] = 1;
138 void pme_init_all_work(struct pme_solve_work_t **work, int nthread, int nkx)
140 /* Use fft5d, order after FFT is y major, z, x minor */
142 snew(*work, nthread);
143 /* Allocate the work arrays thread local to optimize memory access */
144 #pragma omp parallel for num_threads(nthread) schedule(static)
145 for (int thread = 0; thread < nthread; thread++)
149 realloc_work(&((*work)[thread]), nkx);
151 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
155 static void free_work(struct pme_solve_work_t *work)
157 if (work)
159 sfree(work->mhx);
160 sfree(work->mhy);
161 sfree(work->mhz);
162 sfree(work->m2);
163 sfree_aligned(work->denom);
164 sfree_aligned(work->tmp1);
165 sfree_aligned(work->tmp2);
166 sfree_aligned(work->eterm);
167 sfree(work->m2inv);
171 void pme_free_all_work(struct pme_solve_work_t **work, int nthread)
173 if (*work)
175 for (int thread = 0; thread < nthread; thread++)
177 free_work(&(*work)[thread]);
180 sfree(*work);
181 *work = nullptr;
184 void get_pme_ener_vir_q(pme_solve_work_t *work, int nthread, PmeOutput *output)
186 GMX_ASSERT(output != nullptr, "Need valid output buffer");
187 /* This function sums output over threads and should therefore
188 * only be called after thread synchronization.
190 output->coulombEnergy_ = work[0].energy_q;
191 copy_mat(work[0].vir_q, output->coulombVirial_);
193 for (int thread = 1; thread < nthread; thread++)
195 output->coulombEnergy_ += work[thread].energy_q;
196 m_add(output->coulombVirial_, work[thread].vir_q, output->coulombVirial_);
200 void get_pme_ener_vir_lj(pme_solve_work_t *work, int nthread, PmeOutput *output)
202 GMX_ASSERT(output != nullptr, "Need valid output buffer");
203 /* This function sums output over threads and should therefore
204 * only be called after thread synchronization.
206 output->lennardJonesEnergy_ = work[0].energy_lj;
207 copy_mat(work[0].vir_lj, output->lennardJonesVirial_);
209 for (int thread = 1; thread < nthread; thread++)
211 output->lennardJonesEnergy_ += work[thread].energy_lj;
212 m_add(output->lennardJonesVirial_, work[thread].vir_lj, output->lennardJonesVirial_);
216 #if defined PME_SIMD_SOLVE
217 /* Calculate exponentials through SIMD */
218 inline static void calc_exponentials_q(int /*unused*/, int /*unused*/, real f, ArrayRef<const SimdReal> d_aligned, ArrayRef<const SimdReal> r_aligned, ArrayRef<SimdReal> e_aligned)
221 SimdReal f_simd(f);
222 SimdReal tmp_d1, tmp_r, tmp_e;
224 /* We only need to calculate from start. But since start is 0 or 1
225 * and we want to use aligned loads/stores, we always start from 0.
227 GMX_ASSERT(d_aligned.size() == r_aligned.size(), "d and r must have same size");
228 GMX_ASSERT(d_aligned.size() == e_aligned.size(), "d and e must have same size");
229 for (size_t kx = 0; kx != d_aligned.size(); ++kx)
231 tmp_d1 = d_aligned[kx];
232 tmp_r = r_aligned[kx];
233 tmp_r = gmx::exp(tmp_r);
234 tmp_e = f_simd / tmp_d1;
235 tmp_e = tmp_e * tmp_r;
236 e_aligned[kx] = tmp_e;
240 #else
241 inline static void calc_exponentials_q(int start, int end, real f, ArrayRef<real> d, ArrayRef<real> r, ArrayRef<real> e)
243 GMX_ASSERT(d.size() == r.size(), "d and r must have same size");
244 GMX_ASSERT(d.size() == e.size(), "d and e must have same size");
245 int kx;
246 for (kx = start; kx < end; kx++)
248 d[kx] = 1.0/d[kx];
250 for (kx = start; kx < end; kx++)
252 r[kx] = std::exp(r[kx]);
254 for (kx = start; kx < end; kx++)
256 e[kx] = f*r[kx]*d[kx];
259 #endif
261 #if defined PME_SIMD_SOLVE
262 /* Calculate exponentials through SIMD */
263 inline static void calc_exponentials_lj(int /*unused*/, int /*unused*/, ArrayRef<SimdReal> r_aligned, ArrayRef<SimdReal> factor_aligned, ArrayRef<SimdReal> d_aligned)
265 SimdReal tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
266 const SimdReal sqr_PI = sqrt(SimdReal(M_PI));
268 GMX_ASSERT(d_aligned.size() == r_aligned.size(), "d and r must have same size");
269 GMX_ASSERT(d_aligned.size() == factor_aligned.size(), "d and factor must have same size");
270 for (size_t kx = 0; kx != d_aligned.size(); ++kx)
272 /* We only need to calculate from start. But since start is 0 or 1
273 * and we want to use aligned loads/stores, we always start from 0.
275 tmp_d = d_aligned[kx];
276 d_inv = SimdReal(1.0) / tmp_d;
277 d_aligned[kx] = d_inv;
278 tmp_r = r_aligned[kx];
279 tmp_r = gmx::exp(tmp_r);
280 r_aligned[kx] = tmp_r;
281 tmp_mk = factor_aligned[kx];
282 tmp_fac = sqr_PI * tmp_mk * erfc(tmp_mk);
283 factor_aligned[kx] = tmp_fac;
286 #else
287 inline static void calc_exponentials_lj(int start, int end, ArrayRef<real> r, ArrayRef<real> tmp2, ArrayRef<real> d)
289 int kx;
290 real mk;
291 GMX_ASSERT(d.size() == r.size(), "d and r must have same size");
292 GMX_ASSERT(d.size() == tmp2.size(), "d and tmp2 must have same size");
293 for (kx = start; kx < end; kx++)
295 d[kx] = 1.0/d[kx];
298 for (kx = start; kx < end; kx++)
300 r[kx] = std::exp(r[kx]);
303 for (kx = start; kx < end; kx++)
305 mk = tmp2[kx];
306 tmp2[kx] = sqrt(M_PI)*mk*std::erfc(mk);
309 #endif
311 #if defined PME_SIMD_SOLVE
312 using PME_T = SimdReal;
313 #else
314 using PME_T = real;
315 #endif
317 int solve_pme_yzx(const gmx_pme_t *pme, t_complex *grid, real vol,
318 gmx_bool bEnerVir,
319 int nthread, int thread)
321 /* do recip sum over local cells in grid */
322 /* y major, z middle, x minor or continuous */
323 t_complex *p0;
324 int kx, ky, kz, maxkx, maxky;
325 int nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
326 real mx, my, mz;
327 real ewaldcoeff = pme->ewaldcoeff_q;
328 real factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
329 real ets2, struct2, vfactor, ets2vf;
330 real d1, d2, energy = 0;
331 real by, bz;
332 real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
333 real rxx, ryx, ryy, rzx, rzy, rzz;
334 struct pme_solve_work_t *work;
335 real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
336 real mhxk, mhyk, mhzk, m2k;
337 real corner_fac;
338 ivec complex_order;
339 ivec local_ndata, local_offset, local_size;
340 real elfac;
342 elfac = ONE_4PI_EPS0/pme->epsilon_r;
344 nx = pme->nkx;
345 ny = pme->nky;
346 nz = pme->nkz;
348 /* Dimensions should be identical for A/B grid, so we just use A here */
349 gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA],
350 complex_order,
351 local_ndata,
352 local_offset,
353 local_size);
355 rxx = pme->recipbox[XX][XX];
356 ryx = pme->recipbox[YY][XX];
357 ryy = pme->recipbox[YY][YY];
358 rzx = pme->recipbox[ZZ][XX];
359 rzy = pme->recipbox[ZZ][YY];
360 rzz = pme->recipbox[ZZ][ZZ];
362 GMX_ASSERT(rxx != 0.0, "Someone broke the reciprocal box again");
364 maxkx = (nx+1)/2;
365 maxky = (ny+1)/2;
367 work = &pme->solve_work[thread];
368 mhx = work->mhx;
369 mhy = work->mhy;
370 mhz = work->mhz;
371 m2 = work->m2;
372 denom = work->denom;
373 tmp1 = work->tmp1;
374 eterm = work->eterm;
375 m2inv = work->m2inv;
377 iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread;
378 iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
380 for (iyz = iyz0; iyz < iyz1; iyz++)
382 iy = iyz/local_ndata[ZZ];
383 iz = iyz - iy*local_ndata[ZZ];
385 ky = iy + local_offset[YY];
387 if (ky < maxky)
389 my = ky;
391 else
393 my = (ky - ny);
396 by = M_PI*vol*pme->bsp_mod[YY][ky];
398 kz = iz + local_offset[ZZ];
400 mz = kz;
402 bz = pme->bsp_mod[ZZ][kz];
404 /* 0.5 correction for corner points */
405 corner_fac = 1;
406 if (kz == 0 || kz == (nz+1)/2)
408 corner_fac = 0.5;
411 p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
413 /* We should skip the k-space point (0,0,0) */
414 /* Note that since here x is the minor index, local_offset[XX]=0 */
415 if (local_offset[XX] > 0 || ky > 0 || kz > 0)
417 kxstart = local_offset[XX];
419 else
421 kxstart = local_offset[XX] + 1;
422 p0++;
424 kxend = local_offset[XX] + local_ndata[XX];
426 if (bEnerVir)
428 /* More expensive inner loop, especially because of the storage
429 * of the mh elements in array's.
430 * Because x is the minor grid index, all mh elements
431 * depend on kx for triclinic unit cells.
434 /* Two explicit loops to avoid a conditional inside the loop */
435 for (kx = kxstart; kx < maxkx; kx++)
437 mx = kx;
439 mhxk = mx * rxx;
440 mhyk = mx * ryx + my * ryy;
441 mhzk = mx * rzx + my * rzy + mz * rzz;
442 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
443 mhx[kx] = mhxk;
444 mhy[kx] = mhyk;
445 mhz[kx] = mhzk;
446 m2[kx] = m2k;
447 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
448 tmp1[kx] = -factor*m2k;
451 for (kx = maxkx; kx < kxend; kx++)
453 mx = (kx - nx);
455 mhxk = mx * rxx;
456 mhyk = mx * ryx + my * ryy;
457 mhzk = mx * rzx + my * rzy + mz * rzz;
458 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
459 mhx[kx] = mhxk;
460 mhy[kx] = mhyk;
461 mhz[kx] = mhzk;
462 m2[kx] = m2k;
463 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
464 tmp1[kx] = -factor*m2k;
467 for (kx = kxstart; kx < kxend; kx++)
469 m2inv[kx] = 1.0/m2[kx];
472 calc_exponentials_q(kxstart, kxend, elfac,
473 ArrayRef<PME_T>(denom, denom+roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
474 ArrayRef<PME_T>(tmp1, tmp1+roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
475 ArrayRef<PME_T>(eterm, eterm+roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
477 for (kx = kxstart; kx < kxend; kx++, p0++)
479 d1 = p0->re;
480 d2 = p0->im;
482 p0->re = d1*eterm[kx];
483 p0->im = d2*eterm[kx];
485 struct2 = 2.0*(d1*d1+d2*d2);
487 tmp1[kx] = eterm[kx]*struct2;
490 for (kx = kxstart; kx < kxend; kx++)
492 ets2 = corner_fac*tmp1[kx];
493 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
494 energy += ets2;
496 ets2vf = ets2*vfactor;
497 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
498 virxy += ets2vf*mhx[kx]*mhy[kx];
499 virxz += ets2vf*mhx[kx]*mhz[kx];
500 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
501 viryz += ets2vf*mhy[kx]*mhz[kx];
502 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
505 else
507 /* We don't need to calculate the energy and the virial.
508 * In this case the triclinic overhead is small.
511 /* Two explicit loops to avoid a conditional inside the loop */
513 for (kx = kxstart; kx < maxkx; kx++)
515 mx = kx;
517 mhxk = mx * rxx;
518 mhyk = mx * ryx + my * ryy;
519 mhzk = mx * rzx + my * rzy + mz * rzz;
520 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
521 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
522 tmp1[kx] = -factor*m2k;
525 for (kx = maxkx; kx < kxend; kx++)
527 mx = (kx - nx);
529 mhxk = mx * rxx;
530 mhyk = mx * ryx + my * ryy;
531 mhzk = mx * rzx + my * rzy + mz * rzz;
532 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
533 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
534 tmp1[kx] = -factor*m2k;
537 calc_exponentials_q(kxstart, kxend, elfac,
538 ArrayRef<PME_T>(denom, denom+roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
539 ArrayRef<PME_T>(tmp1, tmp1+roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
540 ArrayRef<PME_T>(eterm, eterm+roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
543 for (kx = kxstart; kx < kxend; kx++, p0++)
545 d1 = p0->re;
546 d2 = p0->im;
548 p0->re = d1*eterm[kx];
549 p0->im = d2*eterm[kx];
554 if (bEnerVir)
556 /* Update virial with local values.
557 * The virial is symmetric by definition.
558 * this virial seems ok for isotropic scaling, but I'm
559 * experiencing problems on semiisotropic membranes.
560 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
562 work->vir_q[XX][XX] = 0.25*virxx;
563 work->vir_q[YY][YY] = 0.25*viryy;
564 work->vir_q[ZZ][ZZ] = 0.25*virzz;
565 work->vir_q[XX][YY] = work->vir_q[YY][XX] = 0.25*virxy;
566 work->vir_q[XX][ZZ] = work->vir_q[ZZ][XX] = 0.25*virxz;
567 work->vir_q[YY][ZZ] = work->vir_q[ZZ][YY] = 0.25*viryz;
569 /* This energy should be corrected for a charged system */
570 work->energy_q = 0.5*energy;
573 /* Return the loop count */
574 return local_ndata[YY]*local_ndata[XX];
577 int solve_pme_lj_yzx(const gmx_pme_t *pme, t_complex **grid, gmx_bool bLB, real vol,
578 gmx_bool bEnerVir, int nthread, int thread)
580 /* do recip sum over local cells in grid */
581 /* y major, z middle, x minor or continuous */
582 int ig, gcount;
583 int kx, ky, kz, maxkx, maxky;
584 int nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend;
585 real mx, my, mz;
586 real ewaldcoeff = pme->ewaldcoeff_lj;
587 real factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
588 real ets2, ets2vf;
589 real eterm, vterm, d1, d2, energy = 0;
590 real by, bz;
591 real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
592 real rxx, ryx, ryy, rzx, rzy, rzz;
593 real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2;
594 real mhxk, mhyk, mhzk, m2k;
595 struct pme_solve_work_t *work;
596 real corner_fac;
597 ivec complex_order;
598 ivec local_ndata, local_offset, local_size;
599 nx = pme->nkx;
600 ny = pme->nky;
601 nz = pme->nkz;
603 /* Dimensions should be identical for A/B grid, so we just use A here */
604 gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A],
605 complex_order,
606 local_ndata,
607 local_offset,
608 local_size);
609 rxx = pme->recipbox[XX][XX];
610 ryx = pme->recipbox[YY][XX];
611 ryy = pme->recipbox[YY][YY];
612 rzx = pme->recipbox[ZZ][XX];
613 rzy = pme->recipbox[ZZ][YY];
614 rzz = pme->recipbox[ZZ][ZZ];
616 maxkx = (nx+1)/2;
617 maxky = (ny+1)/2;
619 work = &pme->solve_work[thread];
620 mhx = work->mhx;
621 mhy = work->mhy;
622 mhz = work->mhz;
623 m2 = work->m2;
624 denom = work->denom;
625 tmp1 = work->tmp1;
626 tmp2 = work->tmp2;
628 iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread;
629 iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
631 for (iyz = iyz0; iyz < iyz1; iyz++)
633 iy = iyz/local_ndata[ZZ];
634 iz = iyz - iy*local_ndata[ZZ];
636 ky = iy + local_offset[YY];
638 if (ky < maxky)
640 my = ky;
642 else
644 my = (ky - ny);
647 by = 3.0*vol*pme->bsp_mod[YY][ky]
648 / (M_PI*sqrt(M_PI)*ewaldcoeff*ewaldcoeff*ewaldcoeff);
650 kz = iz + local_offset[ZZ];
652 mz = kz;
654 bz = pme->bsp_mod[ZZ][kz];
656 /* 0.5 correction for corner points */
657 corner_fac = 1;
658 if (kz == 0 || kz == (nz+1)/2)
660 corner_fac = 0.5;
663 kxstart = local_offset[XX];
664 kxend = local_offset[XX] + local_ndata[XX];
665 if (bEnerVir)
667 /* More expensive inner loop, especially because of the
668 * storage of the mh elements in array's. Because x is the
669 * minor grid index, all mh elements depend on kx for
670 * triclinic unit cells.
673 /* Two explicit loops to avoid a conditional inside the loop */
674 for (kx = kxstart; kx < maxkx; kx++)
676 mx = kx;
678 mhxk = mx * rxx;
679 mhyk = mx * ryx + my * ryy;
680 mhzk = mx * rzx + my * rzy + mz * rzz;
681 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
682 mhx[kx] = mhxk;
683 mhy[kx] = mhyk;
684 mhz[kx] = mhzk;
685 m2[kx] = m2k;
686 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
687 tmp1[kx] = -factor*m2k;
688 tmp2[kx] = sqrt(factor*m2k);
691 for (kx = maxkx; kx < kxend; kx++)
693 mx = (kx - nx);
695 mhxk = mx * rxx;
696 mhyk = mx * ryx + my * ryy;
697 mhzk = mx * rzx + my * rzy + mz * rzz;
698 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
699 mhx[kx] = mhxk;
700 mhy[kx] = mhyk;
701 mhz[kx] = mhzk;
702 m2[kx] = m2k;
703 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
704 tmp1[kx] = -factor*m2k;
705 tmp2[kx] = sqrt(factor*m2k);
707 /* Clear padding elements to avoid (harmless) fp exceptions */
708 const int kxendSimd = roundUpToMultipleOfFactor<c_simdWidth>(kxend);
709 for (; kx < kxendSimd; kx++)
711 tmp1[kx] = 0;
712 tmp2[kx] = 0;
715 calc_exponentials_lj(kxstart, kxend,
716 ArrayRef<PME_T>(tmp1, tmp1+roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
717 ArrayRef<PME_T>(tmp2, tmp2+roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
718 ArrayRef<PME_T>(denom, denom+roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
720 for (kx = kxstart; kx < kxend; kx++)
722 m2k = factor*m2[kx];
723 eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
724 + 2.0*m2k*tmp2[kx]);
725 vterm = 3.0*(-tmp1[kx] + tmp2[kx]);
726 tmp1[kx] = eterm*denom[kx];
727 tmp2[kx] = vterm*denom[kx];
730 if (!bLB)
732 t_complex *p0;
733 real struct2;
735 p0 = grid[0] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
736 for (kx = kxstart; kx < kxend; kx++, p0++)
738 d1 = p0->re;
739 d2 = p0->im;
741 eterm = tmp1[kx];
742 vterm = tmp2[kx];
743 p0->re = d1*eterm;
744 p0->im = d2*eterm;
746 struct2 = 2.0*(d1*d1+d2*d2);
748 tmp1[kx] = eterm*struct2;
749 tmp2[kx] = vterm*struct2;
752 else
754 real *struct2 = denom;
755 real str2;
757 for (kx = kxstart; kx < kxend; kx++)
759 struct2[kx] = 0.0;
761 /* Due to symmetry we only need to calculate 4 of the 7 terms */
762 for (ig = 0; ig <= 3; ++ig)
764 t_complex *p0, *p1;
765 real scale;
767 p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
768 p1 = grid[6-ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
769 scale = 2.0*lb_scale_factor_symm[ig];
770 for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1)
772 struct2[kx] += scale*(p0->re*p1->re + p0->im*p1->im);
776 for (ig = 0; ig <= 6; ++ig)
778 t_complex *p0;
780 p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
781 for (kx = kxstart; kx < kxend; kx++, p0++)
783 d1 = p0->re;
784 d2 = p0->im;
786 eterm = tmp1[kx];
787 p0->re = d1*eterm;
788 p0->im = d2*eterm;
791 for (kx = kxstart; kx < kxend; kx++)
793 eterm = tmp1[kx];
794 vterm = tmp2[kx];
795 str2 = struct2[kx];
796 tmp1[kx] = eterm*str2;
797 tmp2[kx] = vterm*str2;
801 for (kx = kxstart; kx < kxend; kx++)
803 ets2 = corner_fac*tmp1[kx];
804 vterm = 2.0*factor*tmp2[kx];
805 energy += ets2;
806 ets2vf = corner_fac*vterm;
807 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
808 virxy += ets2vf*mhx[kx]*mhy[kx];
809 virxz += ets2vf*mhx[kx]*mhz[kx];
810 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
811 viryz += ets2vf*mhy[kx]*mhz[kx];
812 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
815 else
817 /* We don't need to calculate the energy and the virial.
818 * In this case the triclinic overhead is small.
821 /* Two explicit loops to avoid a conditional inside the loop */
823 for (kx = kxstart; kx < maxkx; kx++)
825 mx = kx;
827 mhxk = mx * rxx;
828 mhyk = mx * ryx + my * ryy;
829 mhzk = mx * rzx + my * rzy + mz * rzz;
830 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
831 m2[kx] = m2k;
832 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
833 tmp1[kx] = -factor*m2k;
834 tmp2[kx] = sqrt(factor*m2k);
837 for (kx = maxkx; kx < kxend; kx++)
839 mx = (kx - nx);
841 mhxk = mx * rxx;
842 mhyk = mx * ryx + my * ryy;
843 mhzk = mx * rzx + my * rzy + mz * rzz;
844 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
845 m2[kx] = m2k;
846 denom[kx] = bz*by*pme->bsp_mod[XX][kx];
847 tmp1[kx] = -factor*m2k;
848 tmp2[kx] = sqrt(factor*m2k);
850 /* Clear padding elements to avoid (harmless) fp exceptions */
851 const int kxendSimd = roundUpToMultipleOfFactor<c_simdWidth>(kxend);
852 for (; kx < kxendSimd; kx++)
854 tmp1[kx] = 0;
855 tmp2[kx] = 0;
858 calc_exponentials_lj(kxstart, kxend,
859 ArrayRef<PME_T>(tmp1, tmp1+roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
860 ArrayRef<PME_T>(tmp2, tmp2+roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
861 ArrayRef<PME_T>(denom, denom+roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
863 for (kx = kxstart; kx < kxend; kx++)
865 m2k = factor*m2[kx];
866 eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
867 + 2.0*m2k*tmp2[kx]);
868 tmp1[kx] = eterm*denom[kx];
870 gcount = (bLB ? 7 : 1);
871 for (ig = 0; ig < gcount; ++ig)
873 t_complex *p0;
875 p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
876 for (kx = kxstart; kx < kxend; kx++, p0++)
878 d1 = p0->re;
879 d2 = p0->im;
881 eterm = tmp1[kx];
883 p0->re = d1*eterm;
884 p0->im = d2*eterm;
889 if (bEnerVir)
891 work->vir_lj[XX][XX] = 0.25*virxx;
892 work->vir_lj[YY][YY] = 0.25*viryy;
893 work->vir_lj[ZZ][ZZ] = 0.25*virzz;
894 work->vir_lj[XX][YY] = work->vir_lj[YY][XX] = 0.25*virxy;
895 work->vir_lj[XX][ZZ] = work->vir_lj[ZZ][XX] = 0.25*virxz;
896 work->vir_lj[YY][ZZ] = work->vir_lj[ZZ][YY] = 0.25*viryz;
898 /* This energy should be corrected for a charged system */
899 work->energy_lj = 0.5*energy;
901 /* Return the loop count */
902 return local_ndata[YY]*local_ndata[XX];