Split lines with many copyright years
[gromacs.git] / src / gromacs / ewald / pme_solve.cpp
blobaa6a69670eebb42984002987565758204e5dde5f
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
8 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9 * and including many others, as listed in the AUTHORS file in the
10 * top-level source directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
39 #include "gmxpre.h"
41 #include "pme_solve.h"
43 #include <cmath>
45 #include "gromacs/fft/parallel_3dfft.h"
46 #include "gromacs/math/units.h"
47 #include "gromacs/math/utilities.h"
48 #include "gromacs/math/vec.h"
49 #include "gromacs/simd/simd.h"
50 #include "gromacs/simd/simd_math.h"
51 #include "gromacs/utility/arrayref.h"
52 #include "gromacs/utility/exceptions.h"
53 #include "gromacs/utility/smalloc.h"
55 #include "pme_internal.h"
57 #if GMX_SIMD_HAVE_REAL
58 /* Turn on arbitrary width SIMD intrinsics for PME solve */
59 # define PME_SIMD_SOLVE
60 #endif
62 using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
64 struct pme_solve_work_t
66 /* work data for solve_pme */
67 int nalloc;
68 real* mhx;
69 real* mhy;
70 real* mhz;
71 real* m2;
72 real* denom;
73 real* tmp1_alloc;
74 real* tmp1;
75 real* tmp2;
76 real* eterm;
77 real* m2inv;
79 real energy_q;
80 matrix vir_q;
81 real energy_lj;
82 matrix vir_lj;
85 #ifdef PME_SIMD_SOLVE
86 constexpr int c_simdWidth = GMX_SIMD_REAL_WIDTH;
87 #else
88 /* We can use any alignment > 0, so we use 4 */
89 constexpr int c_simdWidth = 4;
90 #endif
92 /* Returns the smallest number >= \p that is a multiple of \p factor, \p factor must be a power of 2 */
93 template<unsigned int factor>
94 static size_t roundUpToMultipleOfFactor(size_t number)
96 static_assert(factor > 0 && (factor & (factor - 1)) == 0,
97 "factor should be >0 and a power of 2");
99 /* We need to add a most factor-1 and because factor is a power of 2,
100 * we get the result by masking out the bits corresponding to factor-1.
102 return (number + factor - 1) & ~(factor - 1);
105 /* Allocate an aligned pointer for SIMD operations, including extra elements
106 * at the end for padding.
108 /* TODO: Replace this SIMD reallocator with a general, C++ solution */
109 static void reallocSimdAlignedAndPadded(real** ptr, int unpaddedNumElements)
111 sfree_aligned(*ptr);
112 snew_aligned(*ptr, roundUpToMultipleOfFactor<c_simdWidth>(unpaddedNumElements),
113 c_simdWidth * sizeof(real));
116 static void realloc_work(struct pme_solve_work_t* work, int nkx)
118 if (nkx > work->nalloc)
120 work->nalloc = nkx;
121 srenew(work->mhx, work->nalloc);
122 srenew(work->mhy, work->nalloc);
123 srenew(work->mhz, work->nalloc);
124 srenew(work->m2, work->nalloc);
125 reallocSimdAlignedAndPadded(&work->denom, work->nalloc);
126 reallocSimdAlignedAndPadded(&work->tmp1, work->nalloc);
127 reallocSimdAlignedAndPadded(&work->tmp2, work->nalloc);
128 reallocSimdAlignedAndPadded(&work->eterm, work->nalloc);
129 srenew(work->m2inv, work->nalloc);
131 /* Init all allocated elements of denom to 1 to avoid 1/0 exceptions
132 * of simd padded elements.
134 for (size_t i = 0; i < roundUpToMultipleOfFactor<c_simdWidth>(work->nalloc); i++)
136 work->denom[i] = 1;
141 void pme_init_all_work(struct pme_solve_work_t** work, int nthread, int nkx)
143 /* Use fft5d, order after FFT is y major, z, x minor */
145 snew(*work, nthread);
146 /* Allocate the work arrays thread local to optimize memory access */
147 #pragma omp parallel for num_threads(nthread) schedule(static)
148 for (int thread = 0; thread < nthread; thread++)
152 realloc_work(&((*work)[thread]), nkx);
154 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
158 static void free_work(struct pme_solve_work_t* work)
160 if (work)
162 sfree(work->mhx);
163 sfree(work->mhy);
164 sfree(work->mhz);
165 sfree(work->m2);
166 sfree_aligned(work->denom);
167 sfree_aligned(work->tmp1);
168 sfree_aligned(work->tmp2);
169 sfree_aligned(work->eterm);
170 sfree(work->m2inv);
174 void pme_free_all_work(struct pme_solve_work_t** work, int nthread)
176 if (*work)
178 for (int thread = 0; thread < nthread; thread++)
180 free_work(&(*work)[thread]);
183 sfree(*work);
184 *work = nullptr;
187 void get_pme_ener_vir_q(pme_solve_work_t* work, int nthread, PmeOutput* output)
189 GMX_ASSERT(output != nullptr, "Need valid output buffer");
190 /* This function sums output over threads and should therefore
191 * only be called after thread synchronization.
193 output->coulombEnergy_ = work[0].energy_q;
194 copy_mat(work[0].vir_q, output->coulombVirial_);
196 for (int thread = 1; thread < nthread; thread++)
198 output->coulombEnergy_ += work[thread].energy_q;
199 m_add(output->coulombVirial_, work[thread].vir_q, output->coulombVirial_);
203 void get_pme_ener_vir_lj(pme_solve_work_t* work, int nthread, PmeOutput* output)
205 GMX_ASSERT(output != nullptr, "Need valid output buffer");
206 /* This function sums output over threads and should therefore
207 * only be called after thread synchronization.
209 output->lennardJonesEnergy_ = work[0].energy_lj;
210 copy_mat(work[0].vir_lj, output->lennardJonesVirial_);
212 for (int thread = 1; thread < nthread; thread++)
214 output->lennardJonesEnergy_ += work[thread].energy_lj;
215 m_add(output->lennardJonesVirial_, work[thread].vir_lj, output->lennardJonesVirial_);
219 #if defined PME_SIMD_SOLVE
220 /* Calculate exponentials through SIMD */
221 inline static void calc_exponentials_q(int /*unused*/,
222 int /*unused*/,
223 real f,
224 ArrayRef<const SimdReal> d_aligned,
225 ArrayRef<const SimdReal> r_aligned,
226 ArrayRef<SimdReal> e_aligned)
229 SimdReal f_simd(f);
230 SimdReal tmp_d1, tmp_r, tmp_e;
232 /* We only need to calculate from start. But since start is 0 or 1
233 * and we want to use aligned loads/stores, we always start from 0.
235 GMX_ASSERT(d_aligned.size() == r_aligned.size(), "d and r must have same size");
236 GMX_ASSERT(d_aligned.size() == e_aligned.size(), "d and e must have same size");
237 for (size_t kx = 0; kx != d_aligned.size(); ++kx)
239 tmp_d1 = d_aligned[kx];
240 tmp_r = r_aligned[kx];
241 tmp_r = gmx::exp(tmp_r);
242 tmp_e = f_simd / tmp_d1;
243 tmp_e = tmp_e * tmp_r;
244 e_aligned[kx] = tmp_e;
248 #else
249 inline static void
250 calc_exponentials_q(int start, int end, real f, ArrayRef<real> d, ArrayRef<real> r, ArrayRef<real> e)
252 GMX_ASSERT(d.size() == r.size(), "d and r must have same size");
253 GMX_ASSERT(d.size() == e.size(), "d and e must have same size");
254 int kx;
255 for (kx = start; kx < end; kx++)
257 d[kx] = 1.0 / d[kx];
259 for (kx = start; kx < end; kx++)
261 r[kx] = std::exp(r[kx]);
263 for (kx = start; kx < end; kx++)
265 e[kx] = f * r[kx] * d[kx];
268 #endif
270 #if defined PME_SIMD_SOLVE
271 /* Calculate exponentials through SIMD */
272 inline static void calc_exponentials_lj(int /*unused*/,
273 int /*unused*/,
274 ArrayRef<SimdReal> r_aligned,
275 ArrayRef<SimdReal> factor_aligned,
276 ArrayRef<SimdReal> d_aligned)
278 SimdReal tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
279 const SimdReal sqr_PI = sqrt(SimdReal(M_PI));
281 GMX_ASSERT(d_aligned.size() == r_aligned.size(), "d and r must have same size");
282 GMX_ASSERT(d_aligned.size() == factor_aligned.size(), "d and factor must have same size");
283 for (size_t kx = 0; kx != d_aligned.size(); ++kx)
285 /* We only need to calculate from start. But since start is 0 or 1
286 * and we want to use aligned loads/stores, we always start from 0.
288 tmp_d = d_aligned[kx];
289 d_inv = SimdReal(1.0) / tmp_d;
290 d_aligned[kx] = d_inv;
291 tmp_r = r_aligned[kx];
292 tmp_r = gmx::exp(tmp_r);
293 r_aligned[kx] = tmp_r;
294 tmp_mk = factor_aligned[kx];
295 tmp_fac = sqr_PI * tmp_mk * erfc(tmp_mk);
296 factor_aligned[kx] = tmp_fac;
299 #else
300 inline static void
301 calc_exponentials_lj(int start, int end, ArrayRef<real> r, ArrayRef<real> tmp2, ArrayRef<real> d)
303 int kx;
304 real mk;
305 GMX_ASSERT(d.size() == r.size(), "d and r must have same size");
306 GMX_ASSERT(d.size() == tmp2.size(), "d and tmp2 must have same size");
307 for (kx = start; kx < end; kx++)
309 d[kx] = 1.0 / d[kx];
312 for (kx = start; kx < end; kx++)
314 r[kx] = std::exp(r[kx]);
317 for (kx = start; kx < end; kx++)
319 mk = tmp2[kx];
320 tmp2[kx] = sqrt(M_PI) * mk * std::erfc(mk);
323 #endif
325 #if defined PME_SIMD_SOLVE
326 using PME_T = SimdReal;
327 #else
328 using PME_T = real;
329 #endif
331 int solve_pme_yzx(const gmx_pme_t* pme, t_complex* grid, real vol, gmx_bool bEnerVir, int nthread, int thread)
333 /* do recip sum over local cells in grid */
334 /* y major, z middle, x minor or continuous */
335 t_complex* p0;
336 int kx, ky, kz, maxkx, maxky;
337 int nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
338 real mx, my, mz;
339 real ewaldcoeff = pme->ewaldcoeff_q;
340 real factor = M_PI * M_PI / (ewaldcoeff * ewaldcoeff);
341 real ets2, struct2, vfactor, ets2vf;
342 real d1, d2, energy = 0;
343 real by, bz;
344 real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
345 real rxx, ryx, ryy, rzx, rzy, rzz;
346 struct pme_solve_work_t* work;
347 real * mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
348 real mhxk, mhyk, mhzk, m2k;
349 real corner_fac;
350 ivec complex_order;
351 ivec local_ndata, local_offset, local_size;
352 real elfac;
354 elfac = ONE_4PI_EPS0 / pme->epsilon_r;
356 nx = pme->nkx;
357 ny = pme->nky;
358 nz = pme->nkz;
360 /* Dimensions should be identical for A/B grid, so we just use A here */
361 gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA], complex_order, local_ndata,
362 local_offset, local_size);
364 rxx = pme->recipbox[XX][XX];
365 ryx = pme->recipbox[YY][XX];
366 ryy = pme->recipbox[YY][YY];
367 rzx = pme->recipbox[ZZ][XX];
368 rzy = pme->recipbox[ZZ][YY];
369 rzz = pme->recipbox[ZZ][ZZ];
371 GMX_ASSERT(rxx != 0.0, "Someone broke the reciprocal box again");
373 maxkx = (nx + 1) / 2;
374 maxky = (ny + 1) / 2;
376 work = &pme->solve_work[thread];
377 mhx = work->mhx;
378 mhy = work->mhy;
379 mhz = work->mhz;
380 m2 = work->m2;
381 denom = work->denom;
382 tmp1 = work->tmp1;
383 eterm = work->eterm;
384 m2inv = work->m2inv;
386 iyz0 = local_ndata[YY] * local_ndata[ZZ] * thread / nthread;
387 iyz1 = local_ndata[YY] * local_ndata[ZZ] * (thread + 1) / nthread;
389 for (iyz = iyz0; iyz < iyz1; iyz++)
391 iy = iyz / local_ndata[ZZ];
392 iz = iyz - iy * local_ndata[ZZ];
394 ky = iy + local_offset[YY];
396 if (ky < maxky)
398 my = ky;
400 else
402 my = (ky - ny);
405 by = M_PI * vol * pme->bsp_mod[YY][ky];
407 kz = iz + local_offset[ZZ];
409 mz = kz;
411 bz = pme->bsp_mod[ZZ][kz];
413 /* 0.5 correction for corner points */
414 corner_fac = 1;
415 if (kz == 0 || kz == (nz + 1) / 2)
417 corner_fac = 0.5;
420 p0 = grid + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
422 /* We should skip the k-space point (0,0,0) */
423 /* Note that since here x is the minor index, local_offset[XX]=0 */
424 if (local_offset[XX] > 0 || ky > 0 || kz > 0)
426 kxstart = local_offset[XX];
428 else
430 kxstart = local_offset[XX] + 1;
431 p0++;
433 kxend = local_offset[XX] + local_ndata[XX];
435 if (bEnerVir)
437 /* More expensive inner loop, especially because of the storage
438 * of the mh elements in array's.
439 * Because x is the minor grid index, all mh elements
440 * depend on kx for triclinic unit cells.
443 /* Two explicit loops to avoid a conditional inside the loop */
444 for (kx = kxstart; kx < maxkx; kx++)
446 mx = kx;
448 mhxk = mx * rxx;
449 mhyk = mx * ryx + my * ryy;
450 mhzk = mx * rzx + my * rzy + mz * rzz;
451 m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
452 mhx[kx] = mhxk;
453 mhy[kx] = mhyk;
454 mhz[kx] = mhzk;
455 m2[kx] = m2k;
456 denom[kx] = m2k * bz * by * pme->bsp_mod[XX][kx];
457 tmp1[kx] = -factor * m2k;
460 for (kx = maxkx; kx < kxend; kx++)
462 mx = (kx - nx);
464 mhxk = mx * rxx;
465 mhyk = mx * ryx + my * ryy;
466 mhzk = mx * rzx + my * rzy + mz * rzz;
467 m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
468 mhx[kx] = mhxk;
469 mhy[kx] = mhyk;
470 mhz[kx] = mhzk;
471 m2[kx] = m2k;
472 denom[kx] = m2k * bz * by * pme->bsp_mod[XX][kx];
473 tmp1[kx] = -factor * m2k;
476 for (kx = kxstart; kx < kxend; kx++)
478 m2inv[kx] = 1.0 / m2[kx];
481 calc_exponentials_q(
482 kxstart, kxend, elfac,
483 ArrayRef<PME_T>(denom, denom + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
484 ArrayRef<PME_T>(tmp1, tmp1 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
485 ArrayRef<PME_T>(eterm, eterm + roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
487 for (kx = kxstart; kx < kxend; kx++, p0++)
489 d1 = p0->re;
490 d2 = p0->im;
492 p0->re = d1 * eterm[kx];
493 p0->im = d2 * eterm[kx];
495 struct2 = 2.0 * (d1 * d1 + d2 * d2);
497 tmp1[kx] = eterm[kx] * struct2;
500 for (kx = kxstart; kx < kxend; kx++)
502 ets2 = corner_fac * tmp1[kx];
503 vfactor = (factor * m2[kx] + 1.0) * 2.0 * m2inv[kx];
504 energy += ets2;
506 ets2vf = ets2 * vfactor;
507 virxx += ets2vf * mhx[kx] * mhx[kx] - ets2;
508 virxy += ets2vf * mhx[kx] * mhy[kx];
509 virxz += ets2vf * mhx[kx] * mhz[kx];
510 viryy += ets2vf * mhy[kx] * mhy[kx] - ets2;
511 viryz += ets2vf * mhy[kx] * mhz[kx];
512 virzz += ets2vf * mhz[kx] * mhz[kx] - ets2;
515 else
517 /* We don't need to calculate the energy and the virial.
518 * In this case the triclinic overhead is small.
521 /* Two explicit loops to avoid a conditional inside the loop */
523 for (kx = kxstart; kx < maxkx; kx++)
525 mx = kx;
527 mhxk = mx * rxx;
528 mhyk = mx * ryx + my * ryy;
529 mhzk = mx * rzx + my * rzy + mz * rzz;
530 m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
531 denom[kx] = m2k * bz * by * pme->bsp_mod[XX][kx];
532 tmp1[kx] = -factor * m2k;
535 for (kx = maxkx; kx < kxend; kx++)
537 mx = (kx - nx);
539 mhxk = mx * rxx;
540 mhyk = mx * ryx + my * ryy;
541 mhzk = mx * rzx + my * rzy + mz * rzz;
542 m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
543 denom[kx] = m2k * bz * by * pme->bsp_mod[XX][kx];
544 tmp1[kx] = -factor * m2k;
547 calc_exponentials_q(
548 kxstart, kxend, elfac,
549 ArrayRef<PME_T>(denom, denom + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
550 ArrayRef<PME_T>(tmp1, tmp1 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
551 ArrayRef<PME_T>(eterm, eterm + roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
554 for (kx = kxstart; kx < kxend; kx++, p0++)
556 d1 = p0->re;
557 d2 = p0->im;
559 p0->re = d1 * eterm[kx];
560 p0->im = d2 * eterm[kx];
565 if (bEnerVir)
567 /* Update virial with local values.
568 * The virial is symmetric by definition.
569 * this virial seems ok for isotropic scaling, but I'm
570 * experiencing problems on semiisotropic membranes.
571 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
573 work->vir_q[XX][XX] = 0.25 * virxx;
574 work->vir_q[YY][YY] = 0.25 * viryy;
575 work->vir_q[ZZ][ZZ] = 0.25 * virzz;
576 work->vir_q[XX][YY] = work->vir_q[YY][XX] = 0.25 * virxy;
577 work->vir_q[XX][ZZ] = work->vir_q[ZZ][XX] = 0.25 * virxz;
578 work->vir_q[YY][ZZ] = work->vir_q[ZZ][YY] = 0.25 * viryz;
580 /* This energy should be corrected for a charged system */
581 work->energy_q = 0.5 * energy;
584 /* Return the loop count */
585 return local_ndata[YY] * local_ndata[XX];
588 int solve_pme_lj_yzx(const gmx_pme_t* pme, t_complex** grid, gmx_bool bLB, real vol, gmx_bool bEnerVir, int nthread, int thread)
590 /* do recip sum over local cells in grid */
591 /* y major, z middle, x minor or continuous */
592 int ig, gcount;
593 int kx, ky, kz, maxkx, maxky;
594 int nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend;
595 real mx, my, mz;
596 real ewaldcoeff = pme->ewaldcoeff_lj;
597 real factor = M_PI * M_PI / (ewaldcoeff * ewaldcoeff);
598 real ets2, ets2vf;
599 real eterm, vterm, d1, d2, energy = 0;
600 real by, bz;
601 real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
602 real rxx, ryx, ryy, rzx, rzy, rzz;
603 real * mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2;
604 real mhxk, mhyk, mhzk, m2k;
605 struct pme_solve_work_t* work;
606 real corner_fac;
607 ivec complex_order;
608 ivec local_ndata, local_offset, local_size;
609 nx = pme->nkx;
610 ny = pme->nky;
611 nz = pme->nkz;
613 /* Dimensions should be identical for A/B grid, so we just use A here */
614 gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A], complex_order, local_ndata,
615 local_offset, local_size);
616 rxx = pme->recipbox[XX][XX];
617 ryx = pme->recipbox[YY][XX];
618 ryy = pme->recipbox[YY][YY];
619 rzx = pme->recipbox[ZZ][XX];
620 rzy = pme->recipbox[ZZ][YY];
621 rzz = pme->recipbox[ZZ][ZZ];
623 maxkx = (nx + 1) / 2;
624 maxky = (ny + 1) / 2;
626 work = &pme->solve_work[thread];
627 mhx = work->mhx;
628 mhy = work->mhy;
629 mhz = work->mhz;
630 m2 = work->m2;
631 denom = work->denom;
632 tmp1 = work->tmp1;
633 tmp2 = work->tmp2;
635 iyz0 = local_ndata[YY] * local_ndata[ZZ] * thread / nthread;
636 iyz1 = local_ndata[YY] * local_ndata[ZZ] * (thread + 1) / nthread;
638 for (iyz = iyz0; iyz < iyz1; iyz++)
640 iy = iyz / local_ndata[ZZ];
641 iz = iyz - iy * local_ndata[ZZ];
643 ky = iy + local_offset[YY];
645 if (ky < maxky)
647 my = ky;
649 else
651 my = (ky - ny);
654 by = 3.0 * vol * pme->bsp_mod[YY][ky] / (M_PI * sqrt(M_PI) * ewaldcoeff * ewaldcoeff * ewaldcoeff);
656 kz = iz + local_offset[ZZ];
658 mz = kz;
660 bz = pme->bsp_mod[ZZ][kz];
662 /* 0.5 correction for corner points */
663 corner_fac = 1;
664 if (kz == 0 || kz == (nz + 1) / 2)
666 corner_fac = 0.5;
669 kxstart = local_offset[XX];
670 kxend = local_offset[XX] + local_ndata[XX];
671 if (bEnerVir)
673 /* More expensive inner loop, especially because of the
674 * storage of the mh elements in array's. Because x is the
675 * minor grid index, all mh elements depend on kx for
676 * triclinic unit cells.
679 /* Two explicit loops to avoid a conditional inside the loop */
680 for (kx = kxstart; kx < maxkx; kx++)
682 mx = kx;
684 mhxk = mx * rxx;
685 mhyk = mx * ryx + my * ryy;
686 mhzk = mx * rzx + my * rzy + mz * rzz;
687 m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
688 mhx[kx] = mhxk;
689 mhy[kx] = mhyk;
690 mhz[kx] = mhzk;
691 m2[kx] = m2k;
692 denom[kx] = bz * by * pme->bsp_mod[XX][kx];
693 tmp1[kx] = -factor * m2k;
694 tmp2[kx] = sqrt(factor * m2k);
697 for (kx = maxkx; kx < kxend; kx++)
699 mx = (kx - nx);
701 mhxk = mx * rxx;
702 mhyk = mx * ryx + my * ryy;
703 mhzk = mx * rzx + my * rzy + mz * rzz;
704 m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
705 mhx[kx] = mhxk;
706 mhy[kx] = mhyk;
707 mhz[kx] = mhzk;
708 m2[kx] = m2k;
709 denom[kx] = bz * by * pme->bsp_mod[XX][kx];
710 tmp1[kx] = -factor * m2k;
711 tmp2[kx] = sqrt(factor * m2k);
713 /* Clear padding elements to avoid (harmless) fp exceptions */
714 const int kxendSimd = roundUpToMultipleOfFactor<c_simdWidth>(kxend);
715 for (; kx < kxendSimd; kx++)
717 tmp1[kx] = 0;
718 tmp2[kx] = 0;
721 calc_exponentials_lj(
722 kxstart, kxend,
723 ArrayRef<PME_T>(tmp1, tmp1 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
724 ArrayRef<PME_T>(tmp2, tmp2 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
725 ArrayRef<PME_T>(denom, denom + roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
727 for (kx = kxstart; kx < kxend; kx++)
729 m2k = factor * m2[kx];
730 eterm = -((1.0 - 2.0 * m2k) * tmp1[kx] + 2.0 * m2k * tmp2[kx]);
731 vterm = 3.0 * (-tmp1[kx] + tmp2[kx]);
732 tmp1[kx] = eterm * denom[kx];
733 tmp2[kx] = vterm * denom[kx];
736 if (!bLB)
738 t_complex* p0;
739 real struct2;
741 p0 = grid[0] + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
742 for (kx = kxstart; kx < kxend; kx++, p0++)
744 d1 = p0->re;
745 d2 = p0->im;
747 eterm = tmp1[kx];
748 vterm = tmp2[kx];
749 p0->re = d1 * eterm;
750 p0->im = d2 * eterm;
752 struct2 = 2.0 * (d1 * d1 + d2 * d2);
754 tmp1[kx] = eterm * struct2;
755 tmp2[kx] = vterm * struct2;
758 else
760 real* struct2 = denom;
761 real str2;
763 for (kx = kxstart; kx < kxend; kx++)
765 struct2[kx] = 0.0;
767 /* Due to symmetry we only need to calculate 4 of the 7 terms */
768 for (ig = 0; ig <= 3; ++ig)
770 t_complex *p0, *p1;
771 real scale;
773 p0 = grid[ig] + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
774 p1 = grid[6 - ig] + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
775 scale = 2.0 * lb_scale_factor_symm[ig];
776 for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1)
778 struct2[kx] += scale * (p0->re * p1->re + p0->im * p1->im);
781 for (ig = 0; ig <= 6; ++ig)
783 t_complex* p0;
785 p0 = grid[ig] + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
786 for (kx = kxstart; kx < kxend; kx++, p0++)
788 d1 = p0->re;
789 d2 = p0->im;
791 eterm = tmp1[kx];
792 p0->re = d1 * eterm;
793 p0->im = d2 * eterm;
796 for (kx = kxstart; kx < kxend; kx++)
798 eterm = tmp1[kx];
799 vterm = tmp2[kx];
800 str2 = struct2[kx];
801 tmp1[kx] = eterm * str2;
802 tmp2[kx] = vterm * str2;
806 for (kx = kxstart; kx < kxend; kx++)
808 ets2 = corner_fac * tmp1[kx];
809 vterm = 2.0 * factor * tmp2[kx];
810 energy += ets2;
811 ets2vf = corner_fac * vterm;
812 virxx += ets2vf * mhx[kx] * mhx[kx] - ets2;
813 virxy += ets2vf * mhx[kx] * mhy[kx];
814 virxz += ets2vf * mhx[kx] * mhz[kx];
815 viryy += ets2vf * mhy[kx] * mhy[kx] - ets2;
816 viryz += ets2vf * mhy[kx] * mhz[kx];
817 virzz += ets2vf * mhz[kx] * mhz[kx] - ets2;
820 else
822 /* We don't need to calculate the energy and the virial.
823 * In this case the triclinic overhead is small.
826 /* Two explicit loops to avoid a conditional inside the loop */
828 for (kx = kxstart; kx < maxkx; kx++)
830 mx = kx;
832 mhxk = mx * rxx;
833 mhyk = mx * ryx + my * ryy;
834 mhzk = mx * rzx + my * rzy + mz * rzz;
835 m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
836 m2[kx] = m2k;
837 denom[kx] = bz * by * pme->bsp_mod[XX][kx];
838 tmp1[kx] = -factor * m2k;
839 tmp2[kx] = sqrt(factor * m2k);
842 for (kx = maxkx; kx < kxend; kx++)
844 mx = (kx - nx);
846 mhxk = mx * rxx;
847 mhyk = mx * ryx + my * ryy;
848 mhzk = mx * rzx + my * rzy + mz * rzz;
849 m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
850 m2[kx] = m2k;
851 denom[kx] = bz * by * pme->bsp_mod[XX][kx];
852 tmp1[kx] = -factor * m2k;
853 tmp2[kx] = sqrt(factor * m2k);
855 /* Clear padding elements to avoid (harmless) fp exceptions */
856 const int kxendSimd = roundUpToMultipleOfFactor<c_simdWidth>(kxend);
857 for (; kx < kxendSimd; kx++)
859 tmp1[kx] = 0;
860 tmp2[kx] = 0;
863 calc_exponentials_lj(
864 kxstart, kxend,
865 ArrayRef<PME_T>(tmp1, tmp1 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
866 ArrayRef<PME_T>(tmp2, tmp2 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
867 ArrayRef<PME_T>(denom, denom + roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
869 for (kx = kxstart; kx < kxend; kx++)
871 m2k = factor * m2[kx];
872 eterm = -((1.0 - 2.0 * m2k) * tmp1[kx] + 2.0 * m2k * tmp2[kx]);
873 tmp1[kx] = eterm * denom[kx];
875 gcount = (bLB ? 7 : 1);
876 for (ig = 0; ig < gcount; ++ig)
878 t_complex* p0;
880 p0 = grid[ig] + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
881 for (kx = kxstart; kx < kxend; kx++, p0++)
883 d1 = p0->re;
884 d2 = p0->im;
886 eterm = tmp1[kx];
888 p0->re = d1 * eterm;
889 p0->im = d2 * eterm;
894 if (bEnerVir)
896 work->vir_lj[XX][XX] = 0.25 * virxx;
897 work->vir_lj[YY][YY] = 0.25 * viryy;
898 work->vir_lj[ZZ][ZZ] = 0.25 * virzz;
899 work->vir_lj[XX][YY] = work->vir_lj[YY][XX] = 0.25 * virxy;
900 work->vir_lj[XX][ZZ] = work->vir_lj[ZZ][XX] = 0.25 * virxz;
901 work->vir_lj[YY][ZZ] = work->vir_lj[ZZ][YY] = 0.25 * viryz;
903 /* This energy should be corrected for a charged system */
904 work->energy_lj = 0.5 * energy;
906 /* Return the loop count */
907 return local_ndata[YY] * local_ndata[XX];