/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "pme_solve.h"

#include <cmath>

#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/math/units.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/simd/simd.h"
#include "gromacs/simd/simd_math.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/smalloc.h"

#include "pme_internal.h"
#if GMX_SIMD_HAVE_REAL
/* Turn on arbitrary width SIMD intrinsics for PME solve */
#    define PME_SIMD_SOLVE
#endif

using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
struct pme_solve_work_t
{
    /* work data for solve_pme */
    int    nalloc;
    real  *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2, *eterm, *m2inv;
    real   energy_q, energy_lj;
    matrix vir_q, vir_lj;
};
#if defined PME_SIMD_SOLVE
constexpr int c_simdWidth = GMX_SIMD_REAL_WIDTH;
#else
/* We can use any alignment > 0, so we use 4 */
constexpr int c_simdWidth = 4;
#endif
/* Returns the smallest number >= \p number that is a multiple of \p factor; \p factor must be a power of 2 */
template<unsigned int factor>
static size_t roundUpToMultipleOfFactor(size_t number)
{
    static_assert(factor > 0 && (factor & (factor - 1)) == 0, "factor should be >0 and a power of 2");

    /* We need to add at most factor-1 and because factor is a power of 2,
     * we get the result by masking out the bits corresponding to factor-1.
     */
    return (number + factor - 1) & ~(factor - 1);
}
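/* Worked example (illustration, not from the original source): with
 * factor = 4 the mask ~(factor - 1) clears the two low bits, so
 *   roundUpToMultipleOfFactor<4>(5) == (5 + 3) & ~3 == 8
 *   roundUpToMultipleOfFactor<4>(8) == (8 + 3) & ~3 == 8
 * i.e. values already on a multiple of the SIMD width are unchanged.
 */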
/* Allocate an aligned pointer for SIMD operations, including extra elements
 * at the end for padding.
 */
/* TODO: Replace this SIMD reallocator with a general, C++ solution */
static void reallocSimdAlignedAndPadded(real** ptr, int unpaddedNumElements)
{
    sfree_aligned(*ptr);
    snew_aligned(*ptr, roundUpToMultipleOfFactor<c_simdWidth>(unpaddedNumElements),
                 c_simdWidth * sizeof(real));
}
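/* Note (illustration, not from the original source): the padding means that
 * a request for, say, 10 reals with c_simdWidth = 4 actually allocates 12,
 * so full-width aligned SIMD loads/stores that run past the last used
 * element still stay inside the allocation.
 */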
static void realloc_work(struct pme_solve_work_t* work, int nkx)
{
    if (nkx > work->nalloc)
    {
        work->nalloc = nkx;
        srenew(work->mhx, work->nalloc);
        srenew(work->mhy, work->nalloc);
        srenew(work->mhz, work->nalloc);
        srenew(work->m2, work->nalloc);
        reallocSimdAlignedAndPadded(&work->denom, work->nalloc);
        reallocSimdAlignedAndPadded(&work->tmp1, work->nalloc);
        reallocSimdAlignedAndPadded(&work->tmp2, work->nalloc);
        reallocSimdAlignedAndPadded(&work->eterm, work->nalloc);
        srenew(work->m2inv, work->nalloc);

        /* Init all allocated elements of denom to 1 to avoid 1/0 exceptions
         * of simd padded elements.
         */
        for (size_t i = 0; i < roundUpToMultipleOfFactor<c_simdWidth>(work->nalloc); i++)
        {
            work->denom[i] = 1;
        }
    }
}
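/* Why denom is seeded with 1 (illustration, not from the original source):
 * the solve loops divide by denom over whole SIMD vectors, so the padded
 * lanes beyond nkx are divided too. Initializing them to 1 instead of 0
 * keeps those dead lanes from raising divide-by-zero exceptions; their
 * results are never read back.
 */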
void pme_init_all_work(struct pme_solve_work_t** work, int nthread, int nkx)
{
    /* Use fft5d, order after FFT is y major, z, x minor */

    snew(*work, nthread);
    /* Allocate the work arrays thread local to optimize memory access */
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
    {
        try
        {
            realloc_work(&((*work)[thread]), nkx);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}
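/* Usage sketch (illustration, not from the original source): one work
 * struct is allocated per OpenMP thread, and each thread allocates its own
 * arrays so the memory it touches is placed locally, e.g.
 *   struct pme_solve_work_t* solveWork = nullptr;
 *   pme_init_all_work(&solveWork, nthread, pme->nkx);
 *   ... solve, then reduce with get_pme_ener_vir_q() ...
 *   pme_free_all_work(&solveWork, nthread);
 */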
static void free_work(struct pme_solve_work_t* work)
{
    if (work)
    {
        sfree(work->mhx);
        sfree(work->mhy);
        sfree(work->mhz);
        sfree(work->m2);
        sfree_aligned(work->denom);
        sfree_aligned(work->tmp1);
        sfree_aligned(work->tmp2);
        sfree_aligned(work->eterm);
        sfree(work->m2inv);
    }
}
void pme_free_all_work(struct pme_solve_work_t** work, int nthread)
{
    if (*work)
    {
        for (int thread = 0; thread < nthread; thread++)
        {
            free_work(&(*work)[thread]);
        }
    }
    sfree(*work);
    *work = nullptr;
}
void get_pme_ener_vir_q(pme_solve_work_t* work, int nthread, PmeOutput* output)
{
    GMX_ASSERT(output != nullptr, "Need valid output buffer");
    /* This function sums output over threads and should therefore
     * only be called after thread synchronization.
     */
    output->coulombEnergy_ = work[0].energy_q;
    copy_mat(work[0].vir_q, output->coulombVirial_);

    for (int thread = 1; thread < nthread; thread++)
    {
        output->coulombEnergy_ += work[thread].energy_q;
        m_add(output->coulombVirial_, work[thread].vir_q, output->coulombVirial_);
    }
}
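/* Reduction sketch (illustration, not from the original source): thread 0
 * seeds the output and the remaining threads accumulate, so
 *   output->coulombEnergy_ = sum over t of work[t].energy_q
 * and likewise for the virial via m_add().
 */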
void get_pme_ener_vir_lj(pme_solve_work_t* work, int nthread, PmeOutput* output)
{
    GMX_ASSERT(output != nullptr, "Need valid output buffer");
    /* This function sums output over threads and should therefore
     * only be called after thread synchronization.
     */
    output->lennardJonesEnergy_ = work[0].energy_lj;
    copy_mat(work[0].vir_lj, output->lennardJonesVirial_);

    for (int thread = 1; thread < nthread; thread++)
    {
        output->lennardJonesEnergy_ += work[thread].energy_lj;
        m_add(output->lennardJonesVirial_, work[thread].vir_lj, output->lennardJonesVirial_);
    }
}
#if defined PME_SIMD_SOLVE
/* Calculate exponentials through SIMD */
inline static void calc_exponentials_q(int /*unused*/,
                                       int /*unused*/,
                                       real                     f,
                                       ArrayRef<const SimdReal> d_aligned,
                                       ArrayRef<const SimdReal> r_aligned,
                                       ArrayRef<SimdReal>       e_aligned)
{
    SimdReal f_simd(f);
    SimdReal tmp_d1, tmp_r, tmp_e;

    /* We only need to calculate from start. But since start is 0 or 1
     * and we want to use aligned loads/stores, we always start from 0.
     */
    GMX_ASSERT(d_aligned.size() == r_aligned.size(), "d and r must have same size");
    GMX_ASSERT(d_aligned.size() == e_aligned.size(), "d and e must have same size");
    for (size_t kx = 0; kx != d_aligned.size(); ++kx)
    {
        tmp_d1        = d_aligned[kx];
        tmp_r         = r_aligned[kx];
        tmp_r         = gmx::exp(tmp_r);
        tmp_e         = f_simd / tmp_d1;
        tmp_e         = tmp_e * tmp_r;
        e_aligned[kx] = tmp_e;
    }
}
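/* Per element this computes (illustration, not from the original source)
 *   e[kx] = f * exp(r[kx]) / d[kx]
 * matching the scalar fallback below; the SIMD version simply handles
 * GMX_SIMD_REAL_WIDTH elements per iteration, padded tail included.
 */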
#else
inline static void
calc_exponentials_q(int start, int end, real f, ArrayRef<real> d, ArrayRef<real> r, ArrayRef<real> e)
{
    GMX_ASSERT(d.size() == r.size(), "d and r must have same size");
    GMX_ASSERT(d.size() == e.size(), "d and e must have same size");
    int kx;
    for (kx = start; kx < end; kx++)
    {
        d[kx] = 1.0 / d[kx];
    }

    for (kx = start; kx < end; kx++)
    {
        r[kx] = std::exp(r[kx]);
    }

    for (kx = start; kx < end; kx++)
    {
        e[kx] = f * r[kx] * d[kx];
    }
}
#endif
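/* Design note (illustration, not from the original source): the scalar
 * fallback splits the reciprocal, the exponential and the final multiply
 * into three simple loops, which keeps each loop easy for the compiler to
 * auto-vectorize even without the hand-written SIMD path above.
 */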
#if defined PME_SIMD_SOLVE
/* Calculate exponentials through SIMD */
inline static void calc_exponentials_lj(int /*unused*/,
                                        int /*unused*/,
                                        ArrayRef<SimdReal> r_aligned,
                                        ArrayRef<SimdReal> factor_aligned,
                                        ArrayRef<SimdReal> d_aligned)
{
    SimdReal       tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
    const SimdReal sqr_PI = sqrt(SimdReal(M_PI));

    GMX_ASSERT(d_aligned.size() == r_aligned.size(), "d and r must have same size");
    GMX_ASSERT(d_aligned.size() == factor_aligned.size(), "d and factor must have same size");
    for (size_t kx = 0; kx != d_aligned.size(); ++kx)
    {
        /* We only need to calculate from start. But since start is 0 or 1
         * and we want to use aligned loads/stores, we always start from 0.
         */
        tmp_d              = d_aligned[kx];
        d_inv              = SimdReal(1.0) / tmp_d;
        d_aligned[kx]      = d_inv;
        tmp_r              = r_aligned[kx];
        tmp_r              = gmx::exp(tmp_r);
        r_aligned[kx]      = tmp_r;
        tmp_mk             = factor_aligned[kx];
        tmp_fac            = sqr_PI * tmp_mk * erfc(tmp_mk);
        factor_aligned[kx] = tmp_fac;
    }
}
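/* What these kernels compute, per element (illustration, not from the
 * original source):
 *   d <- 1/d,   r <- exp(r),   factor/tmp2 <- sqrt(pi) * mk * erfc(mk)
 * with mk = sqrt(factor * m2k) on input. These are the pieces of the
 * LJ-PME (r^-6 dispersion) reciprocal-space kernel
 *   g(b) = (1 - 2 b^2) exp(-b^2) + 2 b^3 sqrt(pi) erfc(b),
 * assembled later as eterm = -((1 - 2 m2k) * tmp1 + 2 m2k * tmp2).
 */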
#else
inline static void
calc_exponentials_lj(int start, int end, ArrayRef<real> r, ArrayRef<real> tmp2, ArrayRef<real> d)
{
    int  kx;
    real mk;
    GMX_ASSERT(d.size() == r.size(), "d and r must have same size");
    GMX_ASSERT(d.size() == tmp2.size(), "d and tmp2 must have same size");
    for (kx = start; kx < end; kx++)
    {
        d[kx] = 1.0 / d[kx];
    }

    for (kx = start; kx < end; kx++)
    {
        r[kx] = std::exp(r[kx]);
    }

    for (kx = start; kx < end; kx++)
    {
        mk       = tmp2[kx];
        tmp2[kx] = sqrt(M_PI) * mk * std::erfc(mk);
    }
}
#endif
#if defined PME_SIMD_SOLVE
using PME_T = SimdReal;
#else
using PME_T = real;
#endif
int solve_pme_yzx(const gmx_pme_t* pme, t_complex* grid, real vol,
                  gmx_bool bEnerVir, int nthread, int thread)
{
    /* do recip sum over local cells in grid */
    /* y major, z middle, x minor or continuous */
    t_complex* p0;
    int        kx, ky, kz, maxkx, maxky;
    int        nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
    real       mx, my, mz;
    real       ewaldcoeff = pme->ewaldcoeff_q;
    real       factor     = M_PI * M_PI / (ewaldcoeff * ewaldcoeff);
    real       ets2, struct2, vfactor, ets2vf;
    real       d1, d2, energy = 0;
    real       by, bz;
    real       virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
    real       rxx, ryx, ryy, rzx, rzy, rzz;
    struct pme_solve_work_t* work;
    real *     mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
    real       mhxk, mhyk, mhzk, m2k;
    real       corner_fac;
    ivec       complex_order;
    ivec       local_ndata, local_offset, local_size;
    real       elfac;

    elfac = ONE_4PI_EPS0 / pme->epsilon_r;
    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA], complex_order, local_ndata,
                                      local_offset, local_size);
    rxx = pme->recipbox[XX][XX];
    ryx = pme->recipbox[YY][XX];
    ryy = pme->recipbox[YY][YY];
    rzx = pme->recipbox[ZZ][XX];
    rzy = pme->recipbox[ZZ][YY];
    rzz = pme->recipbox[ZZ][ZZ];

    GMX_ASSERT(rxx != 0.0, "Someone broke the reciprocal box again");
    maxkx = (nx + 1) / 2;
    maxky = (ny + 1) / 2;

    work = &pme->solve_work[thread];

    mhx   = work->mhx;
    mhy   = work->mhy;
    mhz   = work->mhz;
    m2    = work->m2;
    denom = work->denom;
    tmp1  = work->tmp1;
    eterm = work->eterm;
    m2inv = work->m2inv;
    iyz0 = local_ndata[YY] * local_ndata[ZZ] * thread / nthread;
    iyz1 = local_ndata[YY] * local_ndata[ZZ] * (thread + 1) / nthread;
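    /* Thread decomposition example (illustration, not from the original
     * source): with local_ndata[YY]*local_ndata[ZZ] = 100 y/z lines and
     * nthread = 4, thread 1 gets iyz in [25, 50). The integer arithmetic
     * tiles [0, N) without gaps or overlap for any nthread.
     */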
    for (iyz = iyz0; iyz < iyz1; iyz++)
    {
        iy = iyz / local_ndata[ZZ];
        iz = iyz - iy * local_ndata[ZZ];

        ky = iy + local_offset[YY];
        if (ky < maxky)
        {
            my = ky;
        }
        else
        {
            my = (ky - ny);
        }
        by = M_PI * vol * pme->bsp_mod[YY][ky];
        kz = iz + local_offset[ZZ];

        mz = kz;

        bz = pme->bsp_mod[ZZ][kz];
        /* 0.5 correction for corner points */
        corner_fac = 1;
        if (kz == 0 || kz == (nz + 1) / 2)
        {
            corner_fac = 0.5;
        }
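        /* Why 0.5 here (illustration, not from the original source): the
         * real-to-complex FFT stores only half of the z frequencies, and the
         * energy/virial sums below implicitly count each stored point twice
         * to cover the missing conjugate half. The kz == 0 and kz == nz/2
         * planes are their own conjugates, so they get half weight.
         */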
        p0 = grid + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
        /* We should skip the k-space point (0,0,0) */
        /* Note that since here x is the minor index, local_offset[XX]=0 */
        if (local_offset[XX] > 0 || ky > 0 || kz > 0)
        {
            kxstart = local_offset[XX];
        }
        else
        {
            kxstart = local_offset[XX] + 1;
            p0++;
        }
        kxend = local_offset[XX] + local_ndata[XX];
        if (bEnerVir)
        {
            /* More expensive inner loop, especially because of the storage
             * of the mh elements in arrays.
             * Because x is the minor grid index, all mh elements
             * depend on kx for triclinic unit cells.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for (kx = kxstart; kx < maxkx; kx++)
            {
                mx = kx;

                mhxk = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                mhx[kx]   = mhxk;
                mhy[kx]   = mhyk;
                mhz[kx]   = mhzk;
                m2[kx]    = m2k;
                denom[kx] = m2k * bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
            }
            for (kx = maxkx; kx < kxend; kx++)
            {
                mx = (kx - nx);

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                mhx[kx]   = mhxk;
                mhy[kx]   = mhyk;
                mhz[kx]   = mhzk;
                m2[kx]    = m2k;
                denom[kx] = m2k * bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
            }
            for (kx = kxstart; kx < kxend; kx++)
            {
                m2inv[kx] = 1.0 / m2[kx];
            }
            calc_exponentials_q(kxstart, kxend, elfac,
                                ArrayRef<PME_T>(denom, denom + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                ArrayRef<PME_T>(tmp1, tmp1 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                ArrayRef<PME_T>(eterm, eterm + roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
            for (kx = kxstart; kx < kxend; kx++, p0++)
            {
                d1 = p0->re;
                d2 = p0->im;

                p0->re = d1 * eterm[kx];
                p0->im = d2 * eterm[kx];

                struct2 = 2.0 * (d1 * d1 + d2 * d2);

                tmp1[kx] = eterm[kx] * struct2;
            }
            for (kx = kxstart; kx < kxend; kx++)
            {
                ets2    = corner_fac * tmp1[kx];
                vfactor = (factor * m2[kx] + 1.0) * 2.0 * m2inv[kx];
                energy += ets2;

                ets2vf = ets2 * vfactor;
                virxx += ets2vf * mhx[kx] * mhx[kx] - ets2;
                virxy += ets2vf * mhx[kx] * mhy[kx];
                virxz += ets2vf * mhx[kx] * mhz[kx];
                viryy += ets2vf * mhy[kx] * mhy[kx] - ets2;
                viryz += ets2vf * mhy[kx] * mhz[kx];
                virzz += ets2vf * mhz[kx] * mhz[kx] - ets2;
            }
        }
        else
        {
            /* We don't need to calculate the energy and the virial.
             * In this case the triclinic overhead is small.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for (kx = kxstart; kx < maxkx; kx++)
            {
                mx = kx;

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                denom[kx] = m2k * bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
            }
            for (kx = maxkx; kx < kxend; kx++)
            {
                mx = (kx - nx);

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                denom[kx] = m2k * bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
            }
            calc_exponentials_q(kxstart, kxend, elfac,
                                ArrayRef<PME_T>(denom, denom + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                ArrayRef<PME_T>(tmp1, tmp1 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                ArrayRef<PME_T>(eterm, eterm + roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
            for (kx = kxstart; kx < kxend; kx++, p0++)
            {
                d1 = p0->re;
                d2 = p0->im;

                p0->re = d1 * eterm[kx];
                p0->im = d2 * eterm[kx];
            }
        }
    }
    if (bEnerVir)
    {
        /* Update virial with local values.
         * The virial is symmetric by definition.
         * this virial seems ok for isotropic scaling, but I'm
         * experiencing problems on semiisotropic membranes.
         * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
         */
        work->vir_q[XX][XX] = 0.25 * virxx;
        work->vir_q[YY][YY] = 0.25 * viryy;
        work->vir_q[ZZ][ZZ] = 0.25 * virzz;
        work->vir_q[XX][YY] = work->vir_q[YY][XX] = 0.25 * virxy;
        work->vir_q[XX][ZZ] = work->vir_q[ZZ][XX] = 0.25 * virxz;
        work->vir_q[YY][ZZ] = work->vir_q[ZZ][YY] = 0.25 * viryz;
        /* This energy should be corrected for a charged system */
        work->energy_q = 0.5 * energy;
    }
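    /* Energy bookkeeping (illustration, not from the original source):
     * struct2 = 2 (d1^2 + d2^2) already doubles |S(k)|^2 to cover the
     * conjugate half of the r2c grid, and corner_fac halves the
     * self-conjugate planes, so the usual Ewald reciprocal-space prefactor
     * of 1/2 is applied exactly once here.
     */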
    /* Return the loop count */
    return local_ndata[YY] * local_ndata[XX];
}
int solve_pme_lj_yzx(const gmx_pme_t* pme, t_complex** grid, gmx_bool bLB, real vol,
                     gmx_bool bEnerVir, int nthread, int thread)
{
    /* do recip sum over local cells in grid */
    /* y major, z middle, x minor or continuous */
    t_complex *p0, *p1;
    int        ig, gcount;
    int        kx, ky, kz, maxkx, maxky;
    int        nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend;
    real       mx, my, mz;
    real       ewaldcoeff = pme->ewaldcoeff_lj;
    real       factor     = M_PI * M_PI / (ewaldcoeff * ewaldcoeff);
    real       ets2, ets2vf, corner_fac, scale, struct2;
    real       eterm, vterm, d1, d2, energy = 0;
    real       by, bz;
    real       virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
    real       rxx, ryx, ryy, rzx, rzy, rzz;
    real *     mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2;
    real       mhxk, mhyk, mhzk, m2k;
    struct pme_solve_work_t* work;
    ivec       complex_order;
    ivec       local_ndata, local_offset, local_size;
    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A], complex_order, local_ndata,
                                      local_offset, local_size);
    rxx = pme->recipbox[XX][XX];
    ryx = pme->recipbox[YY][XX];
    ryy = pme->recipbox[YY][YY];
    rzx = pme->recipbox[ZZ][XX];
    rzy = pme->recipbox[ZZ][YY];
    rzz = pme->recipbox[ZZ][ZZ];
    maxkx = (nx + 1) / 2;
    maxky = (ny + 1) / 2;

    work = &pme->solve_work[thread];

    mhx   = work->mhx;
    mhy   = work->mhy;
    mhz   = work->mhz;
    m2    = work->m2;
    denom = work->denom;
    tmp1  = work->tmp1;
    tmp2  = work->tmp2;
    iyz0 = local_ndata[YY] * local_ndata[ZZ] * thread / nthread;
    iyz1 = local_ndata[YY] * local_ndata[ZZ] * (thread + 1) / nthread;
    for (iyz = iyz0; iyz < iyz1; iyz++)
    {
        iy = iyz / local_ndata[ZZ];
        iz = iyz - iy * local_ndata[ZZ];

        ky = iy + local_offset[YY];
        if (ky < maxky)
        {
            my = ky;
        }
        else
        {
            my = (ky - ny);
        }
        by = 3.0 * vol * pme->bsp_mod[YY][ky]
             / (M_PI * sqrt(M_PI) * ewaldcoeff * ewaldcoeff * ewaldcoeff);
        kz = iz + local_offset[ZZ];

        mz = kz;

        bz = pme->bsp_mod[ZZ][kz];
        /* 0.5 correction for corner points */
        corner_fac = 1;
        if (kz == 0 || kz == (nz + 1) / 2)
        {
            corner_fac = 0.5;
        }
        kxstart = local_offset[XX];
        kxend   = local_offset[XX] + local_ndata[XX];
        if (bEnerVir)
        {
            /* More expensive inner loop, especially because of the
             * storage of the mh elements in arrays. Because x is the
             * minor grid index, all mh elements depend on kx for
             * triclinic unit cells.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for (kx = kxstart; kx < maxkx; kx++)
            {
                mx = kx;

                mhxk = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                mhx[kx]   = mhxk;
                mhy[kx]   = mhyk;
                mhz[kx]   = mhzk;
                m2[kx]    = m2k;
                denom[kx] = bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
                tmp2[kx]  = sqrt(factor * m2k);
            }
            for (kx = maxkx; kx < kxend; kx++)
            {
                mx = (kx - nx);

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                mhx[kx]   = mhxk;
                mhy[kx]   = mhyk;
                mhz[kx]   = mhzk;
                m2[kx]    = m2k;
                denom[kx] = bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
                tmp2[kx]  = sqrt(factor * m2k);
            }
            /* Clear padding elements to avoid (harmless) fp exceptions */
            const int kxendSimd = roundUpToMultipleOfFactor<c_simdWidth>(kxend);
            for (; kx < kxendSimd; kx++)
            {
                tmp1[kx] = 0;
                tmp2[kx] = 0;
            }
            calc_exponentials_lj(kxstart, kxend,
                                 ArrayRef<PME_T>(tmp1, tmp1 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                 ArrayRef<PME_T>(tmp2, tmp2 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                 ArrayRef<PME_T>(denom, denom + roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
            for (kx = kxstart; kx < kxend; kx++)
            {
                m2k      = factor * m2[kx];
                eterm    = -((1.0 - 2.0 * m2k) * tmp1[kx] + 2.0 * m2k * tmp2[kx]);
                vterm    = 3.0 * (-tmp1[kx] + tmp2[kx]);
                tmp1[kx] = eterm * denom[kx];
                tmp2[kx] = vterm * denom[kx];
            }
            if (!bLB)
            {
                p0 = grid[0] + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
                for (kx = kxstart; kx < kxend; kx++, p0++)
                {
                    d1 = p0->re;
                    d2 = p0->im;

                    eterm  = tmp1[kx];
                    vterm  = tmp2[kx];
                    p0->re = d1 * eterm;
                    p0->im = d2 * eterm;

                    struct2 = 2.0 * (d1 * d1 + d2 * d2);

                    tmp1[kx] = eterm * struct2;
                    tmp2[kx] = vterm * struct2;
                }
            }
            else
            {
                real* struct2 = denom;
                real  str2;

                for (kx = kxstart; kx < kxend; kx++)
                {
                    struct2[kx] = 0.0;
                }
                /* Due to symmetry we only need to calculate 4 of the 7 terms */
                for (ig = 0; ig <= 3; ++ig)
                {
                    p0    = grid[ig] + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
                    p1    = grid[6 - ig] + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
                    scale = 2.0 * lb_scale_factor_symm[ig];
                    for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1)
                    {
                        struct2[kx] += scale * (p0->re * p1->re + p0->im * p1->im);
                    }
                }
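                /* Illustration (not from the original source): with LB
                 * combination rules the dispersion coefficients are expanded
                 * over 7 grids, and |S(k)|^2 becomes a sum over products of
                 * grid ig with grid 6-ig. The (ig, 6-ig) and (6-ig, ig)
                 * contributions are equal, so only ig = 0..3 is computed,
                 * with scale = 2.0 * lb_scale_factor_symm[ig] supplying the
                 * pair weights.
                 */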
                for (ig = 0; ig <= 6; ++ig)
                {
                    p0 = grid[ig] + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
                    for (kx = kxstart; kx < kxend; kx++, p0++)
                    {
                        d1 = p0->re;
                        d2 = p0->im;

                        eterm  = tmp1[kx];
                        p0->re = d1 * eterm;
                        p0->im = d2 * eterm;
                    }
                }
                for (kx = kxstart; kx < kxend; kx++)
                {
                    eterm    = tmp1[kx];
                    vterm    = tmp2[kx];
                    str2     = struct2[kx];
                    tmp1[kx] = eterm * str2;
                    tmp2[kx] = vterm * str2;
                }
            }
            for (kx = kxstart; kx < kxend; kx++)
            {
                ets2  = corner_fac * tmp1[kx];
                vterm = 2.0 * factor * tmp2[kx];
                energy += ets2;

                ets2vf = corner_fac * vterm;
                virxx += ets2vf * mhx[kx] * mhx[kx] - ets2;
                virxy += ets2vf * mhx[kx] * mhy[kx];
                virxz += ets2vf * mhx[kx] * mhz[kx];
                viryy += ets2vf * mhy[kx] * mhy[kx] - ets2;
                viryz += ets2vf * mhy[kx] * mhz[kx];
                virzz += ets2vf * mhz[kx] * mhz[kx] - ets2;
            }
        }
        else
        {
            /* We don't need to calculate the energy and the virial.
             * In this case the triclinic overhead is small.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for (kx = kxstart; kx < maxkx; kx++)
            {
                mx = kx;

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                m2[kx]    = m2k;
                denom[kx] = bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
                tmp2[kx]  = sqrt(factor * m2k);
            }
            for (kx = maxkx; kx < kxend; kx++)
            {
                mx = (kx - nx);

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
                m2[kx]    = m2k;
                denom[kx] = bz * by * pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor * m2k;
                tmp2[kx]  = sqrt(factor * m2k);
            }
            /* Clear padding elements to avoid (harmless) fp exceptions */
            const int kxendSimd = roundUpToMultipleOfFactor<c_simdWidth>(kxend);
            for (; kx < kxendSimd; kx++)
            {
                tmp1[kx] = 0;
                tmp2[kx] = 0;
            }
            calc_exponentials_lj(kxstart, kxend,
                                 ArrayRef<PME_T>(tmp1, tmp1 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                 ArrayRef<PME_T>(tmp2, tmp2 + roundUpToMultipleOfFactor<c_simdWidth>(kxend)),
                                 ArrayRef<PME_T>(denom, denom + roundUpToMultipleOfFactor<c_simdWidth>(kxend)));
            for (kx = kxstart; kx < kxend; kx++)
            {
                m2k      = factor * m2[kx];
                eterm    = -((1.0 - 2.0 * m2k) * tmp1[kx] + 2.0 * m2k * tmp2[kx]);
                tmp1[kx] = eterm * denom[kx];
            }
            gcount = (bLB ? 7 : 1);
            for (ig = 0; ig < gcount; ++ig)
            {
                p0 = grid[ig] + iy * local_size[ZZ] * local_size[XX] + iz * local_size[XX];
                for (kx = kxstart; kx < kxend; kx++, p0++)
                {
                    d1 = p0->re;
                    d2 = p0->im;

                    p0->re = d1 * tmp1[kx];
                    p0->im = d2 * tmp1[kx];
                }
            }
        }
    }
    if (bEnerVir)
    {
        work->vir_lj[XX][XX] = 0.25 * virxx;
        work->vir_lj[YY][YY] = 0.25 * viryy;
        work->vir_lj[ZZ][ZZ] = 0.25 * virzz;
        work->vir_lj[XX][YY] = work->vir_lj[YY][XX] = 0.25 * virxy;
        work->vir_lj[XX][ZZ] = work->vir_lj[ZZ][XX] = 0.25 * virxz;
        work->vir_lj[YY][ZZ] = work->vir_lj[ZZ][YY] = 0.25 * viryz;
        /* This energy should be corrected for a charged system */
        work->energy_lj = 0.5 * energy;
    }
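    /* Note (illustration, not from the original source): the factor 0.5
     * plays the same role as in solve_pme_yzx(); the doubling for the
     * conjugate half of the grid is already carried in struct2, and the
     * charged-system remark is inherited from the Coulomb path.
     */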
    /* Return the loop count */
    return local_ndata[YY] * local_ndata[XX];
}