/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/* TODO find out what this file should be called */
#include "gmxpre.h"

#include "pme_grid.h"

#include "config.h"

#include <cstdlib>

#include "gromacs/ewald/pme.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/math/vec.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"

#include "pme_internal.h"

#ifdef DEBUG_PME
#    include "gromacs/fileio/pdbio.h"
#    include "gromacs/utility/cstringutil.h"
#    include "gromacs/utility/futil.h"
#endif

#include "pme_simd.h"
/* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
 * to preserve alignment.
 */
#define GMX_CACHE_SEP 64
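
/* Illustration of how this separator is used (see pmegrids_init below):
 * grid_all is allocated with nthread*gridsize + (nthread+1)*GMX_CACHE_SEP
 * reals, and thread t's sub-grid starts at
 *     grid_all + GMX_CACHE_SEP + t*(gridsize + GMX_CACHE_SEP)
 * so the sub-grids of neighboring threads are separated by a pad of 64
 * reals and never share a cache line.
 */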
void gmx_sum_qgrid_dd(gmx_pme_t* pme, real* grid, const int direction)
{
#if GMX_MPI
    pme_overlap_t* overlap;
    int            send_index0, send_nindex;
    int            recv_index0, recv_nindex;
    MPI_Status     stat;
    int            i, j, k, ix, iy, iz, icnt;
    int            send_id, recv_id, datasize;
    real*          p;
    real *sendptr, *recvptr;

    /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
    overlap = &pme->overlap[1];

    for (size_t ipulse = 0; ipulse < overlap->comm_data.size(); ipulse++)
    {
        /* Since we have already (un)wrapped the overlap in the z-dimension,
         * we only have to communicate 0 to nkz (not pmegrid_nz).
         */
        if (direction == GMX_SUM_GRID_FORWARD)
        {
            send_id     = overlap->comm_data[ipulse].send_id;
            recv_id     = overlap->comm_data[ipulse].recv_id;
            send_index0 = overlap->comm_data[ipulse].send_index0;
            send_nindex = overlap->comm_data[ipulse].send_nindex;
            recv_index0 = overlap->comm_data[ipulse].recv_index0;
            recv_nindex = overlap->comm_data[ipulse].recv_nindex;
        }
        else
        {
            /* Backward direction: swap the send and receive roles */
            send_id     = overlap->comm_data[ipulse].recv_id;
            recv_id     = overlap->comm_data[ipulse].send_id;
            send_index0 = overlap->comm_data[ipulse].recv_index0;
            send_nindex = overlap->comm_data[ipulse].recv_nindex;
            recv_index0 = overlap->comm_data[ipulse].send_index0;
            recv_nindex = overlap->comm_data[ipulse].send_nindex;
        }

        /* Copy data to contiguous send buffer */
        if (debug)
        {
            fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
                    pme->nodeid, overlap->nodeid, send_id, pme->pmegrid_start_iy,
                    send_index0 - pme->pmegrid_start_iy,
                    send_index0 - pme->pmegrid_start_iy + send_nindex);
        }
        icnt = 0;
        for (i = 0; i < pme->pmegrid_nx; i++)
        {
            ix = i;
            for (j = 0; j < send_nindex; j++)
            {
                iy = j + send_index0 - pme->pmegrid_start_iy;
                for (k = 0; k < pme->nkz; k++)
                {
                    iz = k;
                    overlap->sendbuf[icnt++] =
                            grid[ix * (pme->pmegrid_ny * pme->pmegrid_nz) + iy * (pme->pmegrid_nz) + iz];
                }
            }
        }

        datasize = pme->pmegrid_nx * pme->nkz;

        MPI_Sendrecv(overlap->sendbuf.data(), send_nindex * datasize, GMX_MPI_REAL, send_id, ipulse,
                     overlap->recvbuf.data(), recv_nindex * datasize, GMX_MPI_REAL, recv_id, ipulse,
                     overlap->mpi_comm, &stat);

        /* Get data from contiguous recv buffer */
        if (debug)
        {
            fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
                    pme->nodeid, overlap->nodeid, recv_id, pme->pmegrid_start_iy,
                    recv_index0 - pme->pmegrid_start_iy,
                    recv_index0 - pme->pmegrid_start_iy + recv_nindex);
        }
        icnt = 0;
        for (i = 0; i < pme->pmegrid_nx; i++)
        {
            ix = i;
            for (j = 0; j < recv_nindex; j++)
            {
                iy = j + recv_index0 - pme->pmegrid_start_iy;
                for (k = 0; k < pme->nkz; k++)
                {
                    iz = k;
                    if (direction == GMX_SUM_GRID_FORWARD)
                    {
                        grid[ix * (pme->pmegrid_ny * pme->pmegrid_nz) + iy * (pme->pmegrid_nz) + iz] +=
                                overlap->recvbuf[icnt++];
                    }
                    else
                    {
                        grid[ix * (pme->pmegrid_ny * pme->pmegrid_nz) + iy * (pme->pmegrid_nz) + iz] =
                                overlap->recvbuf[icnt++];
                    }
                }
            }
        }
    }

    /* Major dimension is easier, no copying required,
     * but we might have to sum to a separate array.
     * Since we don't copy, we have to communicate up to pmegrid_nz,
     * not nkz as for the minor direction.
     */
    overlap = &pme->overlap[0];

    for (size_t ipulse = 0; ipulse < overlap->comm_data.size(); ipulse++)
    {
        if (direction == GMX_SUM_GRID_FORWARD)
        {
            send_id     = overlap->comm_data[ipulse].send_id;
            recv_id     = overlap->comm_data[ipulse].recv_id;
            send_index0 = overlap->comm_data[ipulse].send_index0;
            send_nindex = overlap->comm_data[ipulse].send_nindex;
            recv_index0 = overlap->comm_data[ipulse].recv_index0;
            recv_nindex = overlap->comm_data[ipulse].recv_nindex;
            recvptr     = overlap->recvbuf.data();
        }
        else
        {
            send_id     = overlap->comm_data[ipulse].recv_id;
            recv_id     = overlap->comm_data[ipulse].send_id;
            send_index0 = overlap->comm_data[ipulse].recv_index0;
            send_nindex = overlap->comm_data[ipulse].recv_nindex;
            recv_index0 = overlap->comm_data[ipulse].send_index0;
            recv_nindex = overlap->comm_data[ipulse].send_nindex;
            recvptr = grid + (recv_index0 - pme->pmegrid_start_ix) * (pme->pmegrid_ny * pme->pmegrid_nz);
        }

        sendptr  = grid + (send_index0 - pme->pmegrid_start_ix) * (pme->pmegrid_ny * pme->pmegrid_nz);
        datasize = pme->pmegrid_ny * pme->pmegrid_nz;

        if (debug)
        {
            fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
                    pme->nodeid, overlap->nodeid, send_id, pme->pmegrid_start_ix,
                    send_index0 - pme->pmegrid_start_ix,
                    send_index0 - pme->pmegrid_start_ix + send_nindex);
            fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
                    pme->nodeid, overlap->nodeid, recv_id, pme->pmegrid_start_ix,
                    recv_index0 - pme->pmegrid_start_ix,
                    recv_index0 - pme->pmegrid_start_ix + recv_nindex);
        }

        MPI_Sendrecv(sendptr, send_nindex * datasize, GMX_MPI_REAL, send_id, ipulse, recvptr,
                     recv_nindex * datasize, GMX_MPI_REAL, recv_id, ipulse, overlap->mpi_comm, &stat);

        /* ADD data from contiguous recv buffer */
        if (direction == GMX_SUM_GRID_FORWARD)
        {
            p = grid + (recv_index0 - pme->pmegrid_start_ix) * (pme->pmegrid_ny * pme->pmegrid_nz);
            for (i = 0; i < recv_nindex * datasize; i++)
            {
                p[i] += overlap->recvbuf[i];
            }
        }
    }
#else  /* GMX_MPI */
    GMX_UNUSED_VALUE(pme);
    GMX_UNUSED_VALUE(grid);
    GMX_UNUSED_VALUE(direction);

    GMX_RELEASE_ASSERT(false, "gmx_sum_qgrid_dd() should not be called without MPI");
#endif /* GMX_MPI */
}
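
/* All flat grid accesses above use the same x-major layout: element
 * (ix, iy, iz) lives at index ix*(pmegrid_ny*pmegrid_nz) + iy*pmegrid_nz + iz.
 * For example, with pmegrid_ny = 4 and pmegrid_nz = 8, the point (2, 3, 5)
 * maps to 2*32 + 3*8 + 5 = 93.
 */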
int copy_pmegrid_to_fftgrid(const gmx_pme_t* pme, const real* pmegrid, real* fftgrid, int grid_index)
{
    ivec local_fft_ndata, local_fft_offset, local_fft_size;
    ivec local_pme_size;
    int  ix, iy, iz;
    int  pmeidx, fftidx;

    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index], local_fft_ndata, local_fft_offset,
                                   local_fft_size);

    local_pme_size[0] = pme->pmegrid_nx;
    local_pme_size[1] = pme->pmegrid_ny;
    local_pme_size[2] = pme->pmegrid_nz;

    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
       the offset is identical, and the PME grid always has more data (due to overlap)
     */
#ifdef DEBUG_PME
    FILE* fp;
    FILE* fp2;
    char  fn[STRLEN];
    real  val;
    sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
    fp = gmx_ffopen(fn, "w");
    sprintf(fn, "pmegrid%d.txt", pme->nodeid);
    fp2 = gmx_ffopen(fn, "w");
#endif
    for (ix = 0; ix < local_fft_ndata[XX]; ix++)
    {
        for (iy = 0; iy < local_fft_ndata[YY]; iy++)
        {
            for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
            {
                pmeidx = ix * (local_pme_size[YY] * local_pme_size[ZZ])
                         + iy * (local_pme_size[ZZ]) + iz;
                fftidx = ix * (local_fft_size[YY] * local_fft_size[ZZ])
                         + iy * (local_fft_size[ZZ]) + iz;
                fftgrid[fftidx] = pmegrid[pmeidx];
#ifdef DEBUG_PME
                val = 100 * pmegrid[pmeidx];
                if (pmegrid[pmeidx] != 0)
                {
                    gmx_fprintf_pdb_atomline(fp, epdbATOM, pmeidx, "CA", ' ', "GLY", ' ', pmeidx,
                                             ' ', 5.0 * ix, 5.0 * iy, 5.0 * iz, 1.0, val, "");
                }
                if (pmegrid[pmeidx] != 0)
                {
                    fprintf(fp2, "%-12s %5d %5d %5d %12.5e\n", "qgrid",
                            pme->pmegrid_start_ix + ix, pme->pmegrid_start_iy + iy,
                            pme->pmegrid_start_iz + iz, pmegrid[pmeidx]);
                }
#endif
            }
        }
    }
#ifdef DEBUG_PME
    gmx_ffclose(fp);
    gmx_ffclose(fp2);
#endif

    return 0;
}
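
/* Note that the pmeidx and fftidx expressions above differ only in their
 * strides: both grids hold the same local_fft_ndata points, but the PME grid
 * is padded to local_pme_size (the spread/gather overlap region) while the
 * FFT grid is padded to local_fft_size (the FFT library's layout), which is
 * why the copy cannot be a single memcpy.
 */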
#ifdef PME_TIME_THREADS
static gmx_cycles_t omp_cyc_start()
{
    return gmx_cycles_read();
}

static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
{
    return gmx_cycles_read() - c;
}
#endif
int copy_fftgrid_to_pmegrid(struct gmx_pme_t* pme,
                            const real*       fftgrid,
                            real*             pmegrid,
                            int               grid_index,
                            int               nthread,
                            int               thread)
{
    ivec local_fft_ndata, local_fft_offset, local_fft_size;
    ivec local_pme_size;
    int  ixy0, ixy1, ixy, ix, iy, iz;
    int  pmeidx, fftidx;
#ifdef PME_TIME_THREADS
    gmx_cycles_t  c1;
    static double cs1 = 0;
    static int    cnt = 0;
#endif

#ifdef PME_TIME_THREADS
    c1 = omp_cyc_start();
#endif
    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index], local_fft_ndata, local_fft_offset,
                                   local_fft_size);

    local_pme_size[0] = pme->pmegrid_nx;
    local_pme_size[1] = pme->pmegrid_ny;
    local_pme_size[2] = pme->pmegrid_nz;

    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
       the offset is identical, and the PME grid always has more data (due to overlap)
     */
    ixy0 = ((thread)*local_fft_ndata[XX] * local_fft_ndata[YY]) / nthread;
    ixy1 = ((thread + 1) * local_fft_ndata[XX] * local_fft_ndata[YY]) / nthread;

    for (ixy = ixy0; ixy < ixy1; ixy++)
    {
        ix = ixy / local_fft_ndata[YY];
        iy = ixy - ix * local_fft_ndata[YY];

        pmeidx = (ix * local_pme_size[YY] + iy) * local_pme_size[ZZ];
        fftidx = (ix * local_fft_size[YY] + iy) * local_fft_size[ZZ];
        for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
        {
            pmegrid[pmeidx + iz] = fftgrid[fftidx + iz];
        }
    }

#ifdef PME_TIME_THREADS
    c1 = omp_cyc_end(c1);
    cs1 += (double)c1;
    cnt++;
    if (cnt % 20 == 0)
    {
        printf("copy %.2f\n", cs1 * 1e-9);
    }
#endif

    return 0;
}
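
/* The ixy0/ixy1 computation above splits the nx*ny rows of the slab evenly
 * over the threads using integer arithmetic. For example, 10 rows over
 * nthread = 4 give the ranges [0,2), [2,5), [5,7) and [7,10), whose sizes
 * differ by at most one row.
 */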
void wrap_periodic_pmegrid(const gmx_pme_t* pme, real* pmegrid)
{
    int nx, ny, nz, pny, pnz, ny_x, overlap, ix, iy, iz;

    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    pny = pme->pmegrid_ny;
    pnz = pme->pmegrid_nz;

    overlap = pme->pme_order - 1;

    /* Add periodic overlap in z */
    for (ix = 0; ix < pme->pmegrid_nx; ix++)
    {
        for (iy = 0; iy < pme->pmegrid_ny; iy++)
        {
            for (iz = 0; iz < overlap; iz++)
            {
                pmegrid[(ix * pny + iy) * pnz + iz] += pmegrid[(ix * pny + iy) * pnz + nz + iz];
            }
        }
    }

    if (pme->nnodes_minor == 1)
    {
        for (ix = 0; ix < pme->pmegrid_nx; ix++)
        {
            for (iy = 0; iy < overlap; iy++)
            {
                for (iz = 0; iz < nz; iz++)
                {
                    pmegrid[(ix * pny + iy) * pnz + iz] += pmegrid[(ix * pny + ny + iy) * pnz + iz];
                }
            }
        }
    }

    if (pme->nnodes_major == 1)
    {
        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);

        for (ix = 0; ix < overlap; ix++)
        {
            for (iy = 0; iy < ny_x; iy++)
            {
                for (iz = 0; iz < nz; iz++)
                {
                    pmegrid[(ix * pny + iy) * pnz + iz] += pmegrid[((nx + ix) * pny + iy) * pnz + iz];
                }
            }
        }
    }
}
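
/* Wrapping example: with pme_order = 4 the overlap is 3 grid lines, so
 * charge spread onto the padded lines nz, nz+1 and nz+2 is folded back onto
 * lines 0, 1 and 2 (and similarly in y and x when that dimension is not
 * decomposed over ranks). unwrap_periodic_pmegrid below performs the inverse
 * copy before the force gather.
 */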
void unwrap_periodic_pmegrid(struct gmx_pme_t* pme, real* pmegrid)
{
    int nx, ny, nz, pny, pnz, ny_x, overlap, ix;

    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    pny = pme->pmegrid_ny;
    pnz = pme->pmegrid_nz;

    overlap = pme->pme_order - 1;

    if (pme->nnodes_major == 1)
    {
        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);

        for (ix = 0; ix < overlap; ix++)
        {
            int iy, iz;

            for (iy = 0; iy < ny_x; iy++)
            {
                for (iz = 0; iz < nz; iz++)
                {
                    pmegrid[((nx + ix) * pny + iy) * pnz + iz] = pmegrid[(ix * pny + iy) * pnz + iz];
                }
            }
        }
    }

    if (pme->nnodes_minor == 1)
    {
#pragma omp parallel for num_threads(pme->nthread) schedule(static)
        for (ix = 0; ix < pme->pmegrid_nx; ix++)
        {
            // Trivial OpenMP region that does not throw, no need for try/catch
            int iy, iz;

            for (iy = 0; iy < overlap; iy++)
            {
                for (iz = 0; iz < nz; iz++)
                {
                    pmegrid[(ix * pny + ny + iy) * pnz + iz] = pmegrid[(ix * pny + iy) * pnz + iz];
                }
            }
        }
    }

    /* Copy periodic overlap in z */
#pragma omp parallel for num_threads(pme->nthread) schedule(static)
    for (ix = 0; ix < pme->pmegrid_nx; ix++)
    {
        // Trivial OpenMP region that does not throw, no need for try/catch
        int iy, iz;

        for (iy = 0; iy < pme->pmegrid_ny; iy++)
        {
            for (iz = 0; iz < overlap; iz++)
            {
                pmegrid[(ix * pny + iy) * pnz + nz + iz] = pmegrid[(ix * pny + iy) * pnz + iz];
            }
        }
    }
}
void set_grid_alignment(int gmx_unused* pmegrid_nz, int gmx_unused pme_order)
{
#ifdef PME_SIMD4_SPREAD_GATHER
#    if !PME_4NSIMD_GATHER
    if (pme_order == 5
#        if !PME_SIMD4_UNALIGNED
        || pme_order == 4
#        endif
    )
    {
        /* Round nz up to a multiple of 4 to ensure alignment */
        *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
    }
#    endif
#endif
}
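
/* The masking expression rounds up to the next multiple of 4, e.g.
 * 17 -> 20, 18 -> 20, 20 -> 20, so every (x,y) column of the grid starts at
 * a SIMD4-aligned offset.
 */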
static void set_gridsize_alignment(int gmx_unused* gridsize, int gmx_unused pme_order)
{
#ifdef PME_SIMD4_SPREAD_GATHER
#    if !PME_4NSIMD_GATHER
    if (pme_order == 4)
    {
        /* Add extra elements to ensure aligned operations do not go
         * beyond the allocated grid size.
         * Note that for pme_order=5, the pme grid z-size alignment
         * ensures that we will not go beyond the grid size.
         */
        *gridsize += 4;
    }
#    endif
#endif
}
void pmegrid_init(pmegrid_t* grid,
                  int        cx,
                  int        cy,
                  int        cz,
                  int        x0,
                  int        y0,
                  int        z0,
                  int        x1,
                  int        y1,
                  int        z1,
                  gmx_bool   set_alignment,
                  int        pme_order,
                  real*      ptr)
{
    int nz, gridsize;

    grid->ci[XX]     = cx;
    grid->ci[YY]     = cy;
    grid->ci[ZZ]     = cz;
    grid->offset[XX] = x0;
    grid->offset[YY] = y0;
    grid->offset[ZZ] = z0;
    grid->n[XX]      = x1 - x0 + pme_order - 1;
    grid->n[YY]      = y1 - y0 + pme_order - 1;
    grid->n[ZZ]      = z1 - z0 + pme_order - 1;
    copy_ivec(grid->n, grid->s);

    nz = grid->s[ZZ];
    set_grid_alignment(&nz, pme_order);
    if (set_alignment)
    {
        grid->s[ZZ] = nz;
    }
    else if (nz != grid->s[ZZ])
    {
        gmx_incons("pmegrid_init call with an unaligned z size");
    }

    grid->order = pme_order;
    if (ptr == nullptr)
    {
        gridsize = grid->s[XX] * grid->s[YY] * grid->s[ZZ];
        set_gridsize_alignment(&gridsize, pme_order);
        snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
    }
    else
    {
        grid->grid = ptr;
    }
}
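
/* Note on the sizes set above: a block covering grid lines [x0, x1) stores
 * n[XX] = x1 - x0 + pme_order - 1 lines, because an order-p spline spreads
 * each charge over p consecutive grid lines, so every block needs
 * pme_order - 1 extra lines beyond its own interval.
 */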
static int div_round_up(int enumerator, int denominator)
{
    return (enumerator + denominator - 1) / denominator;
}
static void make_subgrid_division(const ivec n, int ovl, int nthread, ivec nsub)
{
    int   gsize_opt, gsize;
    int   nsx, nsy, nsz;
    char* env;

    gsize_opt = -1;
    for (nsx = 1; nsx <= nthread; nsx++)
    {
        if (nthread % nsx == 0)
        {
            for (nsy = 1; nsy <= nthread; nsy++)
            {
                if (nsx * nsy <= nthread && nthread % (nsx * nsy) == 0)
                {
                    nsz = nthread / (nsx * nsy);

                    /* Determine the number of grid points per thread */
                    gsize = (div_round_up(n[XX], nsx) + ovl) * (div_round_up(n[YY], nsy) + ovl)
                            * (div_round_up(n[ZZ], nsz) + ovl);

                    /* Minimize the number of grid points per thread
                     * and, secondarily, the number of cuts in minor dimensions.
                     */
                    if (gsize_opt == -1 || gsize < gsize_opt
                        || (gsize == gsize_opt && (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
                    {
                        nsub[XX]  = nsx;
                        nsub[YY]  = nsy;
                        nsub[ZZ]  = nsz;
                        gsize_opt = gsize;
                    }
                }
            }
        }
    }

    env = getenv("GMX_PME_THREAD_DIVISION");
    if (env != nullptr)
    {
        sscanf(env, "%20d %20d %20d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
    }

    if (nsub[XX] * nsub[YY] * nsub[ZZ] != nthread)
    {
        gmx_fatal(FARGS,
                  "PME grid thread division (%d x %d x %d) does not match the total number of "
                  "threads (%d)",
                  nsub[XX], nsub[YY], nsub[ZZ], nthread);
    }
}
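
/* Example of the search above: for n = (32, 32, 32), ovl = 3 and
 * nthread = 6, several factorizations (1x2x3, 2x3x1, 3x2x1, ...) tie at the
 * minimal per-thread volume
 * (ceil(32/nsx)+3)*(ceil(32/nsy)+3)*(ceil(32/nsz)+3) = 9310, and the
 * tie-breaking rules then pick nsub = 3 x 2 x 1, the division with the
 * fewest cuts in the minor (z, then y) dimensions.
 */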
void pmegrids_init(pmegrids_t* grids,
                   int         nx,
                   int         ny,
                   int         nz,
                   int         nz_base,
                   int         pme_order,
                   gmx_bool    bUseThreads,
                   int         nthread,
                   int         overlap_x,
                   int         overlap_y)
{
    ivec n, n_base;
    int  t, x, y, z, d, i, tfac;
    int  max_comm_lines = -1;

    n[XX] = nx - (pme_order - 1);
    n[YY] = ny - (pme_order - 1);
    n[ZZ] = nz - (pme_order - 1);

    copy_ivec(n, n_base);
    n_base[ZZ] = nz_base;

    pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order, nullptr);

    grids->nthread = nthread;

    make_subgrid_division(n_base, pme_order - 1, grids->nthread, grids->nc);

    if (bUseThreads)
    {
        ivec nst;
        int  gridsize;

        for (d = 0; d < DIM; d++)
        {
            nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
        }
        set_grid_alignment(&nst[ZZ], pme_order);

        if (debug)
        {
            fprintf(debug, "pmegrid thread local division: %d x %d x %d\n", grids->nc[XX],
                    grids->nc[YY], grids->nc[ZZ]);
            fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n", nx, ny, nz, nst[XX],
                    nst[YY], nst[ZZ]);
        }

        snew(grids->grid_th, grids->nthread);
        t        = 0;
        gridsize = nst[XX] * nst[YY] * nst[ZZ];
        set_gridsize_alignment(&gridsize, pme_order);
        snew_aligned(grids->grid_all,
                     grids->nthread * gridsize + (grids->nthread + 1) * GMX_CACHE_SEP,
                     SIMD4_ALIGNMENT);

        for (x = 0; x < grids->nc[XX]; x++)
        {
            for (y = 0; y < grids->nc[YY]; y++)
            {
                for (z = 0; z < grids->nc[ZZ]; z++)
                {
                    pmegrid_init(&grids->grid_th[t], x, y, z, (n[XX] * (x)) / grids->nc[XX],
                                 (n[YY] * (y)) / grids->nc[YY], (n[ZZ] * (z)) / grids->nc[ZZ],
                                 (n[XX] * (x + 1)) / grids->nc[XX], (n[YY] * (y + 1)) / grids->nc[YY],
                                 (n[ZZ] * (z + 1)) / grids->nc[ZZ], TRUE, pme_order,
                                 grids->grid_all + GMX_CACHE_SEP + t * (gridsize + GMX_CACHE_SEP));
                    t++;
                }
            }
        }
    }
    else
    {
        grids->grid_th = nullptr;
    }

    tfac = 1;
    for (d = DIM - 1; d >= 0; d--)
    {
        snew(grids->g2t[d], n[d]);
        t = 0;
        for (i = 0; i < n[d]; i++)
        {
            /* The second check should match the parameters
             * of the pmegrid_init call above.
             */
            while (t + 1 < grids->nc[d] && i >= (n[d] * (t + 1)) / grids->nc[d])
            {
                t++;
            }
            grids->g2t[d][i] = t * tfac;
        }

        tfac *= grids->nc[d];

        switch (d)
        {
            case XX: max_comm_lines = overlap_x; break;
            case YY: max_comm_lines = overlap_y; break;
            case ZZ: max_comm_lines = pme_order - 1; break;
        }
        grids->nthread_comm[d] = 0;
        while ((n[d] * grids->nthread_comm[d]) / grids->nc[d] < max_comm_lines
               && grids->nthread_comm[d] < grids->nc[d])
        {
            grids->nthread_comm[d]++;
        }
        if (debug != nullptr)
        {
            fprintf(debug, "pmegrid thread grid communication range in %c: %d\n", 'x' + d,
                    grids->nthread_comm[d]);
        }
        /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
         * work, but this is not a problematic restriction.
         */
        if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
        {
            gmx_fatal(FARGS,
                      "Too many threads for PME (%d) compared to the number of grid lines, reduce "
                      "the number of threads doing PME",
                      grids->nthread);
        }
    }
}
void pmegrids_destroy(pmegrids_t* grids)
{
    if (grids->grid.grid != nullptr)
    {
        sfree_aligned(grids->grid.grid);

        if (grids->nthread > 0)
        {
            sfree_aligned(grids->grid_all);
            sfree(grids->grid_th);
        }
        for (int d = 0; d < DIM; d++)
        {
            sfree(grids->g2t[d]);
        }
    }
}
void make_gridindex_to_localindex(int n, int local_start, int local_range, int** global_to_local, real** fraction_shift)
{
    /* Here we construct an array for looking up the grid line index and
     * fraction for particles. This is done because it is slightly
     * faster than the modulo operation and because we need to take
     * care of rounding issues, see below.
     * We use an array size of c_pmeNeighborUnitcellCount times the grid size
     * to allow for particles to be out of the triclinic unit-cell.
     */
    const int arraySize = c_pmeNeighborUnitcellCount * n;
    int*      gtl;
    real*     fsh;

    snew(gtl, arraySize);
    snew(fsh, arraySize);

    for (int i = 0; i < arraySize; i++)
    {
        /* Transform global grid index to the local grid index.
         * Our local grid always runs from 0 to local_range-1.
         */
        gtl[i] = (i - local_start + n) % n;
        /* For coordinates that fall within the local grid the fraction
         * is correct, we don't need to shift it.
         */
        fsh[i] = 0;
        /* Check if we are using domain decomposition for PME */
        if (local_range < n)
        {
            /* Due to rounding issues i could be 1 beyond the lower or
             * upper boundary of the local grid. Correct the index for this.
             * If we shift the index, we need to shift the fraction by
             * the same amount in the other direction to not affect
             * the weights.
             * Note that due to this shifting the weights at the end of
             * the spline might change, but that will only involve values
             * between zero and values close to the precision of a real,
             * which is anyhow the accuracy of the whole mesh calculation.
             */
            if (gtl[i] == n - 1)
            {
                /* When this i is used, we should round the local index up */
                gtl[i] = 0;
                fsh[i] = -1;
            }
            else if (gtl[i] == local_range && local_range > 0)
            {
                /* When this i is used, we should round the local index down */
                gtl[i] = local_range - 1;
                fsh[i] = +1;
            }
        }
    }

    *global_to_local = gtl;
    *fraction_shift  = fsh;
}
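
/* Worked example: n = 10, local_start = 4, local_range = 5 (domain
 * decomposition, so local_range < n). Then gtl[i] = (i - 4 + 10) % 10, and
 * the two corrections fire at the boundaries: i = 3 gives gtl = 9 = n - 1,
 * which is rounded up to gtl = 0 with fsh = -1, while i = 9 gives
 * gtl = 5 = local_range, which is rounded down to gtl = 4 with fsh = +1.
 */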
void reuse_pmegrids(const pmegrids_t* oldgrid, pmegrids_t* newgrid)
{
    int d, t;

    for (d = 0; d < DIM; d++)
    {
        /* The new grid has to fit within the already allocated old grid */
        if (newgrid->grid.n[d] > oldgrid->grid.n[d])
        {
            return;
        }
    }

    sfree_aligned(newgrid->grid.grid);
    newgrid->grid.grid = oldgrid->grid.grid;

    if (newgrid->grid_th != nullptr && newgrid->nthread == oldgrid->nthread)
    {
        sfree_aligned(newgrid->grid_all);
        newgrid->grid_all = oldgrid->grid_all;
        for (t = 0; t < newgrid->nthread; t++)
        {
            newgrid->grid_th[t].grid = oldgrid->grid_th[t].grid;
        }
    }
}