2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012,2013, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
38 /* IMPORTANT FOR DEVELOPERS:
40 * Triclinic pme stuff isn't entirely trivial, and we've experienced
41 * some bugs during development (many of them due to me). To avoid
42 * this in the future, please check the following things if you make
43 * changes in this file:
45 * 1. You should obtain identical (at least to the PME precision)
46 * energies, forces, and virial for
47 * a rectangular box and a triclinic one where the z (or y) axis is
48 * tilted a whole box side. For instance you could use these boxes:
50 * rectangular triclinic
55 * 2. You should check the energy conservation in a triclinic box.
57 * It might seem an overkill, but better safe than sorry.
79 #include "gmxcomplex.h"
83 #include "gmx_fatal.h"
89 #include "gmx_wallcycle.h"
90 #include "gmx_parallel_3dfft.h"
92 #include "gmx_cyclecounter.h"
95 /* Include the SIMD macro file and then check for support */
96 #include "gmx_simd_macros.h"
97 #if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_EXP
98 /* Turn on arbitrary width SIMD intrinsics for PME solve */
102 /* Include the 4-wide SIMD macro file */
103 #include "gmx_simd4_macros.h"
104 /* Check if we have 4-wide SIMD macro support */
105 #ifdef GMX_HAVE_SIMD4_MACROS
106 /* Do PME spread and gather with 4-wide SIMD.
107 * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
109 #define PME_SIMD4_SPREAD_GATHER
111 #ifdef GMX_SIMD4_HAVE_UNALIGNED
112 /* With PME-order=4 on x86, unaligned load+store is slightly faster
113 * than doubling all SIMD operations when using aligned load+store.
115 #define PME_SIMD4_UNALIGNED
120 #include "mpelogging.h"
123 /* #define PRT_FORCE */
124 /* conditions for on the fly time-measurement */
125 /* #define TAKETIME (step > 1 && timesteps < 10) */
126 #define TAKETIME FALSE
128 /* #define PME_TIME_THREADS */
131 #define mpi_type MPI_DOUBLE
133 #define mpi_type MPI_FLOAT
136 #ifdef PME_SIMD4_SPREAD_GATHER
137 #define SIMD4_ALIGNMENT (GMX_SIMD4_WIDTH*sizeof(real))
139 /* We can use any alignment, apart from 0, so we use 4 reals */
140 #define SIMD4_ALIGNMENT (4*sizeof(real))
143 /* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
144 * to preserve alignment.
146 #define GMX_CACHE_SEP 64
148 /* We only define a maximum to be able to use local arrays without allocation.
149 * An order larger than 12 should never be needed, even for test cases.
150 * If needed it can be changed here.
152 #define PME_ORDER_MAX 12
154 /* Internal datastructures */
160 int recv_size
; /* Receive buffer width, used with OpenMP */
171 int *send_id
, *recv_id
;
172 int send_size
; /* Send buffer width, used with OpenMP */
173 pme_grid_comm_t
*comm_data
;
179 int *n
; /* Cumulative counts of the number of particles per thread */
180 int nalloc
; /* Allocation size of i */
181 int *i
; /* Particle indices ordered on thread index (n) */
195 int dimind
; /* The index of the dimension, 0=x, 1=y */
202 int *node_dest
; /* The nodes to send x and q to with DD */
203 int *node_src
; /* The nodes to receive x and q from with DD */
204 int *buf_index
; /* Index for commnode into the buffers */
211 int *count
; /* The number of atoms to send to each node */
213 int *rcount
; /* The number of atoms to receive */
220 gmx_bool bSpread
; /* These coordinates are used for spreading */
223 rvec
*fractx
; /* Fractional coordinate relative to the
224 * lower cell boundary
227 int *thread_idx
; /* Which thread should spread which charge */
228 thread_plist_t
*thread_plist
;
229 splinedata_t
*spline
;
236 ivec ci
; /* The spatial location of this grid */
237 ivec n
; /* The used size of *grid, including order-1 */
238 ivec offset
; /* The grid offset from the full node grid */
239 int order
; /* PME spreading order */
240 ivec s
; /* The allocated size of *grid, s >= n */
241 real
*grid
; /* The grid local thread, size n */
245 pmegrid_t grid
; /* The full node grid (non thread-local) */
246 int nthread
; /* The number of threads operating on this grid */
247 ivec nc
; /* The local spatial decomposition over the threads */
248 pmegrid_t
*grid_th
; /* Array of grids for each thread */
249 real
*grid_all
; /* Allocated array for the grids in *grid_th */
250 int **g2t
; /* The grid to thread index */
251 ivec nthread_comm
; /* The number of threads to communicate with */
256 #ifdef PME_SIMD4_SPREAD_GATHER
257 /* Masks for 4-wide SIMD aligned spreading and gathering */
258 gmx_simd4_pb mask_S0
[6], mask_S1
[6];
260 int dummy
; /* C89 requires that struct has at least one member */
265 /* work data for solve_pme */
281 typedef struct gmx_pme
{
282 int ndecompdim
; /* The number of decomposition dimensions */
283 int nodeid
; /* Our nodeid in mpi->mpi_comm */
286 int nnodes
; /* The number of nodes doing PME */
291 MPI_Comm mpi_comm_d
[2]; /* Indexed on dimension, 0=x, 1=y */
293 MPI_Datatype rvec_mpi
; /* the pme vector's MPI type */
296 gmx_bool bUseThreads
; /* Does any of the PME ranks have nthread>1 ? */
297 int nthread
; /* The number of threads doing PME on our rank */
299 gmx_bool bPPnode
; /* Node also does particle-particle forces */
300 gmx_bool bFEP
; /* Compute Free energy contribution */
301 int nkx
, nky
, nkz
; /* Grid dimensions */
302 gmx_bool bP3M
; /* Do P3M: optimize the influence function */
306 pmegrids_t pmegridA
; /* Grids on which we do spreading/interpolation, includes overlap */
308 /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
309 int pmegrid_nx
, pmegrid_ny
, pmegrid_nz
;
310 /* pmegrid_nz might be larger than strictly necessary to ensure
311 * memory alignment, pmegrid_nz_base gives the real base size.
314 /* The local PME grid starting indices */
315 int pmegrid_start_ix
, pmegrid_start_iy
, pmegrid_start_iz
;
317 /* Work data for spreading and gathering */
318 pme_spline_work_t
*spline_work
;
320 real
*fftgridA
; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
321 real
*fftgridB
; /* inside the interpolation grid, but separate for 2D PME decomp. */
322 int fftgrid_nx
, fftgrid_ny
, fftgrid_nz
;
324 t_complex
*cfftgridA
; /* Grids for complex FFT data */
325 t_complex
*cfftgridB
;
326 int cfftgrid_nx
, cfftgrid_ny
, cfftgrid_nz
;
328 gmx_parallel_3dfft_t pfft_setupA
;
329 gmx_parallel_3dfft_t pfft_setupB
;
331 int *nnx
, *nny
, *nnz
;
332 real
*fshx
, *fshy
, *fshz
;
334 pme_atomcomm_t atc
[2]; /* Indexed on decomposition index */
338 pme_overlap_t overlap
[2]; /* Indexed on dimension, 0=x, 1=y */
340 pme_atomcomm_t atc_energy
; /* Only for gmx_pme_calc_energy */
342 rvec
*bufv
; /* Communication buffer */
343 real
*bufr
; /* Communication buffer */
344 int buf_nalloc
; /* The communication buffer size */
346 /* thread local work data for solve_pme */
349 /* Work data for PME_redist */
350 gmx_bool redist_init
;
358 int redist_buf_nalloc
;
360 /* Work data for sum_qgrid */
361 real
* sum_qgrid_tmp
;
362 real
* sum_qgrid_dd_tmp
;
366 static void calc_interpolation_idx(gmx_pme_t pme
, pme_atomcomm_t
*atc
,
367 int start
, int end
, int thread
)
370 int *idxptr
, tix
, tiy
, tiz
;
371 real
*xptr
, *fptr
, tx
, ty
, tz
;
372 real rxx
, ryx
, ryy
, rzx
, rzy
, rzz
;
374 int start_ix
, start_iy
, start_iz
;
375 int *g2tx
, *g2ty
, *g2tz
;
377 int *thread_idx
= NULL
;
378 thread_plist_t
*tpl
= NULL
;
386 start_ix
= pme
->pmegrid_start_ix
;
387 start_iy
= pme
->pmegrid_start_iy
;
388 start_iz
= pme
->pmegrid_start_iz
;
390 rxx
= pme
->recipbox
[XX
][XX
];
391 ryx
= pme
->recipbox
[YY
][XX
];
392 ryy
= pme
->recipbox
[YY
][YY
];
393 rzx
= pme
->recipbox
[ZZ
][XX
];
394 rzy
= pme
->recipbox
[ZZ
][YY
];
395 rzz
= pme
->recipbox
[ZZ
][ZZ
];
397 g2tx
= pme
->pmegridA
.g2t
[XX
];
398 g2ty
= pme
->pmegridA
.g2t
[YY
];
399 g2tz
= pme
->pmegridA
.g2t
[ZZ
];
401 bThreads
= (atc
->nthread
> 1);
404 thread_idx
= atc
->thread_idx
;
406 tpl
= &atc
->thread_plist
[thread
];
408 for (i
= 0; i
< atc
->nthread
; i
++)
414 for (i
= start
; i
< end
; i
++)
417 idxptr
= atc
->idx
[i
];
418 fptr
= atc
->fractx
[i
];
420 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
421 tx
= nx
* ( xptr
[XX
] * rxx
+ xptr
[YY
] * ryx
+ xptr
[ZZ
] * rzx
+ 2.0 );
422 ty
= ny
* ( xptr
[YY
] * ryy
+ xptr
[ZZ
] * rzy
+ 2.0 );
423 tz
= nz
* ( xptr
[ZZ
] * rzz
+ 2.0 );
429 /* Because decomposition only occurs in x and y,
430 * we never have a fraction correction in z.
432 fptr
[XX
] = tx
- tix
+ pme
->fshx
[tix
];
433 fptr
[YY
] = ty
- tiy
+ pme
->fshy
[tiy
];
436 idxptr
[XX
] = pme
->nnx
[tix
];
437 idxptr
[YY
] = pme
->nny
[tiy
];
438 idxptr
[ZZ
] = pme
->nnz
[tiz
];
441 range_check(idxptr
[XX
], 0, pme
->pmegrid_nx
);
442 range_check(idxptr
[YY
], 0, pme
->pmegrid_ny
);
443 range_check(idxptr
[ZZ
], 0, pme
->pmegrid_nz
);
448 thread_i
= g2tx
[idxptr
[XX
]] + g2ty
[idxptr
[YY
]] + g2tz
[idxptr
[ZZ
]];
449 thread_idx
[i
] = thread_i
;
456 /* Make a list of particle indices sorted on thread */
458 /* Get the cumulative count */
459 for (i
= 1; i
< atc
->nthread
; i
++)
461 tpl_n
[i
] += tpl_n
[i
-1];
463 /* The current implementation distributes particles equally
464 * over the threads, so we could actually allocate for that
465 * in pme_realloc_atomcomm_things.
467 if (tpl_n
[atc
->nthread
-1] > tpl
->nalloc
)
469 tpl
->nalloc
= over_alloc_large(tpl_n
[atc
->nthread
-1]);
470 srenew(tpl
->i
, tpl
->nalloc
);
472 /* Set tpl_n to the cumulative start */
473 for (i
= atc
->nthread
-1; i
>= 1; i
--)
475 tpl_n
[i
] = tpl_n
[i
-1];
479 /* Fill our thread local array with indices sorted on thread */
480 for (i
= start
; i
< end
; i
++)
482 tpl
->i
[tpl_n
[atc
->thread_idx
[i
]]++] = i
;
484 /* Now tpl_n contains the cummulative count again */
488 static void make_thread_local_ind(pme_atomcomm_t
*atc
,
489 int thread
, splinedata_t
*spline
)
491 int n
, t
, i
, start
, end
;
494 /* Combine the indices made by each thread into one index */
498 for (t
= 0; t
< atc
->nthread
; t
++)
500 tpl
= &atc
->thread_plist
[t
];
501 /* Copy our part (start - end) from the list of thread t */
504 start
= tpl
->n
[thread
-1];
506 end
= tpl
->n
[thread
];
507 for (i
= start
; i
< end
; i
++)
509 spline
->ind
[n
++] = tpl
->i
[i
];
517 static void pme_calc_pidx(int start
, int end
,
518 matrix recipbox
, rvec x
[],
519 pme_atomcomm_t
*atc
, int *count
)
524 real rxx
, ryx
, rzx
, ryy
, rzy
;
527 /* Calculate PME task index (pidx) for each grid index.
528 * Here we always assign equally sized slabs to each node
529 * for load balancing reasons (the PME grid spacing is not used).
535 /* Reset the count */
536 for (i
= 0; i
< nslab
; i
++)
541 if (atc
->dimind
== 0)
543 rxx
= recipbox
[XX
][XX
];
544 ryx
= recipbox
[YY
][XX
];
545 rzx
= recipbox
[ZZ
][XX
];
546 /* Calculate the node index in x-dimension */
547 for (i
= start
; i
< end
; i
++)
550 /* Fractional coordinates along box vectors */
551 s
= nslab
*(xptr
[XX
]*rxx
+ xptr
[YY
]*ryx
+ xptr
[ZZ
]*rzx
);
552 si
= (int)(s
+ 2*nslab
) % nslab
;
559 ryy
= recipbox
[YY
][YY
];
560 rzy
= recipbox
[ZZ
][YY
];
561 /* Calculate the node index in y-dimension */
562 for (i
= start
; i
< end
; i
++)
565 /* Fractional coordinates along box vectors */
566 s
= nslab
*(xptr
[YY
]*ryy
+ xptr
[ZZ
]*rzy
);
567 si
= (int)(s
+ 2*nslab
) % nslab
;
574 static void pme_calc_pidx_wrapper(int natoms
, matrix recipbox
, rvec x
[],
577 int nthread
, thread
, slab
;
579 nthread
= atc
->nthread
;
581 #pragma omp parallel for num_threads(nthread) schedule(static)
582 for (thread
= 0; thread
< nthread
; thread
++)
584 pme_calc_pidx(natoms
* thread
/nthread
,
585 natoms
*(thread
+1)/nthread
,
586 recipbox
, x
, atc
, atc
->count_thread
[thread
]);
588 /* Non-parallel reduction, since nslab is small */
590 for (thread
= 1; thread
< nthread
; thread
++)
592 for (slab
= 0; slab
< atc
->nslab
; slab
++)
594 atc
->count_thread
[0][slab
] += atc
->count_thread
[thread
][slab
];
599 static void realloc_splinevec(splinevec th
, real
**ptr_z
, int nalloc
)
601 const int padding
= 4;
604 srenew(th
[XX
], nalloc
);
605 srenew(th
[YY
], nalloc
);
606 /* In z we add padding, this is only required for the aligned SIMD code */
607 sfree_aligned(*ptr_z
);
608 snew_aligned(*ptr_z
, nalloc
+2*padding
, SIMD4_ALIGNMENT
);
609 th
[ZZ
] = *ptr_z
+ padding
;
611 for (i
= 0; i
< padding
; i
++)
614 (*ptr_z
)[padding
+nalloc
+i
] = 0;
618 static void pme_realloc_splinedata(splinedata_t
*spline
, pme_atomcomm_t
*atc
)
622 srenew(spline
->ind
, atc
->nalloc
);
623 /* Initialize the index to identity so it works without threads */
624 for (i
= 0; i
< atc
->nalloc
; i
++)
629 realloc_splinevec(spline
->theta
, &spline
->ptr_theta_z
,
630 atc
->pme_order
*atc
->nalloc
);
631 realloc_splinevec(spline
->dtheta
, &spline
->ptr_dtheta_z
,
632 atc
->pme_order
*atc
->nalloc
);
635 static void pme_realloc_atomcomm_things(pme_atomcomm_t
*atc
)
637 int nalloc_old
, i
, j
, nalloc_tpl
;
639 /* We have to avoid a NULL pointer for atc->x to avoid
640 * possible fatal errors in MPI routines.
642 if (atc
->n
> atc
->nalloc
|| atc
->nalloc
== 0)
644 nalloc_old
= atc
->nalloc
;
645 atc
->nalloc
= over_alloc_dd(max(atc
->n
, 1));
649 srenew(atc
->x
, atc
->nalloc
);
650 srenew(atc
->q
, atc
->nalloc
);
651 srenew(atc
->f
, atc
->nalloc
);
652 for (i
= nalloc_old
; i
< atc
->nalloc
; i
++)
654 clear_rvec(atc
->f
[i
]);
659 srenew(atc
->fractx
, atc
->nalloc
);
660 srenew(atc
->idx
, atc
->nalloc
);
662 if (atc
->nthread
> 1)
664 srenew(atc
->thread_idx
, atc
->nalloc
);
667 for (i
= 0; i
< atc
->nthread
; i
++)
669 pme_realloc_splinedata(&atc
->spline
[i
], atc
);
675 static void pmeredist_pd(gmx_pme_t pme
, gmx_bool forw
,
676 int n
, gmx_bool bXF
, rvec
*x_f
, real
*charge
,
678 /* Redistribute particle data for PME calculation */
679 /* domain decomposition by x coordinate */
684 if (FALSE
== pme
->redist_init
)
686 snew(pme
->scounts
, atc
->nslab
);
687 snew(pme
->rcounts
, atc
->nslab
);
688 snew(pme
->sdispls
, atc
->nslab
);
689 snew(pme
->rdispls
, atc
->nslab
);
690 snew(pme
->sidx
, atc
->nslab
);
691 pme
->redist_init
= TRUE
;
693 if (n
> pme
->redist_buf_nalloc
)
695 pme
->redist_buf_nalloc
= over_alloc_dd(n
);
696 srenew(pme
->redist_buf
, pme
->redist_buf_nalloc
*DIM
);
704 /* forward, redistribution from pp to pme */
706 /* Calculate send counts and exchange them with other nodes */
707 for (i
= 0; (i
< atc
->nslab
); i
++)
711 for (i
= 0; (i
< n
); i
++)
713 pme
->scounts
[pme
->idxa
[i
]]++;
715 MPI_Alltoall( pme
->scounts
, 1, MPI_INT
, pme
->rcounts
, 1, MPI_INT
, atc
->mpi_comm
);
717 /* Calculate send and receive displacements and index into send
722 for (i
= 1; i
< atc
->nslab
; i
++)
724 pme
->sdispls
[i
] = pme
->sdispls
[i
-1]+pme
->scounts
[i
-1];
725 pme
->rdispls
[i
] = pme
->rdispls
[i
-1]+pme
->rcounts
[i
-1];
726 pme
->sidx
[i
] = pme
->sdispls
[i
];
728 /* Total # of particles to be received */
729 atc
->n
= pme
->rdispls
[atc
->nslab
-1] + pme
->rcounts
[atc
->nslab
-1];
731 pme_realloc_atomcomm_things(atc
);
733 /* Copy particle coordinates into send buffer and exchange*/
734 for (i
= 0; (i
< n
); i
++)
736 ii
= DIM
*pme
->sidx
[pme
->idxa
[i
]];
737 pme
->sidx
[pme
->idxa
[i
]]++;
738 pme
->redist_buf
[ii
+XX
] = x_f
[i
][XX
];
739 pme
->redist_buf
[ii
+YY
] = x_f
[i
][YY
];
740 pme
->redist_buf
[ii
+ZZ
] = x_f
[i
][ZZ
];
742 MPI_Alltoallv(pme
->redist_buf
, pme
->scounts
, pme
->sdispls
,
743 pme
->rvec_mpi
, atc
->x
, pme
->rcounts
, pme
->rdispls
,
744 pme
->rvec_mpi
, atc
->mpi_comm
);
748 /* Copy charge into send buffer and exchange*/
749 for (i
= 0; i
< atc
->nslab
; i
++)
751 pme
->sidx
[i
] = pme
->sdispls
[i
];
753 for (i
= 0; (i
< n
); i
++)
755 ii
= pme
->sidx
[pme
->idxa
[i
]];
756 pme
->sidx
[pme
->idxa
[i
]]++;
757 pme
->redist_buf
[ii
] = charge
[i
];
759 MPI_Alltoallv(pme
->redist_buf
, pme
->scounts
, pme
->sdispls
, mpi_type
,
760 atc
->q
, pme
->rcounts
, pme
->rdispls
, mpi_type
,
763 else /* backward, redistribution from pme to pp */
765 MPI_Alltoallv(atc
->f
, pme
->rcounts
, pme
->rdispls
, pme
->rvec_mpi
,
766 pme
->redist_buf
, pme
->scounts
, pme
->sdispls
,
767 pme
->rvec_mpi
, atc
->mpi_comm
);
769 /* Copy data from receive buffer */
770 for (i
= 0; i
< atc
->nslab
; i
++)
772 pme
->sidx
[i
] = pme
->sdispls
[i
];
774 for (i
= 0; (i
< n
); i
++)
776 ii
= DIM
*pme
->sidx
[pme
->idxa
[i
]];
777 x_f
[i
][XX
] += pme
->redist_buf
[ii
+XX
];
778 x_f
[i
][YY
] += pme
->redist_buf
[ii
+YY
];
779 x_f
[i
][ZZ
] += pme
->redist_buf
[ii
+ZZ
];
780 pme
->sidx
[pme
->idxa
[i
]]++;
786 static void pme_dd_sendrecv(pme_atomcomm_t
*atc
,
787 gmx_bool bBackward
, int shift
,
788 void *buf_s
, int nbyte_s
,
789 void *buf_r
, int nbyte_r
)
795 if (bBackward
== FALSE
)
797 dest
= atc
->node_dest
[shift
];
798 src
= atc
->node_src
[shift
];
802 dest
= atc
->node_src
[shift
];
803 src
= atc
->node_dest
[shift
];
806 if (nbyte_s
> 0 && nbyte_r
> 0)
808 MPI_Sendrecv(buf_s
, nbyte_s
, MPI_BYTE
,
810 buf_r
, nbyte_r
, MPI_BYTE
,
812 atc
->mpi_comm
, &stat
);
814 else if (nbyte_s
> 0)
816 MPI_Send(buf_s
, nbyte_s
, MPI_BYTE
,
820 else if (nbyte_r
> 0)
822 MPI_Recv(buf_r
, nbyte_r
, MPI_BYTE
,
824 atc
->mpi_comm
, &stat
);
829 static void dd_pmeredist_x_q(gmx_pme_t pme
,
830 int n
, gmx_bool bX
, rvec
*x
, real
*charge
,
833 int *commnode
, *buf_index
;
834 int nnodes_comm
, i
, nsend
, local_pos
, buf_pos
, node
, scount
, rcount
;
836 commnode
= atc
->node_dest
;
837 buf_index
= atc
->buf_index
;
839 nnodes_comm
= min(2*atc
->maxshift
, atc
->nslab
-1);
842 for (i
= 0; i
< nnodes_comm
; i
++)
844 buf_index
[commnode
[i
]] = nsend
;
845 nsend
+= atc
->count
[commnode
[i
]];
849 if (atc
->count
[atc
->nodeid
] + nsend
!= n
)
851 gmx_fatal(FARGS
, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
852 "This usually means that your system is not well equilibrated.",
853 n
- (atc
->count
[atc
->nodeid
] + nsend
),
854 pme
->nodeid
, 'x'+atc
->dimind
);
857 if (nsend
> pme
->buf_nalloc
)
859 pme
->buf_nalloc
= over_alloc_dd(nsend
);
860 srenew(pme
->bufv
, pme
->buf_nalloc
);
861 srenew(pme
->bufr
, pme
->buf_nalloc
);
864 atc
->n
= atc
->count
[atc
->nodeid
];
865 for (i
= 0; i
< nnodes_comm
; i
++)
867 scount
= atc
->count
[commnode
[i
]];
868 /* Communicate the count */
871 fprintf(debug
, "dimind %d PME node %d send to node %d: %d\n",
872 atc
->dimind
, atc
->nodeid
, commnode
[i
], scount
);
874 pme_dd_sendrecv(atc
, FALSE
, i
,
875 &scount
, sizeof(int),
876 &atc
->rcount
[i
], sizeof(int));
877 atc
->n
+= atc
->rcount
[i
];
880 pme_realloc_atomcomm_things(atc
);
884 for (i
= 0; i
< n
; i
++)
887 if (node
== atc
->nodeid
)
889 /* Copy direct to the receive buffer */
892 copy_rvec(x
[i
], atc
->x
[local_pos
]);
894 atc
->q
[local_pos
] = charge
[i
];
899 /* Copy to the send buffer */
902 copy_rvec(x
[i
], pme
->bufv
[buf_index
[node
]]);
904 pme
->bufr
[buf_index
[node
]] = charge
[i
];
910 for (i
= 0; i
< nnodes_comm
; i
++)
912 scount
= atc
->count
[commnode
[i
]];
913 rcount
= atc
->rcount
[i
];
914 if (scount
> 0 || rcount
> 0)
918 /* Communicate the coordinates */
919 pme_dd_sendrecv(atc
, FALSE
, i
,
920 pme
->bufv
[buf_pos
], scount
*sizeof(rvec
),
921 atc
->x
[local_pos
], rcount
*sizeof(rvec
));
923 /* Communicate the charges */
924 pme_dd_sendrecv(atc
, FALSE
, i
,
925 pme
->bufr
+buf_pos
, scount
*sizeof(real
),
926 atc
->q
+local_pos
, rcount
*sizeof(real
));
928 local_pos
+= atc
->rcount
[i
];
933 static void dd_pmeredist_f(gmx_pme_t pme
, pme_atomcomm_t
*atc
,
937 int *commnode
, *buf_index
;
938 int nnodes_comm
, local_pos
, buf_pos
, i
, scount
, rcount
, node
;
940 commnode
= atc
->node_dest
;
941 buf_index
= atc
->buf_index
;
943 nnodes_comm
= min(2*atc
->maxshift
, atc
->nslab
-1);
945 local_pos
= atc
->count
[atc
->nodeid
];
947 for (i
= 0; i
< nnodes_comm
; i
++)
949 scount
= atc
->rcount
[i
];
950 rcount
= atc
->count
[commnode
[i
]];
951 if (scount
> 0 || rcount
> 0)
953 /* Communicate the forces */
954 pme_dd_sendrecv(atc
, TRUE
, i
,
955 atc
->f
[local_pos
], scount
*sizeof(rvec
),
956 pme
->bufv
[buf_pos
], rcount
*sizeof(rvec
));
959 buf_index
[commnode
[i
]] = buf_pos
;
966 for (i
= 0; i
< n
; i
++)
969 if (node
== atc
->nodeid
)
971 /* Add from the local force array */
972 rvec_inc(f
[i
], atc
->f
[local_pos
]);
977 /* Add from the receive buffer */
978 rvec_inc(f
[i
], pme
->bufv
[buf_index
[node
]]);
985 for (i
= 0; i
< n
; i
++)
988 if (node
== atc
->nodeid
)
990 /* Copy from the local force array */
991 copy_rvec(atc
->f
[local_pos
], f
[i
]);
996 /* Copy from the receive buffer */
997 copy_rvec(pme
->bufv
[buf_index
[node
]], f
[i
]);
1006 gmx_sum_qgrid_dd(gmx_pme_t pme
, real
*grid
, int direction
)
1008 pme_overlap_t
*overlap
;
1009 int send_index0
, send_nindex
;
1010 int recv_index0
, recv_nindex
;
1012 int i
, j
, k
, ix
, iy
, iz
, icnt
;
1013 int ipulse
, send_id
, recv_id
, datasize
;
1015 real
*sendptr
, *recvptr
;
1017 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
1018 overlap
= &pme
->overlap
[1];
1020 for (ipulse
= 0; ipulse
< overlap
->noverlap_nodes
; ipulse
++)
1022 /* Since we have already (un)wrapped the overlap in the z-dimension,
1023 * we only have to communicate 0 to nkz (not pmegrid_nz).
1025 if (direction
== GMX_SUM_QGRID_FORWARD
)
1027 send_id
= overlap
->send_id
[ipulse
];
1028 recv_id
= overlap
->recv_id
[ipulse
];
1029 send_index0
= overlap
->comm_data
[ipulse
].send_index0
;
1030 send_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
1031 recv_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
1032 recv_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
1036 send_id
= overlap
->recv_id
[ipulse
];
1037 recv_id
= overlap
->send_id
[ipulse
];
1038 send_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
1039 send_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
1040 recv_index0
= overlap
->comm_data
[ipulse
].send_index0
;
1041 recv_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
1044 /* Copy data to contiguous send buffer */
1047 fprintf(debug
, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1048 pme
->nodeid
, overlap
->nodeid
, send_id
,
1049 pme
->pmegrid_start_iy
,
1050 send_index0
-pme
->pmegrid_start_iy
,
1051 send_index0
-pme
->pmegrid_start_iy
+send_nindex
);
1054 for (i
= 0; i
< pme
->pmegrid_nx
; i
++)
1057 for (j
= 0; j
< send_nindex
; j
++)
1059 iy
= j
+ send_index0
- pme
->pmegrid_start_iy
;
1060 for (k
= 0; k
< pme
->nkz
; k
++)
1063 overlap
->sendbuf
[icnt
++] = grid
[ix
*(pme
->pmegrid_ny
*pme
->pmegrid_nz
)+iy
*(pme
->pmegrid_nz
)+iz
];
1068 datasize
= pme
->pmegrid_nx
* pme
->nkz
;
1070 MPI_Sendrecv(overlap
->sendbuf
, send_nindex
*datasize
, GMX_MPI_REAL
,
1072 overlap
->recvbuf
, recv_nindex
*datasize
, GMX_MPI_REAL
,
1074 overlap
->mpi_comm
, &stat
);
1076 /* Get data from contiguous recv buffer */
1079 fprintf(debug
, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1080 pme
->nodeid
, overlap
->nodeid
, recv_id
,
1081 pme
->pmegrid_start_iy
,
1082 recv_index0
-pme
->pmegrid_start_iy
,
1083 recv_index0
-pme
->pmegrid_start_iy
+recv_nindex
);
1086 for (i
= 0; i
< pme
->pmegrid_nx
; i
++)
1089 for (j
= 0; j
< recv_nindex
; j
++)
1091 iy
= j
+ recv_index0
- pme
->pmegrid_start_iy
;
1092 for (k
= 0; k
< pme
->nkz
; k
++)
1095 if (direction
== GMX_SUM_QGRID_FORWARD
)
1097 grid
[ix
*(pme
->pmegrid_ny
*pme
->pmegrid_nz
)+iy
*(pme
->pmegrid_nz
)+iz
] += overlap
->recvbuf
[icnt
++];
1101 grid
[ix
*(pme
->pmegrid_ny
*pme
->pmegrid_nz
)+iy
*(pme
->pmegrid_nz
)+iz
] = overlap
->recvbuf
[icnt
++];
1108 /* Major dimension is easier, no copying required,
1109 * but we might have to sum to separate array.
1110 * Since we don't copy, we have to communicate up to pmegrid_nz,
1111 * not nkz as for the minor direction.
1113 overlap
= &pme
->overlap
[0];
1115 for (ipulse
= 0; ipulse
< overlap
->noverlap_nodes
; ipulse
++)
1117 if (direction
== GMX_SUM_QGRID_FORWARD
)
1119 send_id
= overlap
->send_id
[ipulse
];
1120 recv_id
= overlap
->recv_id
[ipulse
];
1121 send_index0
= overlap
->comm_data
[ipulse
].send_index0
;
1122 send_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
1123 recv_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
1124 recv_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
1125 recvptr
= overlap
->recvbuf
;
1129 send_id
= overlap
->recv_id
[ipulse
];
1130 recv_id
= overlap
->send_id
[ipulse
];
1131 send_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
1132 send_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
1133 recv_index0
= overlap
->comm_data
[ipulse
].send_index0
;
1134 recv_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
1135 recvptr
= grid
+ (recv_index0
-pme
->pmegrid_start_ix
)*(pme
->pmegrid_ny
*pme
->pmegrid_nz
);
1138 sendptr
= grid
+ (send_index0
-pme
->pmegrid_start_ix
)*(pme
->pmegrid_ny
*pme
->pmegrid_nz
);
1139 datasize
= pme
->pmegrid_ny
* pme
->pmegrid_nz
;
1143 fprintf(debug
, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1144 pme
->nodeid
, overlap
->nodeid
, send_id
,
1145 pme
->pmegrid_start_ix
,
1146 send_index0
-pme
->pmegrid_start_ix
,
1147 send_index0
-pme
->pmegrid_start_ix
+send_nindex
);
1148 fprintf(debug
, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1149 pme
->nodeid
, overlap
->nodeid
, recv_id
,
1150 pme
->pmegrid_start_ix
,
1151 recv_index0
-pme
->pmegrid_start_ix
,
1152 recv_index0
-pme
->pmegrid_start_ix
+recv_nindex
);
1155 MPI_Sendrecv(sendptr
, send_nindex
*datasize
, GMX_MPI_REAL
,
1157 recvptr
, recv_nindex
*datasize
, GMX_MPI_REAL
,
1159 overlap
->mpi_comm
, &stat
);
1161 /* ADD data from contiguous recv buffer */
1162 if (direction
== GMX_SUM_QGRID_FORWARD
)
1164 p
= grid
+ (recv_index0
-pme
->pmegrid_start_ix
)*(pme
->pmegrid_ny
*pme
->pmegrid_nz
);
1165 for (i
= 0; i
< recv_nindex
*datasize
; i
++)
1167 p
[i
] += overlap
->recvbuf
[i
];
1176 copy_pmegrid_to_fftgrid(gmx_pme_t pme
, real
*pmegrid
, real
*fftgrid
)
1178 ivec local_fft_ndata
, local_fft_offset
, local_fft_size
;
1179 ivec local_pme_size
;
1183 /* Dimensions should be identical for A/B grid, so we just use A here */
1184 gmx_parallel_3dfft_real_limits(pme
->pfft_setupA
,
1189 local_pme_size
[0] = pme
->pmegrid_nx
;
1190 local_pme_size
[1] = pme
->pmegrid_ny
;
1191 local_pme_size
[2] = pme
->pmegrid_nz
;
1193 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1194 the offset is identical, and the PME grid always has more data (due to overlap)
1199 char fn
[STRLEN
], format
[STRLEN
];
1201 sprintf(fn
, "pmegrid%d.pdb", pme
->nodeid
);
1202 fp
= ffopen(fn
, "w");
1203 sprintf(fn
, "pmegrid%d.txt", pme
->nodeid
);
1204 fp2
= ffopen(fn
, "w");
1205 sprintf(format
, "%s%s\n", pdbformat
, "%6.2f%6.2f");
1208 for (ix
= 0; ix
< local_fft_ndata
[XX
]; ix
++)
1210 for (iy
= 0; iy
< local_fft_ndata
[YY
]; iy
++)
1212 for (iz
= 0; iz
< local_fft_ndata
[ZZ
]; iz
++)
1214 pmeidx
= ix
*(local_pme_size
[YY
]*local_pme_size
[ZZ
])+iy
*(local_pme_size
[ZZ
])+iz
;
1215 fftidx
= ix
*(local_fft_size
[YY
]*local_fft_size
[ZZ
])+iy
*(local_fft_size
[ZZ
])+iz
;
1216 fftgrid
[fftidx
] = pmegrid
[pmeidx
];
1218 val
= 100*pmegrid
[pmeidx
];
1219 if (pmegrid
[pmeidx
] != 0)
1221 fprintf(fp
, format
, "ATOM", pmeidx
, "CA", "GLY", ' ', pmeidx
, ' ',
1222 5.0*ix
, 5.0*iy
, 5.0*iz
, 1.0, val
);
1224 if (pmegrid
[pmeidx
] != 0)
1226 fprintf(fp2
, "%-12s %5d %5d %5d %12.5e\n",
1228 pme
->pmegrid_start_ix
+ ix
,
1229 pme
->pmegrid_start_iy
+ iy
,
1230 pme
->pmegrid_start_iz
+ iz
,
1246 static gmx_cycles_t
omp_cyc_start()
1248 return gmx_cycles_read();
1251 static gmx_cycles_t
omp_cyc_end(gmx_cycles_t c
)
1253 return gmx_cycles_read() - c
;
1258 copy_fftgrid_to_pmegrid(gmx_pme_t pme
, const real
*fftgrid
, real
*pmegrid
,
1259 int nthread
, int thread
)
1261 ivec local_fft_ndata
, local_fft_offset
, local_fft_size
;
1262 ivec local_pme_size
;
1263 int ixy0
, ixy1
, ixy
, ix
, iy
, iz
;
1265 #ifdef PME_TIME_THREADS
1267 static double cs1
= 0;
1271 #ifdef PME_TIME_THREADS
1272 c1
= omp_cyc_start();
1274 /* Dimensions should be identical for A/B grid, so we just use A here */
1275 gmx_parallel_3dfft_real_limits(pme
->pfft_setupA
,
1280 local_pme_size
[0] = pme
->pmegrid_nx
;
1281 local_pme_size
[1] = pme
->pmegrid_ny
;
1282 local_pme_size
[2] = pme
->pmegrid_nz
;
1284 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1285 the offset is identical, and the PME grid always has more data (due to overlap)
1287 ixy0
= ((thread
)*local_fft_ndata
[XX
]*local_fft_ndata
[YY
])/nthread
;
1288 ixy1
= ((thread
+1)*local_fft_ndata
[XX
]*local_fft_ndata
[YY
])/nthread
;
1290 for (ixy
= ixy0
; ixy
< ixy1
; ixy
++)
1292 ix
= ixy
/local_fft_ndata
[YY
];
1293 iy
= ixy
- ix
*local_fft_ndata
[YY
];
1295 pmeidx
= (ix
*local_pme_size
[YY
] + iy
)*local_pme_size
[ZZ
];
1296 fftidx
= (ix
*local_fft_size
[YY
] + iy
)*local_fft_size
[ZZ
];
1297 for (iz
= 0; iz
< local_fft_ndata
[ZZ
]; iz
++)
1299 pmegrid
[pmeidx
+iz
] = fftgrid
[fftidx
+iz
];
1303 #ifdef PME_TIME_THREADS
1304 c1
= omp_cyc_end(c1
);
1309 printf("copy %.2f\n", cs1
*1e-9);
1318 wrap_periodic_pmegrid(gmx_pme_t pme
, real
*pmegrid
)
1320 int nx
, ny
, nz
, pnx
, pny
, pnz
, ny_x
, overlap
, ix
, iy
, iz
;
1326 pnx
= pme
->pmegrid_nx
;
1327 pny
= pme
->pmegrid_ny
;
1328 pnz
= pme
->pmegrid_nz
;
1330 overlap
= pme
->pme_order
- 1;
1332 /* Add periodic overlap in z */
1333 for (ix
= 0; ix
< pme
->pmegrid_nx
; ix
++)
1335 for (iy
= 0; iy
< pme
->pmegrid_ny
; iy
++)
1337 for (iz
= 0; iz
< overlap
; iz
++)
1339 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
] +=
1340 pmegrid
[(ix
*pny
+iy
)*pnz
+nz
+iz
];
1345 if (pme
->nnodes_minor
== 1)
1347 for (ix
= 0; ix
< pme
->pmegrid_nx
; ix
++)
1349 for (iy
= 0; iy
< overlap
; iy
++)
1351 for (iz
= 0; iz
< nz
; iz
++)
1353 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
] +=
1354 pmegrid
[(ix
*pny
+ny
+iy
)*pnz
+iz
];
1360 if (pme
->nnodes_major
== 1)
1362 ny_x
= (pme
->nnodes_minor
== 1 ? ny
: pme
->pmegrid_ny
);
1364 for (ix
= 0; ix
< overlap
; ix
++)
1366 for (iy
= 0; iy
< ny_x
; iy
++)
1368 for (iz
= 0; iz
< nz
; iz
++)
1370 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
] +=
1371 pmegrid
[((nx
+ix
)*pny
+iy
)*pnz
+iz
];
1380 unwrap_periodic_pmegrid(gmx_pme_t pme
, real
*pmegrid
)
1382 int nx
, ny
, nz
, pnx
, pny
, pnz
, ny_x
, overlap
, ix
;
1388 pnx
= pme
->pmegrid_nx
;
1389 pny
= pme
->pmegrid_ny
;
1390 pnz
= pme
->pmegrid_nz
;
1392 overlap
= pme
->pme_order
- 1;
1394 if (pme
->nnodes_major
== 1)
1396 ny_x
= (pme
->nnodes_minor
== 1 ? ny
: pme
->pmegrid_ny
);
1398 for (ix
= 0; ix
< overlap
; ix
++)
1402 for (iy
= 0; iy
< ny_x
; iy
++)
1404 for (iz
= 0; iz
< nz
; iz
++)
1406 pmegrid
[((nx
+ix
)*pny
+iy
)*pnz
+iz
] =
1407 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
];
1413 if (pme
->nnodes_minor
== 1)
1415 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1416 for (ix
= 0; ix
< pme
->pmegrid_nx
; ix
++)
1420 for (iy
= 0; iy
< overlap
; iy
++)
1422 for (iz
= 0; iz
< nz
; iz
++)
1424 pmegrid
[(ix
*pny
+ny
+iy
)*pnz
+iz
] =
1425 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
];
1431 /* Copy periodic overlap in z */
1432 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1433 for (ix
= 0; ix
< pme
->pmegrid_nx
; ix
++)
1437 for (iy
= 0; iy
< pme
->pmegrid_ny
; iy
++)
1439 for (iz
= 0; iz
< overlap
; iz
++)
1441 pmegrid
[(ix
*pny
+iy
)*pnz
+nz
+iz
] =
1442 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
];
1448 static void clear_grid(int nx
, int ny
, int nz
, real
*grid
,
1450 int fx
, int fy
, int fz
,
1454 int fsx
, fsy
, fsz
, gx
, gy
, gz
, g0x
, g0y
, x
, y
, z
;
1457 nc
= 2 + (order
- 2)/FLBS
;
1458 ncz
= 2 + (order
- 2)/FLBSZ
;
1460 for (fsx
= fx
; fsx
< fx
+nc
; fsx
++)
1462 for (fsy
= fy
; fsy
< fy
+nc
; fsy
++)
1464 for (fsz
= fz
; fsz
< fz
+ncz
; fsz
++)
1466 flind
= (fsx
*fs
[YY
] + fsy
)*fs
[ZZ
] + fsz
;
1467 if (flag
[flind
] == 0)
1472 g0x
= (gx
*ny
+ gy
)*nz
+ gz
;
1473 for (x
= 0; x
< FLBS
; x
++)
1476 for (y
= 0; y
< FLBS
; y
++)
1478 for (z
= 0; z
< FLBSZ
; z
++)
1494 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1495 #define DO_BSPLINE(order) \
1496 for (ithx = 0; (ithx < order); ithx++) \
1498 index_x = (i0+ithx)*pny*pnz; \
1499 valx = qn*thx[ithx]; \
1501 for (ithy = 0; (ithy < order); ithy++) \
1503 valxy = valx*thy[ithy]; \
1504 index_xy = index_x+(j0+ithy)*pnz; \
1506 for (ithz = 0; (ithz < order); ithz++) \
1508 index_xyz = index_xy+(k0+ithz); \
1509 grid[index_xyz] += valxy*thz[ithz]; \
1515 static void spread_q_bsplines_thread(pmegrid_t
*pmegrid
,
1516 pme_atomcomm_t
*atc
, splinedata_t
*spline
,
1517 pme_spline_work_t
*work
)
1520 /* spread charges from home atoms to local grid */
1523 int b
, i
, nn
, n
, ithx
, ithy
, ithz
, i0
, j0
, k0
;
1525 int order
, norder
, index_x
, index_xy
, index_xyz
;
1526 real valx
, valxy
, qn
;
1527 real
*thx
, *thy
, *thz
;
1528 int localsize
, bndsize
;
1529 int pnx
, pny
, pnz
, ndatatot
;
1530 int offx
, offy
, offz
;
1532 #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
1533 real thz_buffer
[12], *thz_aligned
;
1535 thz_aligned
= gmx_simd4_align_real(thz_buffer
);
1538 pnx
= pmegrid
->s
[XX
];
1539 pny
= pmegrid
->s
[YY
];
1540 pnz
= pmegrid
->s
[ZZ
];
1542 offx
= pmegrid
->offset
[XX
];
1543 offy
= pmegrid
->offset
[YY
];
1544 offz
= pmegrid
->offset
[ZZ
];
1546 ndatatot
= pnx
*pny
*pnz
;
1547 grid
= pmegrid
->grid
;
1548 for (i
= 0; i
< ndatatot
; i
++)
1553 order
= pmegrid
->order
;
1555 for (nn
= 0; nn
< spline
->n
; nn
++)
1557 n
= spline
->ind
[nn
];
1562 idxptr
= atc
->idx
[n
];
1565 i0
= idxptr
[XX
] - offx
;
1566 j0
= idxptr
[YY
] - offy
;
1567 k0
= idxptr
[ZZ
] - offz
;
1569 thx
= spline
->theta
[XX
] + norder
;
1570 thy
= spline
->theta
[YY
] + norder
;
1571 thz
= spline
->theta
[ZZ
] + norder
;
1576 #ifdef PME_SIMD4_SPREAD_GATHER
1577 #ifdef PME_SIMD4_UNALIGNED
1578 #define PME_SPREAD_SIMD4_ORDER4
1580 #define PME_SPREAD_SIMD4_ALIGNED
1583 #include "pme_simd4.h"
1589 #ifdef PME_SIMD4_SPREAD_GATHER
1590 #define PME_SPREAD_SIMD4_ALIGNED
1592 #include "pme_simd4.h"
1605 static void set_grid_alignment(int *pmegrid_nz
, int pme_order
)
1607 #ifdef PME_SIMD4_SPREAD_GATHER
1609 #ifndef PME_SIMD4_UNALIGNED
1614 /* Round nz up to a multiple of 4 to ensure alignment */
1615 *pmegrid_nz
= ((*pmegrid_nz
+ 3) & ~3);
1620 static void set_gridsize_alignment(int *gridsize
, int pme_order
)
1622 #ifdef PME_SIMD4_SPREAD_GATHER
1623 #ifndef PME_SIMD4_UNALIGNED
1626 /* Add extra elements to ensured aligned operations do not go
1627 * beyond the allocated grid size.
1628 * Note that for pme_order=5, the pme grid z-size alignment
1629 * ensures that we will not go beyond the grid size.
1637 static void pmegrid_init(pmegrid_t
*grid
,
1638 int cx
, int cy
, int cz
,
1639 int x0
, int y0
, int z0
,
1640 int x1
, int y1
, int z1
,
1641 gmx_bool set_alignment
,
1650 grid
->offset
[XX
] = x0
;
1651 grid
->offset
[YY
] = y0
;
1652 grid
->offset
[ZZ
] = z0
;
1653 grid
->n
[XX
] = x1
- x0
+ pme_order
- 1;
1654 grid
->n
[YY
] = y1
- y0
+ pme_order
- 1;
1655 grid
->n
[ZZ
] = z1
- z0
+ pme_order
- 1;
1656 copy_ivec(grid
->n
, grid
->s
);
1659 set_grid_alignment(&nz
, pme_order
);
1664 else if (nz
!= grid
->s
[ZZ
])
1666 gmx_incons("pmegrid_init call with an unaligned z size");
1669 grid
->order
= pme_order
;
1672 gridsize
= grid
->s
[XX
]*grid
->s
[YY
]*grid
->s
[ZZ
];
1673 set_gridsize_alignment(&gridsize
, pme_order
);
1674 snew_aligned(grid
->grid
, gridsize
, SIMD4_ALIGNMENT
);
1682 static int div_round_up(int enumerator
, int denominator
)
1684 return (enumerator
+ denominator
- 1)/denominator
;
1687 static void make_subgrid_division(const ivec n
, int ovl
, int nthread
,
1690 int gsize_opt
, gsize
;
1695 for (nsx
= 1; nsx
<= nthread
; nsx
++)
1697 if (nthread
% nsx
== 0)
1699 for (nsy
= 1; nsy
<= nthread
; nsy
++)
1701 if (nsx
*nsy
<= nthread
&& nthread
% (nsx
*nsy
) == 0)
1703 nsz
= nthread
/(nsx
*nsy
);
1705 /* Determine the number of grid points per thread */
1707 (div_round_up(n
[XX
], nsx
) + ovl
)*
1708 (div_round_up(n
[YY
], nsy
) + ovl
)*
1709 (div_round_up(n
[ZZ
], nsz
) + ovl
);
1711 /* Minimize the number of grids points per thread
1712 * and, secondarily, the number of cuts in minor dimensions.
1714 if (gsize_opt
== -1 ||
1715 gsize
< gsize_opt
||
1716 (gsize
== gsize_opt
&&
1717 (nsz
< nsub
[ZZ
] || (nsz
== nsub
[ZZ
] && nsy
< nsub
[YY
]))))
1729 env
= getenv("GMX_PME_THREAD_DIVISION");
1732 sscanf(env
, "%d %d %d", &nsub
[XX
], &nsub
[YY
], &nsub
[ZZ
]);
1735 if (nsub
[XX
]*nsub
[YY
]*nsub
[ZZ
] != nthread
)
1737 gmx_fatal(FARGS
, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub
[XX
], nsub
[YY
], nsub
[ZZ
], nthread
);
1741 static void pmegrids_init(pmegrids_t
*grids
,
1742 int nx
, int ny
, int nz
, int nz_base
,
1744 gmx_bool bUseThreads
,
1749 ivec n
, n_base
, g0
, g1
;
1750 int t
, x
, y
, z
, d
, i
, tfac
;
1751 int max_comm_lines
= -1;
1753 n
[XX
] = nx
- (pme_order
- 1);
1754 n
[YY
] = ny
- (pme_order
- 1);
1755 n
[ZZ
] = nz
- (pme_order
- 1);
1757 copy_ivec(n
, n_base
);
1758 n_base
[ZZ
] = nz_base
;
1760 pmegrid_init(&grids
->grid
, 0, 0, 0, 0, 0, 0, n
[XX
], n
[YY
], n
[ZZ
], FALSE
, pme_order
,
1763 grids
->nthread
= nthread
;
1765 make_subgrid_division(n_base
, pme_order
-1, grids
->nthread
, grids
->nc
);
1772 for (d
= 0; d
< DIM
; d
++)
1774 nst
[d
] = div_round_up(n
[d
], grids
->nc
[d
]) + pme_order
- 1;
1776 set_grid_alignment(&nst
[ZZ
], pme_order
);
1780 fprintf(debug
, "pmegrid thread local division: %d x %d x %d\n",
1781 grids
->nc
[XX
], grids
->nc
[YY
], grids
->nc
[ZZ
]);
1782 fprintf(debug
, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
1784 nst
[XX
], nst
[YY
], nst
[ZZ
]);
1787 snew(grids
->grid_th
, grids
->nthread
);
1789 gridsize
= nst
[XX
]*nst
[YY
]*nst
[ZZ
];
1790 set_gridsize_alignment(&gridsize
, pme_order
);
1791 snew_aligned(grids
->grid_all
,
1792 grids
->nthread
*gridsize
+(grids
->nthread
+1)*GMX_CACHE_SEP
,
1795 for (x
= 0; x
< grids
->nc
[XX
]; x
++)
1797 for (y
= 0; y
< grids
->nc
[YY
]; y
++)
1799 for (z
= 0; z
< grids
->nc
[ZZ
]; z
++)
1801 pmegrid_init(&grids
->grid_th
[t
],
1803 (n
[XX
]*(x
))/grids
->nc
[XX
],
1804 (n
[YY
]*(y
))/grids
->nc
[YY
],
1805 (n
[ZZ
]*(z
))/grids
->nc
[ZZ
],
1806 (n
[XX
]*(x
+1))/grids
->nc
[XX
],
1807 (n
[YY
]*(y
+1))/grids
->nc
[YY
],
1808 (n
[ZZ
]*(z
+1))/grids
->nc
[ZZ
],
1811 grids
->grid_all
+GMX_CACHE_SEP
+t
*(gridsize
+GMX_CACHE_SEP
));
1819 grids
->grid_th
= NULL
;
1822 snew(grids
->g2t
, DIM
);
1824 for (d
= DIM
-1; d
>= 0; d
--)
1826 snew(grids
->g2t
[d
], n
[d
]);
1828 for (i
= 0; i
< n
[d
]; i
++)
1830 /* The second check should match the parameters
1831 * of the pmegrid_init call above.
1833 while (t
+ 1 < grids
->nc
[d
] && i
>= (n
[d
]*(t
+1))/grids
->nc
[d
])
1837 grids
->g2t
[d
][i
] = t
*tfac
;
1840 tfac
*= grids
->nc
[d
];
1844 case XX
: max_comm_lines
= overlap_x
; break;
1845 case YY
: max_comm_lines
= overlap_y
; break;
1846 case ZZ
: max_comm_lines
= pme_order
- 1; break;
1848 grids
->nthread_comm
[d
] = 0;
1849 while ((n
[d
]*grids
->nthread_comm
[d
])/grids
->nc
[d
] < max_comm_lines
&&
1850 grids
->nthread_comm
[d
] < grids
->nc
[d
])
1852 grids
->nthread_comm
[d
]++;
1856 fprintf(debug
, "pmegrid thread grid communication range in %c: %d\n",
1857 'x'+d
, grids
->nthread_comm
[d
]);
1859 /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
1860 * work, but this is not a problematic restriction.
1862 if (grids
->nc
[d
] > 1 && grids
->nthread_comm
[d
] > grids
->nc
[d
])
1864 gmx_fatal(FARGS
, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids
->nthread
);
1870 static void pmegrids_destroy(pmegrids_t
*grids
)
1874 if (grids
->grid
.grid
!= NULL
)
1876 sfree(grids
->grid
.grid
);
1878 if (grids
->nthread
> 0)
1880 for (t
= 0; t
< grids
->nthread
; t
++)
1882 sfree(grids
->grid_th
[t
].grid
);
1884 sfree(grids
->grid_th
);
1890 static void realloc_work(pme_work_t
*work
, int nkx
)
1894 if (nkx
> work
->nalloc
)
1897 srenew(work
->mhx
, work
->nalloc
);
1898 srenew(work
->mhy
, work
->nalloc
);
1899 srenew(work
->mhz
, work
->nalloc
);
1900 srenew(work
->m2
, work
->nalloc
);
1901 /* Allocate an aligned pointer for SIMD operations, including extra
1902 * elements at the end for padding.
1905 simd_width
= GMX_SIMD_WIDTH_HERE
;
1907 /* We can use any alignment, apart from 0, so we use 4 */
1910 sfree_aligned(work
->denom
);
1911 sfree_aligned(work
->tmp1
);
1912 sfree_aligned(work
->eterm
);
1913 snew_aligned(work
->denom
, work
->nalloc
+simd_width
, simd_width
*sizeof(real
));
1914 snew_aligned(work
->tmp1
, work
->nalloc
+simd_width
, simd_width
*sizeof(real
));
1915 snew_aligned(work
->eterm
, work
->nalloc
+simd_width
, simd_width
*sizeof(real
));
1916 srenew(work
->m2inv
, work
->nalloc
);
1921 static void free_work(pme_work_t
*work
)
1927 sfree_aligned(work
->denom
);
1928 sfree_aligned(work
->tmp1
);
1929 sfree_aligned(work
->eterm
);
1935 /* Calculate exponentials through SIMD */
1936 inline static void calc_exponentials(int start
, int end
, real f
, real
*d_aligned
, real
*r_aligned
, real
*e_aligned
)
1939 const gmx_mm_pr two
= gmx_set1_pr(2.0);
1942 gmx_mm_pr tmp_d1
, d_inv
, tmp_r
, tmp_e
;
1944 f_simd
= gmx_set1_pr(f
);
1945 for (kx
= 0; kx
< end
; kx
+= GMX_SIMD_WIDTH_HERE
)
1947 tmp_d1
= gmx_load_pr(d_aligned
+kx
);
1948 d_inv
= gmx_inv_pr(tmp_d1
);
1949 tmp_r
= gmx_load_pr(r_aligned
+kx
);
1950 tmp_r
= gmx_exp_pr(tmp_r
);
1951 tmp_e
= gmx_mul_pr(f_simd
, d_inv
);
1952 tmp_e
= gmx_mul_pr(tmp_e
, tmp_r
);
1953 gmx_store_pr(e_aligned
+kx
, tmp_e
);
1958 inline static void calc_exponentials(int start
, int end
, real f
, real
*d
, real
*r
, real
*e
)
1961 for (kx
= start
; kx
< end
; kx
++)
1965 for (kx
= start
; kx
< end
; kx
++)
1969 for (kx
= start
; kx
< end
; kx
++)
1971 e
[kx
] = f
*r
[kx
]*d
[kx
];
1977 static int solve_pme_yzx(gmx_pme_t pme
, t_complex
*grid
,
1978 real ewaldcoeff
, real vol
,
1980 int nthread
, int thread
)
1982 /* do recip sum over local cells in grid */
1983 /* y major, z middle, x minor or continuous */
1985 int kx
, ky
, kz
, maxkx
, maxky
, maxkz
;
1986 int nx
, ny
, nz
, iyz0
, iyz1
, iyz
, iy
, iz
, kxstart
, kxend
;
1988 real factor
= M_PI
*M_PI
/(ewaldcoeff
*ewaldcoeff
);
1989 real ets2
, struct2
, vfactor
, ets2vf
;
1990 real d1
, d2
, energy
= 0;
1992 real virxx
= 0, virxy
= 0, virxz
= 0, viryy
= 0, viryz
= 0, virzz
= 0;
1993 real rxx
, ryx
, ryy
, rzx
, rzy
, rzz
;
1995 real
*mhx
, *mhy
, *mhz
, *m2
, *denom
, *tmp1
, *eterm
, *m2inv
;
1996 real mhxk
, mhyk
, mhzk
, m2k
;
1999 ivec local_ndata
, local_offset
, local_size
;
2002 elfac
= ONE_4PI_EPS0
/pme
->epsilon_r
;
2008 /* Dimensions should be identical for A/B grid, so we just use A here */
2009 gmx_parallel_3dfft_complex_limits(pme
->pfft_setupA
,
2015 rxx
= pme
->recipbox
[XX
][XX
];
2016 ryx
= pme
->recipbox
[YY
][XX
];
2017 ryy
= pme
->recipbox
[YY
][YY
];
2018 rzx
= pme
->recipbox
[ZZ
][XX
];
2019 rzy
= pme
->recipbox
[ZZ
][YY
];
2020 rzz
= pme
->recipbox
[ZZ
][ZZ
];
2026 work
= &pme
->work
[thread
];
2031 denom
= work
->denom
;
2033 eterm
= work
->eterm
;
2034 m2inv
= work
->m2inv
;
2036 iyz0
= local_ndata
[YY
]*local_ndata
[ZZ
]* thread
/nthread
;
2037 iyz1
= local_ndata
[YY
]*local_ndata
[ZZ
]*(thread
+1)/nthread
;
2039 for (iyz
= iyz0
; iyz
< iyz1
; iyz
++)
2041 iy
= iyz
/local_ndata
[ZZ
];
2042 iz
= iyz
- iy
*local_ndata
[ZZ
];
2044 ky
= iy
+ local_offset
[YY
];
2055 by
= M_PI
*vol
*pme
->bsp_mod
[YY
][ky
];
2057 kz
= iz
+ local_offset
[ZZ
];
2061 bz
= pme
->bsp_mod
[ZZ
][kz
];
2063 /* 0.5 correction for corner points */
2065 if (kz
== 0 || kz
== (nz
+1)/2)
2070 p0
= grid
+ iy
*local_size
[ZZ
]*local_size
[XX
] + iz
*local_size
[XX
];
2072 /* We should skip the k-space point (0,0,0) */
2073 if (local_offset
[XX
] > 0 || ky
> 0 || kz
> 0)
2075 kxstart
= local_offset
[XX
];
2079 kxstart
= local_offset
[XX
] + 1;
2082 kxend
= local_offset
[XX
] + local_ndata
[XX
];
2086 /* More expensive inner loop, especially because of the storage
2087 * of the mh elements in array's.
2088 * Because x is the minor grid index, all mh elements
2089 * depend on kx for triclinic unit cells.
2092 /* Two explicit loops to avoid a conditional inside the loop */
2093 for (kx
= kxstart
; kx
< maxkx
; kx
++)
2098 mhyk
= mx
* ryx
+ my
* ryy
;
2099 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
2100 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
2105 denom
[kx
] = m2k
*bz
*by
*pme
->bsp_mod
[XX
][kx
];
2106 tmp1
[kx
] = -factor
*m2k
;
2109 for (kx
= maxkx
; kx
< kxend
; kx
++)
2114 mhyk
= mx
* ryx
+ my
* ryy
;
2115 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
2116 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
2121 denom
[kx
] = m2k
*bz
*by
*pme
->bsp_mod
[XX
][kx
];
2122 tmp1
[kx
] = -factor
*m2k
;
2125 for (kx
= kxstart
; kx
< kxend
; kx
++)
2127 m2inv
[kx
] = 1.0/m2
[kx
];
2130 calc_exponentials(kxstart
, kxend
, elfac
, denom
, tmp1
, eterm
);
2132 for (kx
= kxstart
; kx
< kxend
; kx
++, p0
++)
2137 p0
->re
= d1
*eterm
[kx
];
2138 p0
->im
= d2
*eterm
[kx
];
2140 struct2
= 2.0*(d1
*d1
+d2
*d2
);
2142 tmp1
[kx
] = eterm
[kx
]*struct2
;
2145 for (kx
= kxstart
; kx
< kxend
; kx
++)
2147 ets2
= corner_fac
*tmp1
[kx
];
2148 vfactor
= (factor
*m2
[kx
] + 1.0)*2.0*m2inv
[kx
];
2151 ets2vf
= ets2
*vfactor
;
2152 virxx
+= ets2vf
*mhx
[kx
]*mhx
[kx
] - ets2
;
2153 virxy
+= ets2vf
*mhx
[kx
]*mhy
[kx
];
2154 virxz
+= ets2vf
*mhx
[kx
]*mhz
[kx
];
2155 viryy
+= ets2vf
*mhy
[kx
]*mhy
[kx
] - ets2
;
2156 viryz
+= ets2vf
*mhy
[kx
]*mhz
[kx
];
2157 virzz
+= ets2vf
*mhz
[kx
]*mhz
[kx
] - ets2
;
2162 /* We don't need to calculate the energy and the virial.
2163 * In this case the triclinic overhead is small.
2166 /* Two explicit loops to avoid a conditional inside the loop */
2168 for (kx
= kxstart
; kx
< maxkx
; kx
++)
2173 mhyk
= mx
* ryx
+ my
* ryy
;
2174 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
2175 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
2176 denom
[kx
] = m2k
*bz
*by
*pme
->bsp_mod
[XX
][kx
];
2177 tmp1
[kx
] = -factor
*m2k
;
2180 for (kx
= maxkx
; kx
< kxend
; kx
++)
2185 mhyk
= mx
* ryx
+ my
* ryy
;
2186 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
2187 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
2188 denom
[kx
] = m2k
*bz
*by
*pme
->bsp_mod
[XX
][kx
];
2189 tmp1
[kx
] = -factor
*m2k
;
2192 calc_exponentials(kxstart
, kxend
, elfac
, denom
, tmp1
, eterm
);
2194 for (kx
= kxstart
; kx
< kxend
; kx
++, p0
++)
2199 p0
->re
= d1
*eterm
[kx
];
2200 p0
->im
= d2
*eterm
[kx
];
2207 /* Update virial with local values.
2208 * The virial is symmetric by definition.
2209 * this virial seems ok for isotropic scaling, but I'm
2210 * experiencing problems on semiisotropic membranes.
2211 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
2213 work
->vir
[XX
][XX
] = 0.25*virxx
;
2214 work
->vir
[YY
][YY
] = 0.25*viryy
;
2215 work
->vir
[ZZ
][ZZ
] = 0.25*virzz
;
2216 work
->vir
[XX
][YY
] = work
->vir
[YY
][XX
] = 0.25*virxy
;
2217 work
->vir
[XX
][ZZ
] = work
->vir
[ZZ
][XX
] = 0.25*virxz
;
2218 work
->vir
[YY
][ZZ
] = work
->vir
[ZZ
][YY
] = 0.25*viryz
;
2220 /* This energy should be corrected for a charged system */
2221 work
->energy
= 0.5*energy
;
2224 /* Return the loop count */
2225 return local_ndata
[YY
]*local_ndata
[XX
];
2228 static void get_pme_ener_vir(const gmx_pme_t pme
, int nthread
,
2229 real
*mesh_energy
, matrix vir
)
2231 /* This function sums output over threads
2232 * and should therefore only be called after thread synchronization.
2236 *mesh_energy
= pme
->work
[0].energy
;
2237 copy_mat(pme
->work
[0].vir
, vir
);
2239 for (thread
= 1; thread
< nthread
; thread
++)
2241 *mesh_energy
+= pme
->work
[thread
].energy
;
2242 m_add(vir
, pme
->work
[thread
].vir
, vir
);
2246 #define DO_FSPLINE(order) \
2247 for (ithx = 0; (ithx < order); ithx++) \
2249 index_x = (i0+ithx)*pny*pnz; \
2253 for (ithy = 0; (ithy < order); ithy++) \
2255 index_xy = index_x+(j0+ithy)*pnz; \
2260 for (ithz = 0; (ithz < order); ithz++) \
2262 gval = grid[index_xy+(k0+ithz)]; \
2263 fxy1 += thz[ithz]*gval; \
2264 fz1 += dthz[ithz]*gval; \
2273 static void gather_f_bsplines(gmx_pme_t pme
, real
*grid
,
2274 gmx_bool bClearF
, pme_atomcomm_t
*atc
,
2275 splinedata_t
*spline
,
2278 /* sum forces for local particles */
2279 int nn
, n
, ithx
, ithy
, ithz
, i0
, j0
, k0
;
2280 int index_x
, index_xy
;
2281 int nx
, ny
, nz
, pnx
, pny
, pnz
;
2283 real tx
, ty
, dx
, dy
, qn
;
2284 real fx
, fy
, fz
, gval
;
2286 real
*thx
, *thy
, *thz
, *dthx
, *dthy
, *dthz
;
2288 real rxx
, ryx
, ryy
, rzx
, rzy
, rzz
;
2291 pme_spline_work_t
*work
;
2293 #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
2294 real thz_buffer
[12], *thz_aligned
;
2295 real dthz_buffer
[12], *dthz_aligned
;
2297 thz_aligned
= gmx_simd4_align_real(thz_buffer
);
2298 dthz_aligned
= gmx_simd4_align_real(dthz_buffer
);
2301 work
= pme
->spline_work
;
2303 order
= pme
->pme_order
;
2304 thx
= spline
->theta
[XX
];
2305 thy
= spline
->theta
[YY
];
2306 thz
= spline
->theta
[ZZ
];
2307 dthx
= spline
->dtheta
[XX
];
2308 dthy
= spline
->dtheta
[YY
];
2309 dthz
= spline
->dtheta
[ZZ
];
2313 pnx
= pme
->pmegrid_nx
;
2314 pny
= pme
->pmegrid_ny
;
2315 pnz
= pme
->pmegrid_nz
;
2317 rxx
= pme
->recipbox
[XX
][XX
];
2318 ryx
= pme
->recipbox
[YY
][XX
];
2319 ryy
= pme
->recipbox
[YY
][YY
];
2320 rzx
= pme
->recipbox
[ZZ
][XX
];
2321 rzy
= pme
->recipbox
[ZZ
][YY
];
2322 rzz
= pme
->recipbox
[ZZ
][ZZ
];
2324 for (nn
= 0; nn
< spline
->n
; nn
++)
2326 n
= spline
->ind
[nn
];
2327 qn
= scale
*atc
->q
[n
];
2340 idxptr
= atc
->idx
[n
];
2347 /* Pointer arithmetic alert, next six statements */
2348 thx
= spline
->theta
[XX
] + norder
;
2349 thy
= spline
->theta
[YY
] + norder
;
2350 thz
= spline
->theta
[ZZ
] + norder
;
2351 dthx
= spline
->dtheta
[XX
] + norder
;
2352 dthy
= spline
->dtheta
[YY
] + norder
;
2353 dthz
= spline
->dtheta
[ZZ
] + norder
;
2358 #ifdef PME_SIMD4_SPREAD_GATHER
2359 #ifdef PME_SIMD4_UNALIGNED
2360 #define PME_GATHER_F_SIMD4_ORDER4
2362 #define PME_GATHER_F_SIMD4_ALIGNED
2365 #include "pme_simd4.h"
2371 #ifdef PME_SIMD4_SPREAD_GATHER
2372 #define PME_GATHER_F_SIMD4_ALIGNED
2374 #include "pme_simd4.h"
2384 atc
->f
[n
][XX
] += -qn
*( fx
*nx
*rxx
);
2385 atc
->f
[n
][YY
] += -qn
*( fx
*nx
*ryx
+ fy
*ny
*ryy
);
2386 atc
->f
[n
][ZZ
] += -qn
*( fx
*nx
*rzx
+ fy
*ny
*rzy
+ fz
*nz
*rzz
);
2389 /* Since the energy and not forces are interpolated
2390 * the net force might not be exactly zero.
2391 * This can be solved by also interpolating F, but
2392 * that comes at a cost.
2393 * A better hack is to remove the net force every
2394 * step, but that must be done at a higher level
2395 * since this routine doesn't see all atoms if running
2396 * in parallel. Don't know how important it is? EL 990726
2401 static real
gather_energy_bsplines(gmx_pme_t pme
, real
*grid
,
2402 pme_atomcomm_t
*atc
)
2404 splinedata_t
*spline
;
2405 int n
, ithx
, ithy
, ithz
, i0
, j0
, k0
;
2406 int index_x
, index_xy
;
2408 real energy
, pot
, tx
, ty
, qn
, gval
;
2409 real
*thx
, *thy
, *thz
;
2413 spline
= &atc
->spline
[0];
2415 order
= pme
->pme_order
;
2418 for (n
= 0; (n
< atc
->n
); n
++)
2424 idxptr
= atc
->idx
[n
];
2431 /* Pointer arithmetic alert, next three statements */
2432 thx
= spline
->theta
[XX
] + norder
;
2433 thy
= spline
->theta
[YY
] + norder
;
2434 thz
= spline
->theta
[ZZ
] + norder
;
2437 for (ithx
= 0; (ithx
< order
); ithx
++)
2439 index_x
= (i0
+ithx
)*pme
->pmegrid_ny
*pme
->pmegrid_nz
;
2442 for (ithy
= 0; (ithy
< order
); ithy
++)
2444 index_xy
= index_x
+(j0
+ithy
)*pme
->pmegrid_nz
;
2447 for (ithz
= 0; (ithz
< order
); ithz
++)
2449 gval
= grid
[index_xy
+(k0
+ithz
)];
2450 pot
+= tx
*ty
*thz
[ithz
]*gval
;
2463 /* Macro to force loop unrolling by fixing order.
2464 * This gives a significant performance gain.
2466 #define CALC_SPLINE(order) \
2470 real data[PME_ORDER_MAX]; \
2471 real ddata[PME_ORDER_MAX]; \
2473 for (j = 0; (j < DIM); j++) \
2477 /* dr is relative offset from lower cell limit */ \
2478 data[order-1] = 0; \
2482 for (k = 3; (k < order); k++) \
2484 div = 1.0/(k - 1.0); \
2485 data[k-1] = div*dr*data[k-2]; \
2486 for (l = 1; (l < (k-1)); l++) \
2488 data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
2491 data[0] = div*(1-dr)*data[0]; \
2493 /* differentiate */ \
2494 ddata[0] = -data[0]; \
2495 for (k = 1; (k < order); k++) \
2497 ddata[k] = data[k-1] - data[k]; \
2500 div = 1.0/(order - 1); \
2501 data[order-1] = div*dr*data[order-2]; \
2502 for (l = 1; (l < (order-1)); l++) \
2504 data[order-l-1] = div*((dr+l)*data[order-l-2]+ \
2505 (order-l-dr)*data[order-l-1]); \
2507 data[0] = div*(1 - dr)*data[0]; \
2509 for (k = 0; k < order; k++) \
2511 theta[j][i*order+k] = data[k]; \
2512 dtheta[j][i*order+k] = ddata[k]; \
2517 void make_bsplines(splinevec theta
, splinevec dtheta
, int order
,
2518 rvec fractx
[], int nr
, int ind
[], real charge
[],
2519 gmx_bool bFreeEnergy
)
2521 /* construct splines for local atoms */
2525 for (i
= 0; i
< nr
; i
++)
2527 /* With free energy we do not use the charge check.
2528 * In most cases this will be more efficient than calling make_bsplines
2529 * twice, since usually more than half the particles have charges.
2532 if (bFreeEnergy
|| charge
[ii
] != 0.0)
2537 case 4: CALC_SPLINE(4); break;
2538 case 5: CALC_SPLINE(5); break;
2539 default: CALC_SPLINE(order
); break;
2546 void make_dft_mod(real
*mod
, real
*data
, int ndata
)
2551 for (i
= 0; i
< ndata
; i
++)
2554 for (j
= 0; j
< ndata
; j
++)
2556 arg
= (2.0*M_PI
*i
*j
)/ndata
;
2557 sc
+= data
[j
]*cos(arg
);
2558 ss
+= data
[j
]*sin(arg
);
2560 mod
[i
] = sc
*sc
+ss
*ss
;
2562 for (i
= 0; i
< ndata
; i
++)
2566 mod
[i
] = (mod
[i
-1]+mod
[i
+1])*0.5;
2572 static void make_bspline_moduli(splinevec bsp_mod
,
2573 int nx
, int ny
, int nz
, int order
)
2575 int nmax
= max(nx
, max(ny
, nz
));
2576 real
*data
, *ddata
, *bsp_data
;
2582 snew(bsp_data
, nmax
);
2588 for (k
= 3; k
< order
; k
++)
2592 for (l
= 1; l
< (k
-1); l
++)
2594 data
[k
-l
-1] = div
*(l
*data
[k
-l
-2]+(k
-l
)*data
[k
-l
-1]);
2596 data
[0] = div
*data
[0];
2599 ddata
[0] = -data
[0];
2600 for (k
= 1; k
< order
; k
++)
2602 ddata
[k
] = data
[k
-1]-data
[k
];
2604 div
= 1.0/(order
-1);
2606 for (l
= 1; l
< (order
-1); l
++)
2608 data
[order
-l
-1] = div
*(l
*data
[order
-l
-2]+(order
-l
)*data
[order
-l
-1]);
2610 data
[0] = div
*data
[0];
2612 for (i
= 0; i
< nmax
; i
++)
2616 for (i
= 1; i
<= order
; i
++)
2618 bsp_data
[i
] = data
[i
-1];
2621 make_dft_mod(bsp_mod
[XX
], bsp_data
, nx
);
2622 make_dft_mod(bsp_mod
[YY
], bsp_data
, ny
);
2623 make_dft_mod(bsp_mod
[ZZ
], bsp_data
, nz
);
2631 /* Return the P3M optimal influence function */
2632 static double do_p3m_influence(double z
, int order
)
2639 /* The formula and most constants can be found in:
2640 * Ballenegger et al., JCTC 8, 936 (2012)
2645 return 1.0 - 2.0*z2
/3.0;
2648 return 1.0 - z2
+ 2.0*z4
/15.0;
2651 return 1.0 - 4.0*z2
/3.0 + 2.0*z4
/5.0 + 4.0*z2
*z4
/315.0;
2654 return 1.0 - 5.0*z2
/3.0 + 7.0*z4
/9.0 - 17.0*z2
*z4
/189.0 + 2.0*z4
*z4
/2835.0;
2657 return 1.0 - 2.0*z2
+ 19.0*z4
/15.0 - 256.0*z2
*z4
/945.0 + 62.0*z4
*z4
/4725.0 + 4.0*z2
*z4
*z4
/155925.0;
2660 return 1.0 - 7.0*z2
/3.0 + 28.0*z4
/15.0 - 16.0*z2
*z4
/27.0 + 26.0*z4
*z4
/405.0 - 2.0*z2
*z4
*z4
/1485.0 + 4.0*z4
*z4
*z4
/6081075.0;
2662 return 1.0 - 8.0*z2
/3.0 + 116.0*z4
/45.0 - 344.0*z2
*z4
/315.0 + 914.0*z4
*z4
/4725.0 - 248.0*z4
*z4
*z2
/22275.0 + 21844.0*z4
*z4
*z4
/212837625.0 - 8.0*z4
*z4
*z4
*z2
/638512875.0;
2669 /* Calculate the P3M B-spline moduli for one dimension */
2670 static void make_p3m_bspline_moduli_dim(real
*bsp_mod
, int n
, int order
)
2672 double zarg
, zai
, sinzai
, infl
;
2677 gmx_fatal(FARGS
, "The current P3M code only supports orders up to 8");
2684 for (i
= -maxk
; i
< 0; i
++)
2688 infl
= do_p3m_influence(sinzai
, order
);
2689 bsp_mod
[n
+i
] = infl
*infl
*pow(sinzai
/zai
, -2.0*order
);
2692 for (i
= 1; i
< maxk
; i
++)
2696 infl
= do_p3m_influence(sinzai
, order
);
2697 bsp_mod
[i
] = infl
*infl
*pow(sinzai
/zai
, -2.0*order
);
2701 /* Calculate the P3M B-spline moduli */
2702 static void make_p3m_bspline_moduli(splinevec bsp_mod
,
2703 int nx
, int ny
, int nz
, int order
)
2705 make_p3m_bspline_moduli_dim(bsp_mod
[XX
], nx
, order
);
2706 make_p3m_bspline_moduli_dim(bsp_mod
[YY
], ny
, order
);
2707 make_p3m_bspline_moduli_dim(bsp_mod
[ZZ
], nz
, order
);
2711 static void setup_coordinate_communication(pme_atomcomm_t
*atc
)
2719 for (i
= 1; i
<= nslab
/2; i
++)
2721 fw
= (atc
->nodeid
+ i
) % nslab
;
2722 bw
= (atc
->nodeid
- i
+ nslab
) % nslab
;
2725 atc
->node_dest
[n
] = fw
;
2726 atc
->node_src
[n
] = bw
;
2731 atc
->node_dest
[n
] = bw
;
2732 atc
->node_src
[n
] = fw
;
2738 int gmx_pme_destroy(FILE *log
, gmx_pme_t
*pmedata
)
2744 fprintf(log
, "Destroying PME data structures.\n");
2747 sfree((*pmedata
)->nnx
);
2748 sfree((*pmedata
)->nny
);
2749 sfree((*pmedata
)->nnz
);
2751 pmegrids_destroy(&(*pmedata
)->pmegridA
);
2753 sfree((*pmedata
)->fftgridA
);
2754 sfree((*pmedata
)->cfftgridA
);
2755 gmx_parallel_3dfft_destroy((*pmedata
)->pfft_setupA
);
2757 if ((*pmedata
)->pmegridB
.grid
.grid
!= NULL
)
2759 pmegrids_destroy(&(*pmedata
)->pmegridB
);
2760 sfree((*pmedata
)->fftgridB
);
2761 sfree((*pmedata
)->cfftgridB
);
2762 gmx_parallel_3dfft_destroy((*pmedata
)->pfft_setupB
);
2764 for (thread
= 0; thread
< (*pmedata
)->nthread
; thread
++)
2766 free_work(&(*pmedata
)->work
[thread
]);
2768 sfree((*pmedata
)->work
);
2776 static int mult_up(int n
, int f
)
2778 return ((n
+ f
- 1)/f
)*f
;
2782 static double pme_load_imbalance(gmx_pme_t pme
)
2787 nma
= pme
->nnodes_major
;
2788 nmi
= pme
->nnodes_minor
;
2790 n1
= mult_up(pme
->nkx
, nma
)*mult_up(pme
->nky
, nmi
)*pme
->nkz
;
2791 n2
= mult_up(pme
->nkx
, nma
)*mult_up(pme
->nkz
, nmi
)*pme
->nky
;
2792 n3
= mult_up(pme
->nky
, nma
)*mult_up(pme
->nkz
, nmi
)*pme
->nkx
;
2794 /* pme_solve is roughly double the cost of an fft */
2796 return (n1
+ n2
+ 3*n3
)/(double)(6*pme
->nkx
*pme
->nky
*pme
->nkz
);
2799 static void init_atomcomm(gmx_pme_t pme
, pme_atomcomm_t
*atc
, t_commrec
*cr
,
2800 int dimind
, gmx_bool bSpread
)
2802 int nk
, k
, s
, thread
;
2804 atc
->dimind
= dimind
;
2809 if (pme
->nnodes
> 1)
2811 atc
->mpi_comm
= pme
->mpi_comm_d
[dimind
];
2812 MPI_Comm_size(atc
->mpi_comm
, &atc
->nslab
);
2813 MPI_Comm_rank(atc
->mpi_comm
, &atc
->nodeid
);
2817 fprintf(debug
, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc
->dimind
, atc
->nslab
, atc
->nodeid
);
2821 atc
->bSpread
= bSpread
;
2822 atc
->pme_order
= pme
->pme_order
;
2826 /* These three allocations are not required for particle decomp. */
2827 snew(atc
->node_dest
, atc
->nslab
);
2828 snew(atc
->node_src
, atc
->nslab
);
2829 setup_coordinate_communication(atc
);
2831 snew(atc
->count_thread
, pme
->nthread
);
2832 for (thread
= 0; thread
< pme
->nthread
; thread
++)
2834 snew(atc
->count_thread
[thread
], atc
->nslab
);
2836 atc
->count
= atc
->count_thread
[0];
2837 snew(atc
->rcount
, atc
->nslab
);
2838 snew(atc
->buf_index
, atc
->nslab
);
2841 atc
->nthread
= pme
->nthread
;
2842 if (atc
->nthread
> 1)
2844 snew(atc
->thread_plist
, atc
->nthread
);
2846 snew(atc
->spline
, atc
->nthread
);
2847 for (thread
= 0; thread
< atc
->nthread
; thread
++)
2849 if (atc
->nthread
> 1)
2851 snew(atc
->thread_plist
[thread
].n
, atc
->nthread
+2*GMX_CACHE_SEP
);
2852 atc
->thread_plist
[thread
].n
+= GMX_CACHE_SEP
;
2854 snew(atc
->spline
[thread
].thread_one
, pme
->nthread
);
2855 atc
->spline
[thread
].thread_one
[thread
] = 1;
2860 init_overlap_comm(pme_overlap_t
* ol
,
2870 int lbnd
, rbnd
, maxlr
, b
, i
;
2873 pme_grid_comm_t
*pgc
;
2875 int fft_start
, fft_end
, send_index1
, recv_index1
;
2879 ol
->mpi_comm
= comm
;
2882 ol
->nnodes
= nnodes
;
2883 ol
->nodeid
= nodeid
;
2885 /* Linear translation of the PME grid won't affect reciprocal space
2886 * calculations, so to optimize we only interpolate "upwards",
2887 * which also means we only have to consider overlap in one direction.
2888 * I.e., particles on this node might also be spread to grid indices
2889 * that belong to higher nodes (modulo nnodes)
2892 snew(ol
->s2g0
, ol
->nnodes
+1);
2893 snew(ol
->s2g1
, ol
->nnodes
);
2896 fprintf(debug
, "PME slab boundaries:");
2898 for (i
= 0; i
< nnodes
; i
++)
2900 /* s2g0 the local interpolation grid start.
2901 * s2g1 the local interpolation grid end.
2902 * Because grid overlap communication only goes forward,
2903 * the grid the slabs for fft's should be rounded down.
2905 ol
->s2g0
[i
] = ( i
*ndata
+ 0 )/nnodes
;
2906 ol
->s2g1
[i
] = ((i
+1)*ndata
+ nnodes
-1)/nnodes
+ norder
- 1;
2910 fprintf(debug
, " %3d %3d", ol
->s2g0
[i
], ol
->s2g1
[i
]);
2913 ol
->s2g0
[nnodes
] = ndata
;
2916 fprintf(debug
, "\n");
2919 /* Determine with how many nodes we need to communicate the grid overlap */
2925 for (i
= 0; i
< nnodes
; i
++)
2927 if ((i
+b
< nnodes
&& ol
->s2g1
[i
] > ol
->s2g0
[i
+b
]) ||
2928 (i
+b
>= nnodes
&& ol
->s2g1
[i
] > ol
->s2g0
[i
+b
-nnodes
] + ndata
))
2934 while (bCont
&& b
< nnodes
);
2935 ol
->noverlap_nodes
= b
- 1;
2937 snew(ol
->send_id
, ol
->noverlap_nodes
);
2938 snew(ol
->recv_id
, ol
->noverlap_nodes
);
2939 for (b
= 0; b
< ol
->noverlap_nodes
; b
++)
2941 ol
->send_id
[b
] = (ol
->nodeid
+ (b
+ 1)) % ol
->nnodes
;
2942 ol
->recv_id
[b
] = (ol
->nodeid
- (b
+ 1) + ol
->nnodes
) % ol
->nnodes
;
2944 snew(ol
->comm_data
, ol
->noverlap_nodes
);
2947 for (b
= 0; b
< ol
->noverlap_nodes
; b
++)
2949 pgc
= &ol
->comm_data
[b
];
2951 fft_start
= ol
->s2g0
[ol
->send_id
[b
]];
2952 fft_end
= ol
->s2g0
[ol
->send_id
[b
]+1];
2953 if (ol
->send_id
[b
] < nodeid
)
2958 send_index1
= ol
->s2g1
[nodeid
];
2959 send_index1
= min(send_index1
, fft_end
);
2960 pgc
->send_index0
= fft_start
;
2961 pgc
->send_nindex
= max(0, send_index1
- pgc
->send_index0
);
2962 ol
->send_size
+= pgc
->send_nindex
;
2964 /* We always start receiving to the first index of our slab */
2965 fft_start
= ol
->s2g0
[ol
->nodeid
];
2966 fft_end
= ol
->s2g0
[ol
->nodeid
+1];
2967 recv_index1
= ol
->s2g1
[ol
->recv_id
[b
]];
2968 if (ol
->recv_id
[b
] > nodeid
)
2970 recv_index1
-= ndata
;
2972 recv_index1
= min(recv_index1
, fft_end
);
2973 pgc
->recv_index0
= fft_start
;
2974 pgc
->recv_nindex
= max(0, recv_index1
- pgc
->recv_index0
);
2978 /* Communicate the buffer sizes to receive */
2979 for (b
= 0; b
< ol
->noverlap_nodes
; b
++)
2981 MPI_Sendrecv(&ol
->send_size
, 1, MPI_INT
, ol
->send_id
[b
], b
,
2982 &ol
->comm_data
[b
].recv_size
, 1, MPI_INT
, ol
->recv_id
[b
], b
,
2983 ol
->mpi_comm
, &stat
);
2987 /* For non-divisible grid we need pme_order iso pme_order-1 */
2988 snew(ol
->sendbuf
, norder
*commplainsize
);
2989 snew(ol
->recvbuf
, norder
*commplainsize
);
2993 make_gridindex5_to_localindex(int n
, int local_start
, int local_range
,
2994 int **global_to_local
,
2995 real
**fraction_shift
)
3003 for (i
= 0; (i
< 5*n
); i
++)
3005 /* Determine the global to local grid index */
3006 gtl
[i
] = (i
- local_start
+ n
) % n
;
3007 /* For coordinates that fall within the local grid the fraction
3008 * is correct, we don't need to shift it.
3011 if (local_range
< n
)
3013 /* Due to rounding issues i could be 1 beyond the lower or
3014 * upper boundary of the local grid. Correct the index for this.
3015 * If we shift the index, we need to shift the fraction by
3016 * the same amount in the other direction to not affect
3018 * Note that due to this shifting the weights at the end of
3019 * the spline might change, but that will only involve values
3020 * between zero and values close to the precision of a real,
3021 * which is anyhow the accuracy of the whole mesh calculation.
3023 /* With local_range=0 we should not change i=local_start */
3024 if (i
% n
!= local_start
)
3031 else if (gtl
[i
] == local_range
)
3033 gtl
[i
] = local_range
- 1;
3040 *global_to_local
= gtl
;
3041 *fraction_shift
= fsh
;
3044 static pme_spline_work_t
*make_pme_spline_work(int order
)
3046 pme_spline_work_t
*work
;
3048 #ifdef PME_SIMD4_SPREAD_GATHER
3049 real tmp
[12], *tmp_aligned
;
3050 gmx_simd4_pr zero_S
;
3051 gmx_simd4_pr real_mask_S0
, real_mask_S1
;
3054 snew_aligned(work
, 1, SIMD4_ALIGNMENT
);
3056 tmp_aligned
= gmx_simd4_align_real(tmp
);
3058 zero_S
= gmx_simd4_setzero_pr();
3060 /* Generate bit masks to mask out the unused grid entries,
3061 * as we only operate on order of the 8 grid entries that are
3062 * load into 2 SIMD registers.
3064 for (of
= 0; of
< 8-(order
-1); of
++)
3066 for (i
= 0; i
< 8; i
++)
3068 tmp_aligned
[i
] = (i
>= of
&& i
< of
+order
? -1.0 : 1.0);
3070 real_mask_S0
= gmx_simd4_load_pr(tmp_aligned
);
3071 real_mask_S1
= gmx_simd4_load_pr(tmp_aligned
+4);
3072 work
->mask_S0
[of
] = gmx_simd4_cmplt_pr(real_mask_S0
, zero_S
);
3073 work
->mask_S1
[of
] = gmx_simd4_cmplt_pr(real_mask_S1
, zero_S
);
3082 void gmx_pme_check_restrictions(int pme_order
,
3083 int nkx
, int nky
, int nkz
,
3086 gmx_bool bUseThreads
,
3088 gmx_bool
*bValidSettings
)
3090 if (pme_order
> PME_ORDER_MAX
)
3094 *bValidSettings
= FALSE
;
3097 gmx_fatal(FARGS
, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3098 pme_order
, PME_ORDER_MAX
);
3101 if (nkx
<= pme_order
*(nnodes_major
> 1 ? 2 : 1) ||
3102 nky
<= pme_order
*(nnodes_minor
> 1 ? 2 : 1) ||
3107 *bValidSettings
= FALSE
;
3110 gmx_fatal(FARGS
, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",
3114 /* Check for a limitation of the (current) sum_fftgrid_dd code.
3115 * We only allow multiple communication pulses in dim 1, not in dim 0.
3117 if (bUseThreads
&& (nkx
< nnodes_major
*pme_order
&&
3118 nkx
!= nnodes_major
*(pme_order
- 1)))
3122 *bValidSettings
= FALSE
;
3125 gmx_fatal(FARGS
, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
3126 nkx
/(double)nnodes_major
, pme_order
);
3129 if (bValidSettings
!= NULL
)
3131 *bValidSettings
= TRUE
;
3137 int gmx_pme_init(gmx_pme_t
* pmedata
,
3143 gmx_bool bFreeEnergy
,
3144 gmx_bool bReproducible
,
3147 gmx_pme_t pme
= NULL
;
3149 int use_threads
, sum_use_threads
;
3154 fprintf(debug
, "Creating PME data structures.\n");
3158 pme
->redist_init
= FALSE
;
3159 pme
->sum_qgrid_tmp
= NULL
;
3160 pme
->sum_qgrid_dd_tmp
= NULL
;
3161 pme
->buf_nalloc
= 0;
3162 pme
->redist_buf_nalloc
= 0;
3165 pme
->bPPnode
= TRUE
;
3167 pme
->nnodes_major
= nnodes_major
;
3168 pme
->nnodes_minor
= nnodes_minor
;
3171 if (nnodes_major
*nnodes_minor
> 1)
3173 pme
->mpi_comm
= cr
->mpi_comm_mygroup
;
3175 MPI_Comm_rank(pme
->mpi_comm
, &pme
->nodeid
);
3176 MPI_Comm_size(pme
->mpi_comm
, &pme
->nnodes
);
3177 if (pme
->nnodes
!= nnodes_major
*nnodes_minor
)
3179 gmx_incons("PME node count mismatch");
3184 pme
->mpi_comm
= MPI_COMM_NULL
;
3188 if (pme
->nnodes
== 1)
3191 pme
->mpi_comm_d
[0] = MPI_COMM_NULL
;
3192 pme
->mpi_comm_d
[1] = MPI_COMM_NULL
;
3194 pme
->ndecompdim
= 0;
3195 pme
->nodeid_major
= 0;
3196 pme
->nodeid_minor
= 0;
3198 pme
->mpi_comm_d
[0] = pme
->mpi_comm_d
[1] = MPI_COMM_NULL
;
3203 if (nnodes_minor
== 1)
3206 pme
->mpi_comm_d
[0] = pme
->mpi_comm
;
3207 pme
->mpi_comm_d
[1] = MPI_COMM_NULL
;
3209 pme
->ndecompdim
= 1;
3210 pme
->nodeid_major
= pme
->nodeid
;
3211 pme
->nodeid_minor
= 0;
3214 else if (nnodes_major
== 1)
3217 pme
->mpi_comm_d
[0] = MPI_COMM_NULL
;
3218 pme
->mpi_comm_d
[1] = pme
->mpi_comm
;
3220 pme
->ndecompdim
= 1;
3221 pme
->nodeid_major
= 0;
3222 pme
->nodeid_minor
= pme
->nodeid
;
3226 if (pme
->nnodes
% nnodes_major
!= 0)
3228 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
3230 pme
->ndecompdim
= 2;
3233 MPI_Comm_split(pme
->mpi_comm
, pme
->nodeid
% nnodes_minor
,
3234 pme
->nodeid
, &pme
->mpi_comm_d
[0]); /* My communicator along major dimension */
3235 MPI_Comm_split(pme
->mpi_comm
, pme
->nodeid
/nnodes_minor
,
3236 pme
->nodeid
, &pme
->mpi_comm_d
[1]); /* My communicator along minor dimension */
3238 MPI_Comm_rank(pme
->mpi_comm_d
[0], &pme
->nodeid_major
);
3239 MPI_Comm_size(pme
->mpi_comm_d
[0], &pme
->nnodes_major
);
3240 MPI_Comm_rank(pme
->mpi_comm_d
[1], &pme
->nodeid_minor
);
3241 MPI_Comm_size(pme
->mpi_comm_d
[1], &pme
->nnodes_minor
);
3244 pme
->bPPnode
= (cr
->duty
& DUTY_PP
);
3247 pme
->nthread
= nthread
;
3249 /* Check if any of the PME MPI ranks uses threads */
3250 use_threads
= (pme
->nthread
> 1 ? 1 : 0);
3252 if (pme
->nnodes
> 1)
3254 MPI_Allreduce(&use_threads
, &sum_use_threads
, 1, MPI_INT
,
3255 MPI_SUM
, pme
->mpi_comm
);
3260 sum_use_threads
= use_threads
;
3262 pme
->bUseThreads
= (sum_use_threads
> 0);
3264 if (ir
->ePBC
== epbcSCREW
)
3266 gmx_fatal(FARGS
, "pme does not (yet) work with pbc = screw");
3269 pme
->bFEP
= ((ir
->efep
!= efepNO
) && bFreeEnergy
);
3273 pme
->bP3M
= (ir
->coulombtype
== eelP3M_AD
|| getenv("GMX_PME_P3M") != NULL
);
3274 pme
->pme_order
= ir
->pme_order
;
3275 pme
->epsilon_r
= ir
->epsilon_r
;
3277 /* If we violate restrictions, generate a fatal error here */
3278 gmx_pme_check_restrictions(pme
->pme_order
,
3279 pme
->nkx
, pme
->nky
, pme
->nkz
,
3286 if (pme
->nnodes
> 1)
3291 MPI_Type_contiguous(DIM
, mpi_type
, &(pme
->rvec_mpi
));
3292 MPI_Type_commit(&(pme
->rvec_mpi
));
3295 /* Note that the charge spreading and force gathering, which usually
3296 * takes about the same amount of time as FFT+solve_pme,
3297 * is always fully load balanced
3298 * (unless the charge distribution is inhomogeneous).
3301 imbal
= pme_load_imbalance(pme
);
3302 if (imbal
>= 1.2 && pme
->nodeid_major
== 0 && pme
->nodeid_minor
== 0)
3306 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3307 " For optimal PME load balancing\n"
3308 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
3309 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
3311 (int)((imbal
-1)*100 + 0.5),
3312 pme
->nkx
, pme
->nky
, pme
->nnodes_major
,
3313 pme
->nky
, pme
->nkz
, pme
->nnodes_minor
);
3317 /* For non-divisible grid we need pme_order iso pme_order-1 */
3318 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3319 * y is always copied through a buffer: we don't need padding in z,
3320 * but we do need the overlap in x because of the communication order.
3322 init_overlap_comm(&pme
->overlap
[0], pme
->pme_order
,
3326 pme
->nnodes_major
, pme
->nodeid_major
,
3328 (div_round_up(pme
->nky
, pme
->nnodes_minor
)+pme
->pme_order
)*(pme
->nkz
+pme
->pme_order
-1));
3330 /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
3331 * We do this with an offset buffer of equal size, so we need to allocate
3332 * extra for the offset. That's what the (+1)*pme->nkz is for.
3334 init_overlap_comm(&pme
->overlap
[1], pme
->pme_order
,
3338 pme
->nnodes_minor
, pme
->nodeid_minor
,
3340 (div_round_up(pme
->nkx
, pme
->nnodes_major
)+pme
->pme_order
+1)*pme
->nkz
);
3342 /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
3343 * Note that gmx_pme_check_restrictions checked for this already.
3345 if (pme
->bUseThreads
&& pme
->overlap
[0].noverlap_nodes
> 1)
3347 gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
3350 snew(pme
->bsp_mod
[XX
], pme
->nkx
);
3351 snew(pme
->bsp_mod
[YY
], pme
->nky
);
3352 snew(pme
->bsp_mod
[ZZ
], pme
->nkz
);
3354 /* The required size of the interpolation grid, including overlap.
3355 * The allocated size (pmegrid_n?) might be slightly larger.
3357 pme
->pmegrid_nx
= pme
->overlap
[0].s2g1
[pme
->nodeid_major
] -
3358 pme
->overlap
[0].s2g0
[pme
->nodeid_major
];
3359 pme
->pmegrid_ny
= pme
->overlap
[1].s2g1
[pme
->nodeid_minor
] -
3360 pme
->overlap
[1].s2g0
[pme
->nodeid_minor
];
3361 pme
->pmegrid_nz_base
= pme
->nkz
;
3362 pme
->pmegrid_nz
= pme
->pmegrid_nz_base
+ pme
->pme_order
- 1;
3363 set_grid_alignment(&pme
->pmegrid_nz
, pme
->pme_order
);
3365 pme
->pmegrid_start_ix
= pme
->overlap
[0].s2g0
[pme
->nodeid_major
];
3366 pme
->pmegrid_start_iy
= pme
->overlap
[1].s2g0
[pme
->nodeid_minor
];
3367 pme
->pmegrid_start_iz
= 0;
3369 make_gridindex5_to_localindex(pme
->nkx
,
3370 pme
->pmegrid_start_ix
,
3371 pme
->pmegrid_nx
- (pme
->pme_order
-1),
3372 &pme
->nnx
, &pme
->fshx
);
3373 make_gridindex5_to_localindex(pme
->nky
,
3374 pme
->pmegrid_start_iy
,
3375 pme
->pmegrid_ny
- (pme
->pme_order
-1),
3376 &pme
->nny
, &pme
->fshy
);
3377 make_gridindex5_to_localindex(pme
->nkz
,
3378 pme
->pmegrid_start_iz
,
3379 pme
->pmegrid_nz_base
,
3380 &pme
->nnz
, &pme
->fshz
);
3382 pmegrids_init(&pme
->pmegridA
,
3383 pme
->pmegrid_nx
, pme
->pmegrid_ny
, pme
->pmegrid_nz
,
3384 pme
->pmegrid_nz_base
,
3388 pme
->overlap
[0].s2g1
[pme
->nodeid_major
]-pme
->overlap
[0].s2g0
[pme
->nodeid_major
+1],
3389 pme
->overlap
[1].s2g1
[pme
->nodeid_minor
]-pme
->overlap
[1].s2g0
[pme
->nodeid_minor
+1]);
3391 pme
->spline_work
= make_pme_spline_work(pme
->pme_order
);
3393 ndata
[0] = pme
->nkx
;
3394 ndata
[1] = pme
->nky
;
3395 ndata
[2] = pme
->nkz
;
3397 /* This routine will allocate the grid data to fit the FFTs */
3398 gmx_parallel_3dfft_init(&pme
->pfft_setupA
, ndata
,
3399 &pme
->fftgridA
, &pme
->cfftgridA
,
3401 pme
->overlap
[0].s2g0
, pme
->overlap
[1].s2g0
,
3402 bReproducible
, pme
->nthread
);
3406 pmegrids_init(&pme
->pmegridB
,
3407 pme
->pmegrid_nx
, pme
->pmegrid_ny
, pme
->pmegrid_nz
,
3408 pme
->pmegrid_nz_base
,
3412 pme
->nkx
% pme
->nnodes_major
!= 0,
3413 pme
->nky
% pme
->nnodes_minor
!= 0);
3415 gmx_parallel_3dfft_init(&pme
->pfft_setupB
, ndata
,
3416 &pme
->fftgridB
, &pme
->cfftgridB
,
3418 pme
->overlap
[0].s2g0
, pme
->overlap
[1].s2g0
,
3419 bReproducible
, pme
->nthread
);
3423 pme
->pmegridB
.grid
.grid
= NULL
;
3424 pme
->fftgridB
= NULL
;
3425 pme
->cfftgridB
= NULL
;
3430 /* Use plain SPME B-spline interpolation */
3431 make_bspline_moduli(pme
->bsp_mod
, pme
->nkx
, pme
->nky
, pme
->nkz
, pme
->pme_order
);
3435 /* Use the P3M grid-optimized influence function */
3436 make_p3m_bspline_moduli(pme
->bsp_mod
, pme
->nkx
, pme
->nky
, pme
->nkz
, pme
->pme_order
);
3439 /* Use atc[0] for spreading */
3440 init_atomcomm(pme
, &pme
->atc
[0], cr
, nnodes_major
> 1 ? 0 : 1, TRUE
);
3441 if (pme
->ndecompdim
>= 2)
3443 init_atomcomm(pme
, &pme
->atc
[1], cr
, 1, FALSE
);
3446 if (pme
->nnodes
== 1)
3448 pme
->atc
[0].n
= homenr
;
3449 pme_realloc_atomcomm_things(&pme
->atc
[0]);
3455 /* Use fft5d, order after FFT is y major, z, x minor */
3457 snew(pme
->work
, pme
->nthread
);
3458 for (thread
= 0; thread
< pme
->nthread
; thread
++)
3460 realloc_work(&pme
->work
[thread
], pme
->nkx
);
3469 static void reuse_pmegrids(const pmegrids_t
*old
, pmegrids_t
*new)
3473 for (d
= 0; d
< DIM
; d
++)
3475 if (new->grid
.n
[d
] > old
->grid
.n
[d
])
3481 sfree_aligned(new->grid
.grid
);
3482 new->grid
.grid
= old
->grid
.grid
;
3484 if (new->grid_th
!= NULL
&& new->nthread
== old
->nthread
)
3486 sfree_aligned(new->grid_all
);
3487 for (t
= 0; t
< new->nthread
; t
++)
3489 new->grid_th
[t
].grid
= old
->grid_th
[t
].grid
;
3494 int gmx_pme_reinit(gmx_pme_t
* pmedata
,
3497 const t_inputrec
* ir
,
3505 irc
.nkx
= grid_size
[XX
];
3506 irc
.nky
= grid_size
[YY
];
3507 irc
.nkz
= grid_size
[ZZ
];
3509 if (pme_src
->nnodes
== 1)
3511 homenr
= pme_src
->atc
[0].n
;
3518 ret
= gmx_pme_init(pmedata
, cr
, pme_src
->nnodes_major
, pme_src
->nnodes_minor
,
3519 &irc
, homenr
, pme_src
->bFEP
, FALSE
, pme_src
->nthread
);
3523 /* We can easily reuse the allocated pme grids in pme_src */
3524 reuse_pmegrids(&pme_src
->pmegridA
, &(*pmedata
)->pmegridA
);
3525 /* We would like to reuse the fft grids, but that's harder */
3532 static void copy_local_grid(gmx_pme_t pme
,
3533 pmegrids_t
*pmegrids
, int thread
, real
*fftgrid
)
3535 ivec local_fft_ndata
, local_fft_offset
, local_fft_size
;
3539 int offx
, offy
, offz
, x
, y
, z
, i0
, i0t
;
3544 gmx_parallel_3dfft_real_limits(pme
->pfft_setupA
,
3548 fft_my
= local_fft_size
[YY
];
3549 fft_mz
= local_fft_size
[ZZ
];
3551 pmegrid
= &pmegrids
->grid_th
[thread
];
3553 nsx
= pmegrid
->s
[XX
];
3554 nsy
= pmegrid
->s
[YY
];
3555 nsz
= pmegrid
->s
[ZZ
];
3557 for (d
= 0; d
< DIM
; d
++)
3559 nf
[d
] = min(pmegrid
->n
[d
] - (pmegrid
->order
- 1),
3560 local_fft_ndata
[d
] - pmegrid
->offset
[d
]);
3563 offx
= pmegrid
->offset
[XX
];
3564 offy
= pmegrid
->offset
[YY
];
3565 offz
= pmegrid
->offset
[ZZ
];
3567 /* Directly copy the non-overlapping parts of the local grids.
3568 * This also initializes the full grid.
3570 grid_th
= pmegrid
->grid
;
3571 for (x
= 0; x
< nf
[XX
]; x
++)
3573 for (y
= 0; y
< nf
[YY
]; y
++)
3575 i0
= ((offx
+ x
)*fft_my
+ (offy
+ y
))*fft_mz
+ offz
;
3576 i0t
= (x
*nsy
+ y
)*nsz
;
3577 for (z
= 0; z
< nf
[ZZ
]; z
++)
3579 fftgrid
[i0
+z
] = grid_th
[i0t
+z
];
3586 reduce_threadgrid_overlap(gmx_pme_t pme
,
3587 const pmegrids_t
*pmegrids
, int thread
,
3588 real
*fftgrid
, real
*commbuf_x
, real
*commbuf_y
)
3590 ivec local_fft_ndata
, local_fft_offset
, local_fft_size
;
3591 int fft_nx
, fft_ny
, fft_nz
;
3596 int offx
, offy
, offz
, x
, y
, z
, i0
, i0t
;
3597 int sx
, sy
, sz
, fx
, fy
, fz
, tx1
, ty1
, tz1
, ox
, oy
, oz
;
3598 gmx_bool bClearBufX
, bClearBufY
, bClearBufXY
, bClearBuf
;
3599 gmx_bool bCommX
, bCommY
;
3602 const pmegrid_t
*pmegrid
, *pmegrid_g
, *pmegrid_f
;
3603 const real
*grid_th
;
3604 real
*commbuf
= NULL
;
3606 gmx_parallel_3dfft_real_limits(pme
->pfft_setupA
,
3610 fft_nx
= local_fft_ndata
[XX
];
3611 fft_ny
= local_fft_ndata
[YY
];
3612 fft_nz
= local_fft_ndata
[ZZ
];
3614 fft_my
= local_fft_size
[YY
];
3615 fft_mz
= local_fft_size
[ZZ
];
3617 /* This routine is called when all thread have finished spreading.
3618 * Here each thread sums grid contributions calculated by other threads
3619 * to the thread local grid volume.
3620 * To minimize the number of grid copying operations,
3621 * this routines sums immediately from the pmegrid to the fftgrid.
3624 /* Determine which part of the full node grid we should operate on,
3625 * this is our thread local part of the full grid.
3627 pmegrid
= &pmegrids
->grid_th
[thread
];
3629 for (d
= 0; d
< DIM
; d
++)
3631 ne
[d
] = min(pmegrid
->offset
[d
]+pmegrid
->n
[d
]-(pmegrid
->order
-1),
3632 local_fft_ndata
[d
]);
3635 offx
= pmegrid
->offset
[XX
];
3636 offy
= pmegrid
->offset
[YY
];
3637 offz
= pmegrid
->offset
[ZZ
];
3644 /* Now loop over all the thread data blocks that contribute
3645 * to the grid region we (our thread) are operating on.
3647 /* Note that ffy_nx/y is equal to the number of grid points
3648 * between the first point of our node grid and the one of the next node.
3650 for (sx
= 0; sx
>= -pmegrids
->nthread_comm
[XX
]; sx
--)
3652 fx
= pmegrid
->ci
[XX
] + sx
;
3657 fx
+= pmegrids
->nc
[XX
];
3659 bCommX
= (pme
->nnodes_major
> 1);
3661 pmegrid_g
= &pmegrids
->grid_th
[fx
*pmegrids
->nc
[YY
]*pmegrids
->nc
[ZZ
]];
3662 ox
+= pmegrid_g
->offset
[XX
];
3665 tx1
= min(ox
+ pmegrid_g
->n
[XX
], ne
[XX
]);
3669 tx1
= min(ox
+ pmegrid_g
->n
[XX
], pme
->pme_order
);
3672 for (sy
= 0; sy
>= -pmegrids
->nthread_comm
[YY
]; sy
--)
3674 fy
= pmegrid
->ci
[YY
] + sy
;
3679 fy
+= pmegrids
->nc
[YY
];
3681 bCommY
= (pme
->nnodes_minor
> 1);
3683 pmegrid_g
= &pmegrids
->grid_th
[fy
*pmegrids
->nc
[ZZ
]];
3684 oy
+= pmegrid_g
->offset
[YY
];
3687 ty1
= min(oy
+ pmegrid_g
->n
[YY
], ne
[YY
]);
3691 ty1
= min(oy
+ pmegrid_g
->n
[YY
], pme
->pme_order
);
3694 for (sz
= 0; sz
>= -pmegrids
->nthread_comm
[ZZ
]; sz
--)
3696 fz
= pmegrid
->ci
[ZZ
] + sz
;
3700 fz
+= pmegrids
->nc
[ZZ
];
3703 pmegrid_g
= &pmegrids
->grid_th
[fz
];
3704 oz
+= pmegrid_g
->offset
[ZZ
];
3705 tz1
= min(oz
+ pmegrid_g
->n
[ZZ
], ne
[ZZ
]);
3707 if (sx
== 0 && sy
== 0 && sz
== 0)
3709 /* We have already added our local contribution
3710 * before calling this routine, so skip it here.
3715 thread_f
= (fx
*pmegrids
->nc
[YY
] + fy
)*pmegrids
->nc
[ZZ
] + fz
;
3717 pmegrid_f
= &pmegrids
->grid_th
[thread_f
];
3719 grid_th
= pmegrid_f
->grid
;
3721 nsx
= pmegrid_f
->s
[XX
];
3722 nsy
= pmegrid_f
->s
[YY
];
3723 nsz
= pmegrid_f
->s
[ZZ
];
3725 #ifdef DEBUG_PME_REDUCE
3726 printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
3727 pme
->nodeid
, thread
, thread_f
,
3728 pme
->pmegrid_start_ix
,
3729 pme
->pmegrid_start_iy
,
3730 pme
->pmegrid_start_iz
,
3732 offx
-ox
, tx1
-ox
, offx
, tx1
,
3733 offy
-oy
, ty1
-oy
, offy
, ty1
,
3734 offz
-oz
, tz1
-oz
, offz
, tz1
);
3737 if (!(bCommX
|| bCommY
))
3739 /* Copy from the thread local grid to the node grid */
3740 for (x
= offx
; x
< tx1
; x
++)
3742 for (y
= offy
; y
< ty1
; y
++)
3744 i0
= (x
*fft_my
+ y
)*fft_mz
;
3745 i0t
= ((x
- ox
)*nsy
+ (y
- oy
))*nsz
- oz
;
3746 for (z
= offz
; z
< tz1
; z
++)
3748 fftgrid
[i0
+z
] += grid_th
[i0t
+z
];
3755 /* The order of this conditional decides
3756 * where the corner volume gets stored with x+y decomp.
3760 commbuf
= commbuf_y
;
3761 buf_my
= ty1
- offy
;
3764 /* We index commbuf modulo the local grid size */
3765 commbuf
+= buf_my
*fft_nx
*fft_nz
;
3767 bClearBuf
= bClearBufXY
;
3768 bClearBufXY
= FALSE
;
3772 bClearBuf
= bClearBufY
;
3778 commbuf
= commbuf_x
;
3780 bClearBuf
= bClearBufX
;
3784 /* Copy to the communication buffer */
3785 for (x
= offx
; x
< tx1
; x
++)
3787 for (y
= offy
; y
< ty1
; y
++)
3789 i0
= (x
*buf_my
+ y
)*fft_nz
;
3790 i0t
= ((x
- ox
)*nsy
+ (y
- oy
))*nsz
- oz
;
3794 /* First access of commbuf, initialize it */
3795 for (z
= offz
; z
< tz1
; z
++)
3797 commbuf
[i0
+z
] = grid_th
[i0t
+z
];
3802 for (z
= offz
; z
< tz1
; z
++)
3804 commbuf
[i0
+z
] += grid_th
[i0t
+z
];
3816 static void sum_fftgrid_dd(gmx_pme_t pme
, real
*fftgrid
)
3818 ivec local_fft_ndata
, local_fft_offset
, local_fft_size
;
3819 pme_overlap_t
*overlap
;
3820 int send_index0
, send_nindex
;
3825 int send_size_y
, recv_size_y
;
3826 int ipulse
, send_id
, recv_id
, datasize
, gridsize
, size_yx
;
3827 real
*sendptr
, *recvptr
;
3828 int x
, y
, z
, indg
, indb
;
3830 /* Note that this routine is only used for forward communication.
3831 * Since the force gathering, unlike the charge spreading,
3832 * can be trivially parallelized over the particles,
3833 * the backwards process is much simpler and can use the "old"
3834 * communication setup.
3837 gmx_parallel_3dfft_real_limits(pme
->pfft_setupA
,
3842 if (pme
->nnodes_minor
> 1)
3844 /* Major dimension */
3845 overlap
= &pme
->overlap
[1];
3847 if (pme
->nnodes_major
> 1)
3849 size_yx
= pme
->overlap
[0].comm_data
[0].send_nindex
;
3855 datasize
= (local_fft_ndata
[XX
] + size_yx
)*local_fft_ndata
[ZZ
];
3857 send_size_y
= overlap
->send_size
;
3859 for (ipulse
= 0; ipulse
< overlap
->noverlap_nodes
; ipulse
++)
3861 send_id
= overlap
->send_id
[ipulse
];
3862 recv_id
= overlap
->recv_id
[ipulse
];
3864 overlap
->comm_data
[ipulse
].send_index0
-
3865 overlap
->comm_data
[0].send_index0
;
3866 send_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
3867 /* We don't use recv_index0, as we always receive starting at 0 */
3868 recv_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
3869 recv_size_y
= overlap
->comm_data
[ipulse
].recv_size
;
3871 sendptr
= overlap
->sendbuf
+ send_index0
*local_fft_ndata
[ZZ
];
3872 recvptr
= overlap
->recvbuf
;
3875 MPI_Sendrecv(sendptr
, send_size_y
*datasize
, GMX_MPI_REAL
,
3877 recvptr
, recv_size_y
*datasize
, GMX_MPI_REAL
,
3879 overlap
->mpi_comm
, &stat
);
3882 for (x
= 0; x
< local_fft_ndata
[XX
]; x
++)
3884 for (y
= 0; y
< recv_nindex
; y
++)
3886 indg
= (x
*local_fft_size
[YY
] + y
)*local_fft_size
[ZZ
];
3887 indb
= (x
*recv_size_y
+ y
)*local_fft_ndata
[ZZ
];
3888 for (z
= 0; z
< local_fft_ndata
[ZZ
]; z
++)
3890 fftgrid
[indg
+z
] += recvptr
[indb
+z
];
3895 if (pme
->nnodes_major
> 1)
3897 /* Copy from the received buffer to the send buffer for dim 0 */
3898 sendptr
= pme
->overlap
[0].sendbuf
;
3899 for (x
= 0; x
< size_yx
; x
++)
3901 for (y
= 0; y
< recv_nindex
; y
++)
3903 indg
= (x
*local_fft_ndata
[YY
] + y
)*local_fft_ndata
[ZZ
];
3904 indb
= ((local_fft_ndata
[XX
] + x
)*recv_size_y
+ y
)*local_fft_ndata
[ZZ
];
3905 for (z
= 0; z
< local_fft_ndata
[ZZ
]; z
++)
3907 sendptr
[indg
+z
] += recvptr
[indb
+z
];
3915 /* We only support a single pulse here.
3916 * This is not a severe limitation, as this code is only used
3917 * with OpenMP and with OpenMP the (PME) domains can be larger.
3919 if (pme
->nnodes_major
> 1)
3921 /* Major dimension */
3922 overlap
= &pme
->overlap
[0];
3924 datasize
= local_fft_ndata
[YY
]*local_fft_ndata
[ZZ
];
3925 gridsize
= local_fft_size
[YY
] *local_fft_size
[ZZ
];
3929 send_id
= overlap
->send_id
[ipulse
];
3930 recv_id
= overlap
->recv_id
[ipulse
];
3931 send_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
3932 /* We don't use recv_index0, as we always receive starting at 0 */
3933 recv_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
3935 sendptr
= overlap
->sendbuf
;
3936 recvptr
= overlap
->recvbuf
;
3940 fprintf(debug
, "PME fftgrid comm %2d x %2d x %2d\n",
3941 send_nindex
, local_fft_ndata
[YY
], local_fft_ndata
[ZZ
]);
3945 MPI_Sendrecv(sendptr
, send_nindex
*datasize
, GMX_MPI_REAL
,
3947 recvptr
, recv_nindex
*datasize
, GMX_MPI_REAL
,
3949 overlap
->mpi_comm
, &stat
);
3952 for (x
= 0; x
< recv_nindex
; x
++)
3954 for (y
= 0; y
< local_fft_ndata
[YY
]; y
++)
3956 indg
= (x
*local_fft_size
[YY
] + y
)*local_fft_size
[ZZ
];
3957 indb
= (x
*local_fft_ndata
[YY
] + y
)*local_fft_ndata
[ZZ
];
3958 for (z
= 0; z
< local_fft_ndata
[ZZ
]; z
++)
3960 fftgrid
[indg
+z
] += recvptr
[indb
+z
];
3968 static void spread_on_grid(gmx_pme_t pme
,
3969 pme_atomcomm_t
*atc
, pmegrids_t
*grids
,
3970 gmx_bool bCalcSplines
, gmx_bool bSpread
,
3973 int nthread
, thread
;
3974 #ifdef PME_TIME_THREADS
3975 gmx_cycles_t c1
, c2
, c3
, ct1a
, ct1b
, ct1c
;
3976 static double cs1
= 0, cs2
= 0, cs3
= 0;
3977 static double cs1a
[6] = {0, 0, 0, 0, 0, 0};
3981 nthread
= pme
->nthread
;
3982 assert(nthread
> 0);
3984 #ifdef PME_TIME_THREADS
3985 c1
= omp_cyc_start();
3989 #pragma omp parallel for num_threads(nthread) schedule(static)
3990 for (thread
= 0; thread
< nthread
; thread
++)
3994 start
= atc
->n
* thread
/nthread
;
3995 end
= atc
->n
*(thread
+1)/nthread
;
3997 /* Compute fftgrid index for all atoms,
3998 * with help of some extra variables.
4000 calc_interpolation_idx(pme
, atc
, start
, end
, thread
);
4003 #ifdef PME_TIME_THREADS
4004 c1
= omp_cyc_end(c1
);
4008 #ifdef PME_TIME_THREADS
4009 c2
= omp_cyc_start();
4011 #pragma omp parallel for num_threads(nthread) schedule(static)
4012 for (thread
= 0; thread
< nthread
; thread
++)
4014 splinedata_t
*spline
;
4015 pmegrid_t
*grid
= NULL
;
4017 /* make local bsplines */
4018 if (grids
== NULL
|| !pme
->bUseThreads
)
4020 spline
= &atc
->spline
[0];
4026 grid
= &grids
->grid
;
4031 spline
= &atc
->spline
[thread
];
4033 if (grids
->nthread
== 1)
4035 /* One thread, we operate on all charges */
4040 /* Get the indices our thread should operate on */
4041 make_thread_local_ind(atc
, thread
, spline
);
4044 grid
= &grids
->grid_th
[thread
];
4049 make_bsplines(spline
->theta
, spline
->dtheta
, pme
->pme_order
,
4050 atc
->fractx
, spline
->n
, spline
->ind
, atc
->q
, pme
->bFEP
);
4055 /* put local atoms on grid. */
4056 #ifdef PME_TIME_SPREAD
4057 ct1a
= omp_cyc_start();
4059 spread_q_bsplines_thread(grid
, atc
, spline
, pme
->spline_work
);
4061 if (pme
->bUseThreads
)
4063 copy_local_grid(pme
, grids
, thread
, fftgrid
);
4065 #ifdef PME_TIME_SPREAD
4066 ct1a
= omp_cyc_end(ct1a
);
4067 cs1a
[thread
] += (double)ct1a
;
4071 #ifdef PME_TIME_THREADS
4072 c2
= omp_cyc_end(c2
);
4076 if (bSpread
&& pme
->bUseThreads
)
4078 #ifdef PME_TIME_THREADS
4079 c3
= omp_cyc_start();
4081 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
4082 for (thread
= 0; thread
< grids
->nthread
; thread
++)
4084 reduce_threadgrid_overlap(pme
, grids
, thread
,
4086 pme
->overlap
[0].sendbuf
,
4087 pme
->overlap
[1].sendbuf
);
4089 #ifdef PME_TIME_THREADS
4090 c3
= omp_cyc_end(c3
);
4094 if (pme
->nnodes
> 1)
4096 /* Communicate the overlapping part of the fftgrid.
4097 * For this communication call we need to check pme->bUseThreads
4098 * to have all ranks communicate here, regardless of pme->nthread.
4100 sum_fftgrid_dd(pme
, fftgrid
);
4104 #ifdef PME_TIME_THREADS
4108 printf("idx %.2f spread %.2f red %.2f",
4109 cs1
*1e-9, cs2
*1e-9, cs3
*1e-9);
4110 #ifdef PME_TIME_SPREAD
4111 for (thread
= 0; thread
< nthread
; thread
++)
4113 printf(" %.2f", cs1a
[thread
]*1e-9);
4122 static void dump_grid(FILE *fp
,
4123 int sx
, int sy
, int sz
, int nx
, int ny
, int nz
,
4124 int my
, int mz
, const real
*g
)
4128 for (x
= 0; x
< nx
; x
++)
4130 for (y
= 0; y
< ny
; y
++)
4132 for (z
= 0; z
< nz
; z
++)
4134 fprintf(fp
, "%2d %2d %2d %6.3f\n",
4135 sx
+x
, sy
+y
, sz
+z
, g
[(x
*my
+ y
)*mz
+ z
]);
4141 static void dump_local_fftgrid(gmx_pme_t pme
, const real
*fftgrid
)
4143 ivec local_fft_ndata
, local_fft_offset
, local_fft_size
;
4145 gmx_parallel_3dfft_real_limits(pme
->pfft_setupA
,
4151 pme
->pmegrid_start_ix
,
4152 pme
->pmegrid_start_iy
,
4153 pme
->pmegrid_start_iz
,
4154 pme
->pmegrid_nx
-pme
->pme_order
+1,
4155 pme
->pmegrid_ny
-pme
->pme_order
+1,
4156 pme
->pmegrid_nz
-pme
->pme_order
+1,
4163 void gmx_pme_calc_energy(gmx_pme_t pme
, int n
, rvec
*x
, real
*q
, real
*V
)
4165 pme_atomcomm_t
*atc
;
4168 if (pme
->nnodes
> 1)
4170 gmx_incons("gmx_pme_calc_energy called in parallel");
4174 gmx_incons("gmx_pme_calc_energy with free energy");
4177 atc
= &pme
->atc_energy
;
4179 if (atc
->spline
== NULL
)
4181 snew(atc
->spline
, atc
->nthread
);
4184 atc
->bSpread
= TRUE
;
4185 atc
->pme_order
= pme
->pme_order
;
4187 pme_realloc_atomcomm_things(atc
);
4191 /* We only use the A-charges grid */
4192 grid
= &pme
->pmegridA
;
4194 /* Only calculate the spline coefficients, don't actually spread */
4195 spread_on_grid(pme
, atc
, NULL
, TRUE
, FALSE
, pme
->fftgridA
);
4197 *V
= gather_energy_bsplines(pme
, grid
->grid
.grid
, atc
);
4201 static void reset_pmeonly_counters(t_commrec
*cr
, gmx_wallcycle_t wcycle
,
4202 gmx_runtime_t
*runtime
,
4203 t_nrnb
*nrnb
, t_inputrec
*ir
,
4204 gmx_large_int_t step
)
4206 /* Reset all the counters related to performance over the run */
4207 wallcycle_stop(wcycle
, ewcRUN
);
4208 wallcycle_reset_all(wcycle
);
4210 if (ir
->nsteps
>= 0)
4212 /* ir->nsteps is not used here, but we update it for consistency */
4213 ir
->nsteps
-= step
- ir
->init_step
;
4215 ir
->init_step
= step
;
4216 wallcycle_start(wcycle
, ewcRUN
);
4217 runtime_start(runtime
);
4221 static void gmx_pmeonly_switch(int *npmedata
, gmx_pme_t
**pmedata
,
4223 t_commrec
*cr
, t_inputrec
*ir
,
4227 gmx_pme_t pme
= NULL
;
4230 while (ind
< *npmedata
)
4232 pme
= (*pmedata
)[ind
];
4233 if (pme
->nkx
== grid_size
[XX
] &&
4234 pme
->nky
== grid_size
[YY
] &&
4235 pme
->nkz
== grid_size
[ZZ
])
4246 srenew(*pmedata
, *npmedata
);
4248 /* Generate a new PME data structure, copying part of the old pointers */
4249 gmx_pme_reinit(&((*pmedata
)[ind
]), cr
, pme
, ir
, grid_size
);
4251 *pme_ret
= (*pmedata
)[ind
];
4255 int gmx_pmeonly(gmx_pme_t pme
,
4256 t_commrec
*cr
, t_nrnb
*nrnb
,
4257 gmx_wallcycle_t wcycle
,
4258 gmx_runtime_t
*runtime
,
4259 real ewaldcoeff
, gmx_bool bGatherOnly
,
4264 gmx_pme_pp_t pme_pp
;
4268 rvec
*x_pp
= NULL
, *f_pp
= NULL
;
4269 real
*chargeA
= NULL
, *chargeB
= NULL
;
4271 int maxshift_x
= 0, maxshift_y
= 0;
4272 real energy
, dvdlambda
;
4277 gmx_large_int_t step
, step_rel
;
4280 /* This data will only use with PME tuning, i.e. switching PME grids */
4282 snew(pmedata
, npmedata
);
4285 pme_pp
= gmx_pme_pp_init(cr
);
4290 do /****** this is a quasi-loop over time steps! */
4292 /* The reason for having a loop here is PME grid tuning/switching */
4295 /* Domain decomposition */
4296 ret
= gmx_pme_recv_q_x(pme_pp
,
4298 &chargeA
, &chargeB
, box
, &x_pp
, &f_pp
,
4299 &maxshift_x
, &maxshift_y
,
4300 &pme
->bFEP
, &lambda
,
4303 grid_switch
, &ewaldcoeff
);
4305 if (ret
== pmerecvqxSWITCHGRID
)
4307 /* Switch the PME grid to grid_switch */
4308 gmx_pmeonly_switch(&npmedata
, &pmedata
, grid_switch
, cr
, ir
, &pme
);
4311 if (ret
== pmerecvqxRESETCOUNTERS
)
4313 /* Reset the cycle and flop counters */
4314 reset_pmeonly_counters(cr
, wcycle
, runtime
, nrnb
, ir
, step
);
4317 while (ret
== pmerecvqxSWITCHGRID
|| ret
== pmerecvqxRESETCOUNTERS
);
4319 if (ret
== pmerecvqxFINISH
)
4321 /* We should stop: break out of the loop */
4325 step_rel
= step
- ir
->init_step
;
4329 wallcycle_start(wcycle
, ewcRUN
);
4330 runtime_start(runtime
);
4333 wallcycle_start(wcycle
, ewcPMEMESH
);
4337 gmx_pme_do(pme
, 0, natoms
, x_pp
, f_pp
, chargeA
, chargeB
, box
,
4338 cr
, maxshift_x
, maxshift_y
, nrnb
, wcycle
, vir
, ewaldcoeff
,
4339 &energy
, lambda
, &dvdlambda
,
4340 GMX_PME_DO_ALL_F
| (bEnerVir
? GMX_PME_CALC_ENER_VIR
: 0));
4342 cycles
= wallcycle_stop(wcycle
, ewcPMEMESH
);
4344 gmx_pme_send_force_vir_ener(pme_pp
,
4345 f_pp
, vir
, energy
, dvdlambda
,
4349 } /***** end of quasi-loop, we stop with the break above */
4352 runtime_end(runtime
);
4357 int gmx_pme_do(gmx_pme_t pme
,
4358 int start
, int homenr
,
4360 real
*chargeA
, real
*chargeB
,
4361 matrix box
, t_commrec
*cr
,
4362 int maxshift_x
, int maxshift_y
,
4363 t_nrnb
*nrnb
, gmx_wallcycle_t wcycle
,
4364 matrix vir
, real ewaldcoeff
,
4365 real
*energy
, real lambda
,
4366 real
*dvdlambda
, int flags
)
4368 int q
, d
, i
, j
, ntot
, npme
;
4371 pme_atomcomm_t
*atc
= NULL
;
4372 pmegrids_t
*pmegrid
= NULL
;
4376 real
*charge
= NULL
, *q_d
;
4380 gmx_parallel_3dfft_t pfft_setup
;
4382 t_complex
* cfftgrid
;
4384 const gmx_bool bCalcEnerVir
= flags
& GMX_PME_CALC_ENER_VIR
;
4385 const gmx_bool bCalcF
= flags
& GMX_PME_CALC_F
;
4387 assert(pme
->nnodes
> 0);
4388 assert(pme
->nnodes
== 1 || pme
->ndecompdim
> 0);
4390 if (pme
->nnodes
> 1)
4394 if (atc
->npd
> atc
->pd_nalloc
)
4396 atc
->pd_nalloc
= over_alloc_dd(atc
->npd
);
4397 srenew(atc
->pd
, atc
->pd_nalloc
);
4399 atc
->maxshift
= (atc
->dimind
== 0 ? maxshift_x
: maxshift_y
);
4403 /* This could be necessary for TPI */
4404 pme
->atc
[0].n
= homenr
;
4407 for (q
= 0; q
< (pme
->bFEP
? 2 : 1); q
++)
4411 pmegrid
= &pme
->pmegridA
;
4412 fftgrid
= pme
->fftgridA
;
4413 cfftgrid
= pme
->cfftgridA
;
4414 pfft_setup
= pme
->pfft_setupA
;
4415 charge
= chargeA
+start
;
4419 pmegrid
= &pme
->pmegridB
;
4420 fftgrid
= pme
->fftgridB
;
4421 cfftgrid
= pme
->cfftgridB
;
4422 pfft_setup
= pme
->pfft_setupB
;
4423 charge
= chargeB
+start
;
4425 grid
= pmegrid
->grid
.grid
;
4426 /* Unpack structure */
4429 fprintf(debug
, "PME: nnodes = %d, nodeid = %d\n",
4430 cr
->nnodes
, cr
->nodeid
);
4431 fprintf(debug
, "Grid = %p\n", (void*)grid
);
4434 gmx_fatal(FARGS
, "No grid!");
4439 m_inv_ur0(box
, pme
->recipbox
);
4441 if (pme
->nnodes
== 1)
4444 if (DOMAINDECOMP(cr
))
4447 pme_realloc_atomcomm_things(atc
);
4455 wallcycle_start(wcycle
, ewcPME_REDISTXF
);
4456 for (d
= pme
->ndecompdim
-1; d
>= 0; d
--)
4458 if (d
== pme
->ndecompdim
-1)
4466 n_d
= pme
->atc
[d
+1].n
;
4472 if (atc
->npd
> atc
->pd_nalloc
)
4474 atc
->pd_nalloc
= over_alloc_dd(atc
->npd
);
4475 srenew(atc
->pd
, atc
->pd_nalloc
);
4477 atc
->maxshift
= (atc
->dimind
== 0 ? maxshift_x
: maxshift_y
);
4478 pme_calc_pidx_wrapper(n_d
, pme
->recipbox
, x_d
, atc
);
4481 GMX_BARRIER(cr
->mpi_comm_mygroup
);
4482 /* Redistribute x (only once) and qA or qB */
4483 if (DOMAINDECOMP(cr
))
4485 dd_pmeredist_x_q(pme
, n_d
, q
== 0, x_d
, q_d
, atc
);
4489 pmeredist_pd(pme
, TRUE
, n_d
, q
== 0, x_d
, q_d
, atc
);
4494 wallcycle_stop(wcycle
, ewcPME_REDISTXF
);
4499 fprintf(debug
, "Node= %6d, pme local particles=%6d\n",
4500 cr
->nodeid
, atc
->n
);
4503 if (flags
& GMX_PME_SPREAD_Q
)
4505 wallcycle_start(wcycle
, ewcPME_SPREADGATHER
);
4507 /* Spread the charges on a grid */
4508 GMX_MPE_LOG(ev_spread_on_grid_start
);
4510 /* Spread the charges on a grid */
4511 spread_on_grid(pme
, &pme
->atc
[0], pmegrid
, q
== 0, TRUE
, fftgrid
);
4512 GMX_MPE_LOG(ev_spread_on_grid_finish
);
4516 inc_nrnb(nrnb
, eNR_WEIGHTS
, DIM
*atc
->n
);
4518 inc_nrnb(nrnb
, eNR_SPREADQBSP
,
4519 pme
->pme_order
*pme
->pme_order
*pme
->pme_order
*atc
->n
);
4521 if (!pme
->bUseThreads
)
4523 wrap_periodic_pmegrid(pme
, grid
);
4525 /* sum contributions to local grid from other nodes */
4527 if (pme
->nnodes
> 1)
4529 GMX_BARRIER(cr
->mpi_comm_mygroup
);
4530 gmx_sum_qgrid_dd(pme
, grid
, GMX_SUM_QGRID_FORWARD
);
4535 copy_pmegrid_to_fftgrid(pme
, grid
, fftgrid
);
4538 wallcycle_stop(wcycle
, ewcPME_SPREADGATHER
);
4541 dump_local_fftgrid(pme,fftgrid);
4546 /* Here we start a large thread parallel region */
4547 #pragma omp parallel num_threads(pme->nthread) private(thread)
4549 thread
= gmx_omp_get_thread_num();
4550 if (flags
& GMX_PME_SOLVE
)
4557 GMX_BARRIER(cr
->mpi_comm_mygroup
);
4558 GMX_MPE_LOG(ev_gmxfft3d_start
);
4559 wallcycle_start(wcycle
, ewcPME_FFT
);
4561 gmx_parallel_3dfft_execute(pfft_setup
, GMX_FFT_REAL_TO_COMPLEX
,
4562 fftgrid
, cfftgrid
, thread
, wcycle
);
4565 wallcycle_stop(wcycle
, ewcPME_FFT
);
4566 GMX_MPE_LOG(ev_gmxfft3d_finish
);
4570 /* solve in k-space for our local cells */
4573 GMX_BARRIER(cr
->mpi_comm_mygroup
);
4574 GMX_MPE_LOG(ev_solve_pme_start
);
4575 wallcycle_start(wcycle
, ewcPME_SOLVE
);
4578 solve_pme_yzx(pme
, cfftgrid
, ewaldcoeff
,
4579 box
[XX
][XX
]*box
[YY
][YY
]*box
[ZZ
][ZZ
],
4581 pme
->nthread
, thread
);
4584 wallcycle_stop(wcycle
, ewcPME_SOLVE
);
4586 GMX_MPE_LOG(ev_solve_pme_finish
);
4587 inc_nrnb(nrnb
, eNR_SOLVEPME
, loop_count
);
4596 GMX_BARRIER(cr
->mpi_comm_mygroup
);
4597 GMX_MPE_LOG(ev_gmxfft3d_start
);
4599 wallcycle_start(wcycle
, ewcPME_FFT
);
4601 gmx_parallel_3dfft_execute(pfft_setup
, GMX_FFT_COMPLEX_TO_REAL
,
4602 cfftgrid
, fftgrid
, thread
, wcycle
);
4605 wallcycle_stop(wcycle
, ewcPME_FFT
);
4608 GMX_MPE_LOG(ev_gmxfft3d_finish
);
4610 if (pme
->nodeid
== 0)
4612 ntot
= pme
->nkx
*pme
->nky
*pme
->nkz
;
4613 npme
= ntot
*log((real
)ntot
)/log(2.0);
4614 inc_nrnb(nrnb
, eNR_FFT
, 2*npme
);
4617 wallcycle_start(wcycle
, ewcPME_SPREADGATHER
);
4620 copy_fftgrid_to_pmegrid(pme
, fftgrid
, grid
, pme
->nthread
, thread
);
4623 /* End of thread parallel section.
4624 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4629 /* distribute local grid to all nodes */
4631 if (pme
->nnodes
> 1)
4633 GMX_BARRIER(cr
->mpi_comm_mygroup
);
4634 gmx_sum_qgrid_dd(pme
, grid
, GMX_SUM_QGRID_BACKWARD
);
4639 unwrap_periodic_pmegrid(pme
, grid
);
4641 /* interpolate forces for our local atoms */
4642 GMX_BARRIER(cr
->mpi_comm_mygroup
);
4643 GMX_MPE_LOG(ev_gather_f_bsplines_start
);
4647 /* If we are running without parallelization,
4648 * atc->f is the actual force array, not a buffer,
4649 * therefore we should not clear it.
4651 bClearF
= (q
== 0 && PAR(cr
));
4652 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4653 for (thread
= 0; thread
< pme
->nthread
; thread
++)
4655 gather_f_bsplines(pme
, grid
, bClearF
, atc
,
4656 &atc
->spline
[thread
],
4657 pme
->bFEP
? (q
== 0 ? 1.0-lambda
: lambda
) : 1.0);
4662 GMX_MPE_LOG(ev_gather_f_bsplines_finish
);
4664 inc_nrnb(nrnb
, eNR_GATHERFBSP
,
4665 pme
->pme_order
*pme
->pme_order
*pme
->pme_order
*pme
->atc
[0].n
);
4666 wallcycle_stop(wcycle
, ewcPME_SPREADGATHER
);
4671 /* This should only be called on the master thread
4672 * and after the threads have synchronized.
4674 get_pme_ener_vir(pme
, pme
->nthread
, &energy_AB
[q
], vir_AB
[q
]);
4678 if (bCalcF
&& pme
->nnodes
> 1)
4680 wallcycle_start(wcycle
, ewcPME_REDISTXF
);
4681 for (d
= 0; d
< pme
->ndecompdim
; d
++)
4684 if (d
== pme
->ndecompdim
- 1)
4691 n_d
= pme
->atc
[d
+1].n
;
4692 f_d
= pme
->atc
[d
+1].f
;
4694 GMX_BARRIER(cr
->mpi_comm_mygroup
);
4695 if (DOMAINDECOMP(cr
))
4697 dd_pmeredist_f(pme
, atc
, n_d
, f_d
,
4698 d
== pme
->ndecompdim
-1 && pme
->bPPnode
);
4702 pmeredist_pd(pme
, FALSE
, n_d
, TRUE
, f_d
, NULL
, atc
);
4706 wallcycle_stop(wcycle
, ewcPME_REDISTXF
);
4714 *energy
= energy_AB
[0];
4715 m_add(vir
, vir_AB
[0], vir
);
4719 *energy
= (1.0-lambda
)*energy_AB
[0] + lambda
*energy_AB
[1];
4720 *dvdlambda
+= energy_AB
[1] - energy_AB
[0];
4721 for (i
= 0; i
< DIM
; i
++)
4723 for (j
= 0; j
< DIM
; j
++)
4725 vir
[i
][j
] += (1.0-lambda
)*vir_AB
[0][i
][j
] +
4726 lambda
*vir_AB
[1][i
][j
];
4738 fprintf(debug
, "PME mesh energy: %g\n", *energy
);