more fixes for TPI with PME, now really fixed bugzilla 590
[gromacs.git] / src / mdlib / pme.c
blob c30a47540a1bccb43c958d8888df921372abe09e
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 *
4 * This source code is part of
5 *
6 * G R O M A C S
7 *
8 * GROningen MAchine for Chemical Simulations
9 *
10 * VERSION 3.2.0
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
33 * And Hey:
34 * GROwing Monsters And Cloning Shrimps
36 /* IMPORTANT FOR DEVELOPERS:
38 * Triclinic pme stuff isn't entirely trivial, and we've experienced
39 * some bugs during development (many of them due to me). To avoid
40 * this in the future, please check the following things if you make
41 * changes in this file:
43 * 1. You should obtain identical (at least to the PME precision)
44 * energies, forces, and virial for
45 * a rectangular box and a triclinic one where the z (or y) axis is
46 * tilted a whole box side. For instance you could use these boxes:
48 * rectangular triclinic
49 * 2 0 0 2 0 0
50 * 0 2 0 0 2 0
51 * 0 0 6 2 2 6
53 * 2. You should check the energy conservation in a triclinic box.
55 * It might seem like overkill, but better safe than sorry.
56 * /Erik 001109
57 */
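/* Illustrative note (not from the original source): for the triclinic test
 * box above, the reciprocal box entries used throughout this file (so that
 * the fractional x coordinate is x[XX]*rxx + x[YY]*ryx + x[ZZ]*rzx, etc.)
 * are rxx = ryy = 1/2, rzz = 1/6, rzx = rzy = -1/6, with the remaining
 * off-diagonal terms zero. The rzx/rzy terms exercise the triclinic code
 * paths in calc_interpolation_idx and solve_pme_yzx; for the rectangular
 * box they vanish, which is what makes comparing the two a useful test.
 */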
59 #ifdef HAVE_CONFIG_H
60 #include <config.h>
61 #endif
63 #ifdef GMX_LIB_MPI
64 #include <mpi.h>
65 #endif
66 #ifdef GMX_THREADS
67 #include "tmpi.h"
68 #endif
71 #include <stdio.h>
72 #include <string.h>
73 #include <math.h>
74 #include "typedefs.h"
75 #include "txtdump.h"
76 #include "vec.h"
77 #include "gmxcomplex.h"
78 #include "smalloc.h"
79 #include "futil.h"
80 #include "coulomb.h"
81 #include "gmx_fatal.h"
82 #include "pme.h"
83 #include "network.h"
84 #include "physics.h"
85 #include "nrnb.h"
86 #include "copyrite.h"
87 #include "gmx_wallcycle.h"
88 #include "gmx_parallel_3dfft.h"
89 #include "pdbio.h"
91 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
92 #include "gmx_sse2_single.h"
93 #endif
95 #include "mpelogging.h"
97 #define DFT_TOL 1e-7
98 /* #define PRT_FORCE */
99 /* conditions for on-the-fly time measurement */
100 /* #define TAKETIME (step > 1 && timesteps < 10) */
101 #define TAKETIME FALSE
103 #ifdef GMX_DOUBLE
104 #define mpi_type MPI_DOUBLE
105 #else
106 #define mpi_type MPI_FLOAT
107 #endif
109 /* Internal datastructures */
110 typedef struct {
111 int send_index0;
112 int send_nindex;
113 int recv_index0;
114 int recv_nindex;
115 } pme_grid_comm_t;
117 typedef struct {
118 #ifdef GMX_MPI
119 MPI_Comm mpi_comm;
120 #endif
121 int nnodes,nodeid;
122 int *s2g0;
123 int *s2g1;
124 int noverlap_nodes;
125 int *send_id,*recv_id;
126 pme_grid_comm_t *comm_data;
127 } pme_overlap_t;
129 typedef struct {
130 int dimind; /* The index of the dimension, 0=x, 1=y */
131 int nslab;
132 int nodeid;
133 #ifdef GMX_MPI
134 MPI_Comm mpi_comm;
135 #endif
137 int *node_dest; /* The nodes to send x and q to with DD */
138 int *node_src; /* The nodes to receive x and q from with DD */
139 int *buf_index; /* Index for commnode into the buffers */
141 int maxshift;
143 int npd;
144 int pd_nalloc;
145 int *pd;
146 int *count; /* The number of atoms to send to each node */
147 int *rcount; /* The number of atoms to receive */
149 int n;
150 int nalloc;
151 rvec *x;
152 real *q;
153 rvec *f;
154 gmx_bool bSpread; /* These coordinates are used for spreading */
155 int pme_order;
156 splinevec theta,dtheta;
157 ivec *idx;
158 rvec *fractx; /* Fractional coordinate relative to the
159 * lower cell boundary
161 } pme_atomcomm_t;
163 typedef struct gmx_pme {
164 int ndecompdim; /* The number of decomposition dimensions */
165 int nodeid; /* Our nodeid in mpi->mpi_comm */
166 int nodeid_major;
167 int nodeid_minor;
168 int nnodes; /* The number of nodes doing PME */
169 int nnodes_major;
170 int nnodes_minor;
172 MPI_Comm mpi_comm;
173 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
174 #ifdef GMX_MPI
175 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
176 #endif
178 gmx_bool bPPnode; /* Node also does particle-particle forces */
179 gmx_bool bFEP; /* Compute Free energy contribution */
180 int nkx,nky,nkz; /* Grid dimensions */
181 int pme_order;
182 real epsilon_r;
184 real * pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
185 real * pmegridB;
186 int pmegrid_nx,pmegrid_ny,pmegrid_nz;
187 int pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
189 real * pmegrid_sendbuf;
190 real * pmegrid_recvbuf;
192 real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
193 real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
194 int fftgrid_nx,fftgrid_ny,fftgrid_nz;
196 t_complex *cfftgridA; /* Grids for complex FFT data */
197 t_complex *cfftgridB;
198 int cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
200 gmx_parallel_3dfft_t pfft_setupA;
201 gmx_parallel_3dfft_t pfft_setupB;
203 int *nnx,*nny,*nnz;
204 real *fshx,*fshy,*fshz;
206 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
207 matrix recipbox;
208 splinevec bsp_mod;
210 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
213 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
215 rvec *bufv; /* Communication buffer */
216 real *bufr; /* Communication buffer */
217 int buf_nalloc; /* The communication buffer size */
219 /* work data for solve_pme */
220 int work_nalloc;
221 real * work_mhx;
222 real * work_mhy;
223 real * work_mhz;
224 real * work_m2;
225 real * work_denom;
226 real * work_tmp1_alloc;
227 real * work_tmp1;
228 real * work_m2inv;
230 /* Work data for PME_redist */
231 gmx_bool redist_init;
232 int * scounts;
233 int * rcounts;
234 int * sdispls;
235 int * rdispls;
236 int * sidx;
237 int * idxa;
238 real * redist_buf;
239 int redist_buf_nalloc;
241 /* Work data for sum_qgrid */
242 real * sum_qgrid_tmp;
243 real * sum_qgrid_dd_tmp;
244 } t_gmx_pme;
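/* Minimal sketch (not part of the original source, kept out of the build):
 * the interpolation grids pmegridA/B declared above are flat arrays stored
 * x-major, so a local grid point (ix,iy,iz) is addressed with the expression
 * below; the same indexing appears inline in the spreading, wrapping and
 * gathering routines further down.
 */
#if 0
static int pmegrid_flat_index(const gmx_pme_t pme, int ix, int iy, int iz)
{
    /* pmegrid_nx/ny/nz include the pme_order-1 overlap region */
    return (ix*pme->pmegrid_ny + iy)*pme->pmegrid_nz + iz;
}
#endif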
247 static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc)
249 int i;
250 int *idxptr,tix,tiy,tiz;
251 real *xptr,*fptr,tx,ty,tz;
252 real rxx,ryx,ryy,rzx,rzy,rzz;
253 int nx,ny,nz;
254 int start_ix,start_iy,start_iz;
256 nx = pme->nkx;
257 ny = pme->nky;
258 nz = pme->nkz;
260 start_ix = pme->pmegrid_start_ix;
261 start_iy = pme->pmegrid_start_iy;
262 start_iz = pme->pmegrid_start_iz;
264 rxx = pme->recipbox[XX][XX];
265 ryx = pme->recipbox[YY][XX];
266 ryy = pme->recipbox[YY][YY];
267 rzx = pme->recipbox[ZZ][XX];
268 rzy = pme->recipbox[ZZ][YY];
269 rzz = pme->recipbox[ZZ][ZZ];
271 for(i=0; (i<atc->n); i++) {
272 xptr = atc->x[i];
273 idxptr = atc->idx[i];
274 fptr = atc->fractx[i];
276 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
277 tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
278 ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
279 tz = nz * ( xptr[ZZ] * rzz + 2.0 );
281 tix = (int)(tx);
282 tiy = (int)(ty);
283 tiz = (int)(tz);
285 /* Because decomposition only occurs in x and y,
286 * we never have a fraction correction in z.
288 fptr[XX] = tx - tix + pme->fshx[tix];
289 fptr[YY] = ty - tiy + pme->fshy[tiy];
290 fptr[ZZ] = tz - tiz;
292 idxptr[XX] = pme->nnx[tix];
293 idxptr[YY] = pme->nny[tiy];
294 idxptr[ZZ] = pme->nnz[tiz];
296 #ifdef DEBUG
297 range_check(idxptr[XX],0,pme->pmegrid_nx);
298 range_check(idxptr[YY],0,pme->pmegrid_ny);
299 range_check(idxptr[ZZ],0,pme->pmegrid_nz);
300 #endif
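/* Worked example (illustrative, not from the original source): for a
 * rectangular 2 x 2 x 6 box, rxx = 0.5; with nx = 32 and an atom at
 * x[XX] = 1.55 this gives tx = 32*(1.55*0.5 + 2.0) = 88.8, so tix = 88,
 * the fraction is 0.8 (plus any shift from fshx), and on a single node
 * nnx[88] folds the index back into 0..nx-1, here 24. The +2.0 only
 * guarantees a positive argument before truncation for triclinic boxes;
 * it is undone again by the modulo tables nnx/nny/nnz, which is why those
 * tables have 5*n entries.
 */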
304 static void pme_calc_pidx(int natoms, matrix recipbox, rvec x[],
305 pme_atomcomm_t *atc)
307 int nslab,i;
308 int si;
309 real *xptr,s;
310 real rxx,ryx,rzx,ryy,rzy;
311 int *pd,*count;
313 /* Calculate the PME task index (pidx) for each atom.
314 * Here we always assign equally sized slabs to each node
315 * for load balancing reasons (the PME grid spacing is not used).
318 nslab = atc->nslab;
319 pd = atc->pd;
320 count = atc->count;
322 /* Reset the count */
323 for(i=0; i<nslab; i++)
325 count[i] = 0;
328 if (atc->dimind == 0)
330 rxx = recipbox[XX][XX];
331 ryx = recipbox[YY][XX];
332 rzx = recipbox[ZZ][XX];
333 /* Calculate the node index in x-dimension */
334 for(i=0; (i<natoms); i++)
336 xptr = x[i];
337 /* Fractional coordinates along box vectors */
338 s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
339 si = (int)(s + 2*nslab) % nslab;
340 pd[i] = si;
341 count[si]++;
344 else
346 ryy = recipbox[YY][YY];
347 rzy = recipbox[ZZ][YY];
348 /* Calculate the node index in y-dimension */
349 for(i=0; (i<natoms); i++)
351 xptr = x[i];
352 /* Fractional coordinates along box vectors */
353 s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
354 si = (int)(s + 2*nslab) % nslab;
355 pd[i] = si;
356 count[si]++;
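/* Worked example (illustrative): with nslab = 4 and a slab coordinate
 * s = -0.3 (possible for triclinic boxes or atoms slightly outside the box),
 * (int)(s + 2*nslab) = (int)7.7 = 7 and 7 % 4 = 3, so the atom is assigned
 * to the last slab. Adding 2*nslab before truncating keeps the argument
 * positive; a plain (int)s would incorrectly round -0.3 to slab 0.
 */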
361 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
363 int nalloc_old,i;
365 if (atc->n > atc->nalloc) {
366 nalloc_old = atc->nalloc;
367 atc->nalloc = over_alloc_dd(atc->n);
369 if (atc->nslab > 1) {
370 srenew(atc->x,atc->nalloc);
371 srenew(atc->q,atc->nalloc);
372 srenew(atc->f,atc->nalloc);
373 for(i=nalloc_old; i<atc->nalloc; i++)
375 clear_rvec(atc->f[i]);
378 if (atc->bSpread) {
379 for(i=0;i<DIM;i++) {
380 srenew(atc->theta[i] ,atc->pme_order*atc->nalloc);
381 srenew(atc->dtheta[i],atc->pme_order*atc->nalloc);
383 srenew(atc->fractx,atc->nalloc);
384 srenew(atc->idx ,atc->nalloc);
389 static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
390 int n, gmx_bool bXF, rvec *x_f, real *charge,
391 pme_atomcomm_t *atc)
392 /* Redistribute particle data for PME calculation */
393 /* domain decomposition by x coordinate */
395 int *idxa;
396 int i, ii;
398 if(FALSE == pme->redist_init) {
399 snew(pme->scounts,atc->nslab);
400 snew(pme->rcounts,atc->nslab);
401 snew(pme->sdispls,atc->nslab);
402 snew(pme->rdispls,atc->nslab);
403 snew(pme->sidx,atc->nslab);
404 pme->redist_init = TRUE;
406 if (n > pme->redist_buf_nalloc) {
407 pme->redist_buf_nalloc = over_alloc_dd(n);
408 srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
411 pme->idxa = atc->pd;
413 #ifdef GMX_MPI
414 if (forw && bXF) {
415 /* forward, redistribution from pp to pme */
417 /* Calculate send counts and exchange them with other nodes */
418 for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
419 for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
420 MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
422 /* Calculate send and receive displacements and index into send
423 buffer */
424 pme->sdispls[0]=0;
425 pme->rdispls[0]=0;
426 pme->sidx[0]=0;
427 for(i=1; i<atc->nslab; i++) {
428 pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
429 pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
430 pme->sidx[i]=pme->sdispls[i];
432 /* Total # of particles to be received */
433 atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
435 pme_realloc_atomcomm_things(atc);
437 /* Copy particle coordinates into send buffer and exchange*/
438 for(i=0; (i<n); i++) {
439 ii=DIM*pme->sidx[pme->idxa[i]];
440 pme->sidx[pme->idxa[i]]++;
441 pme->redist_buf[ii+XX]=x_f[i][XX];
442 pme->redist_buf[ii+YY]=x_f[i][YY];
443 pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
445 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
446 pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
447 pme->rvec_mpi, atc->mpi_comm);
449 if (forw) {
450 /* Copy charge into send buffer and exchange*/
451 for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
452 for(i=0; (i<n); i++) {
453 ii=pme->sidx[pme->idxa[i]];
454 pme->sidx[pme->idxa[i]]++;
455 pme->redist_buf[ii]=charge[i];
457 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
458 atc->q, pme->rcounts, pme->rdispls, mpi_type,
459 atc->mpi_comm);
461 else { /* backward, redistribution from pme to pp */
462 MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
463 pme->redist_buf, pme->scounts, pme->sdispls,
464 pme->rvec_mpi, atc->mpi_comm);
466 /* Copy data from receive buffer */
467 for(i=0; i<atc->nslab; i++)
468 pme->sidx[i] = pme->sdispls[i];
469 for(i=0; (i<n); i++) {
470 ii = DIM*pme->sidx[pme->idxa[i]];
471 x_f[i][XX] += pme->redist_buf[ii+XX];
472 x_f[i][YY] += pme->redist_buf[ii+YY];
473 x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
474 pme->sidx[pme->idxa[i]]++;
477 #endif
480 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
481 gmx_bool bBackward,int shift,
482 void *buf_s,int nbyte_s,
483 void *buf_r,int nbyte_r)
485 #ifdef GMX_MPI
486 int dest,src;
487 MPI_Status stat;
489 if (bBackward == FALSE) {
490 dest = atc->node_dest[shift];
491 src = atc->node_src[shift];
492 } else {
493 dest = atc->node_src[shift];
494 src = atc->node_dest[shift];
497 if (nbyte_s > 0 && nbyte_r > 0) {
498 MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
499 dest,shift,
500 buf_r,nbyte_r,MPI_BYTE,
501 src,shift,
502 atc->mpi_comm,&stat);
503 } else if (nbyte_s > 0) {
504 MPI_Send(buf_s,nbyte_s,MPI_BYTE,
505 dest,shift,
506 atc->mpi_comm);
507 } else if (nbyte_r > 0) {
508 MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
509 src,shift,
510 atc->mpi_comm,&stat);
512 #endif
515 static void dd_pmeredist_x_q(gmx_pme_t pme,
516 int n, gmx_bool bX, rvec *x, real *charge,
517 pme_atomcomm_t *atc)
519 int *commnode,*buf_index;
520 int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
522 commnode = atc->node_dest;
523 buf_index = atc->buf_index;
525 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
527 nsend = 0;
528 for(i=0; i<nnodes_comm; i++) {
529 buf_index[commnode[i]] = nsend;
530 nsend += atc->count[commnode[i]];
532 if (bX) {
533 if (atc->count[atc->nodeid] + nsend != n)
534 gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
535 "This usually means that your system is not well equilibrated.",
536 n - (atc->count[atc->nodeid] + nsend),
537 pme->nodeid,'x'+atc->dimind);
539 if (nsend > pme->buf_nalloc) {
540 pme->buf_nalloc = over_alloc_dd(nsend);
541 srenew(pme->bufv,pme->buf_nalloc);
542 srenew(pme->bufr,pme->buf_nalloc);
545 atc->n = atc->count[atc->nodeid];
546 for(i=0; i<nnodes_comm; i++) {
547 scount = atc->count[commnode[i]];
548 /* Communicate the count */
549 if (debug)
550 fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
551 atc->dimind,atc->nodeid,commnode[i],scount);
552 pme_dd_sendrecv(atc,FALSE,i,
553 &scount,sizeof(int),
554 &atc->rcount[i],sizeof(int));
555 atc->n += atc->rcount[i];
558 pme_realloc_atomcomm_things(atc);
561 local_pos = 0;
562 for(i=0; i<n; i++) {
563 node = atc->pd[i];
564 if (node == atc->nodeid) {
565 /* Copy direct to the receive buffer */
566 if (bX) {
567 copy_rvec(x[i],atc->x[local_pos]);
569 atc->q[local_pos] = charge[i];
570 local_pos++;
571 } else {
572 /* Copy to the send buffer */
573 if (bX) {
574 copy_rvec(x[i],pme->bufv[buf_index[node]]);
576 pme->bufr[buf_index[node]] = charge[i];
577 buf_index[node]++;
581 buf_pos = 0;
582 for(i=0; i<nnodes_comm; i++) {
583 scount = atc->count[commnode[i]];
584 rcount = atc->rcount[i];
585 if (scount > 0 || rcount > 0) {
586 if (bX) {
587 /* Communicate the coordinates */
588 pme_dd_sendrecv(atc,FALSE,i,
589 pme->bufv[buf_pos],scount*sizeof(rvec),
590 atc->x[local_pos],rcount*sizeof(rvec));
592 /* Communicate the charges */
593 pme_dd_sendrecv(atc,FALSE,i,
594 pme->bufr+buf_pos,scount*sizeof(real),
595 atc->q+local_pos,rcount*sizeof(real));
596 buf_pos += scount;
597 local_pos += atc->rcount[i];
602 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
603 int n, rvec *f,
604 gmx_bool bAddF)
606 int *commnode,*buf_index;
607 int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node;
609 commnode = atc->node_dest;
610 buf_index = atc->buf_index;
612 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
614 local_pos = atc->count[atc->nodeid];
615 buf_pos = 0;
616 for(i=0; i<nnodes_comm; i++) {
617 scount = atc->rcount[i];
618 rcount = atc->count[commnode[i]];
619 if (scount > 0 || rcount > 0) {
620 /* Communicate the forces */
621 pme_dd_sendrecv(atc,TRUE,i,
622 atc->f[local_pos],scount*sizeof(rvec),
623 pme->bufv[buf_pos],rcount*sizeof(rvec));
624 local_pos += scount;
626 buf_index[commnode[i]] = buf_pos;
627 buf_pos += rcount;
630 local_pos = 0;
631 if (bAddF)
633 for(i=0; i<n; i++)
635 node = atc->pd[i];
636 if (node == atc->nodeid)
638 /* Add from the local force array */
639 rvec_inc(f[i],atc->f[local_pos]);
640 local_pos++;
642 else
644 /* Add from the receive buffer */
645 rvec_inc(f[i],pme->bufv[buf_index[node]]);
646 buf_index[node]++;
650 else
652 for(i=0; i<n; i++)
654 node = atc->pd[i];
655 if (node == atc->nodeid)
657 /* Copy from the local force array */
658 copy_rvec(atc->f[local_pos],f[i]);
659 local_pos++;
661 else
663 /* Copy from the receive buffer */
664 copy_rvec(pme->bufv[buf_index[node]],f[i]);
665 buf_index[node]++;
671 #ifdef GMX_MPI
672 static void
673 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
675 pme_overlap_t *overlap;
676 int send_index0,send_nindex;
677 int recv_index0,recv_nindex;
678 MPI_Status stat;
679 int i,j,k,ix,iy,iz,icnt;
680 int ipulse,send_id,recv_id,datasize;
681 real *p;
682 real *sendptr,*recvptr;
684 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
685 overlap = &pme->overlap[1];
687 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
689 /* Since we have already (un)wrapped the overlap in the z-dimension,
690 * we only have to communicate 0 to nkz (not pmegrid_nz).
692 if (direction==GMX_SUM_QGRID_FORWARD)
694 send_id = overlap->send_id[ipulse];
695 recv_id = overlap->recv_id[ipulse];
696 send_index0 = overlap->comm_data[ipulse].send_index0;
697 send_nindex = overlap->comm_data[ipulse].send_nindex;
698 recv_index0 = overlap->comm_data[ipulse].recv_index0;
699 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
701 else
703 send_id = overlap->recv_id[ipulse];
704 recv_id = overlap->send_id[ipulse];
705 send_index0 = overlap->comm_data[ipulse].recv_index0;
706 send_nindex = overlap->comm_data[ipulse].recv_nindex;
707 recv_index0 = overlap->comm_data[ipulse].send_index0;
708 recv_nindex = overlap->comm_data[ipulse].send_nindex;
711 /* Copy data to contiguous send buffer */
712 if (debug)
714 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
715 pme->nodeid,overlap->nodeid,send_id,
716 pme->pmegrid_start_iy,
717 send_index0-pme->pmegrid_start_iy,
718 send_index0-pme->pmegrid_start_iy+send_nindex);
720 icnt = 0;
721 for(i=0;i<pme->pmegrid_nx;i++)
723 ix = i;
724 for(j=0;j<send_nindex;j++)
726 iy = j + send_index0 - pme->pmegrid_start_iy;
727 for(k=0;k<pme->nkz;k++)
729 iz = k;
730 pme->pmegrid_sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
735 datasize = pme->pmegrid_nx * pme->nkz;
737 MPI_Sendrecv(pme->pmegrid_sendbuf,send_nindex*datasize,GMX_MPI_REAL,
738 send_id,ipulse,
739 pme->pmegrid_recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
740 recv_id,ipulse,
741 overlap->mpi_comm,&stat);
743 /* Get data from contiguous recv buffer */
744 if (debug)
746 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
747 pme->nodeid,overlap->nodeid,recv_id,
748 pme->pmegrid_start_iy,
749 recv_index0-pme->pmegrid_start_iy,
750 recv_index0-pme->pmegrid_start_iy+recv_nindex);
752 icnt = 0;
753 for(i=0;i<pme->pmegrid_nx;i++)
755 ix = i;
756 for(j=0;j<recv_nindex;j++)
758 iy = j + recv_index0 - pme->pmegrid_start_iy;
759 for(k=0;k<pme->nkz;k++)
761 iz = k;
762 if(direction==GMX_SUM_QGRID_FORWARD)
764 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += pme->pmegrid_recvbuf[icnt++];
766 else
768 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = pme->pmegrid_recvbuf[icnt++];
775 /* Major dimension is easier, no copying required,
776 * but we might have to sum to a separate array.
777 * Since we don't copy, we have to communicate up to pmegrid_nz,
778 * not nkz as for the minor direction.
780 overlap = &pme->overlap[0];
782 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
784 if(direction==GMX_SUM_QGRID_FORWARD)
786 send_id = overlap->send_id[ipulse];
787 recv_id = overlap->recv_id[ipulse];
788 send_index0 = overlap->comm_data[ipulse].send_index0;
789 send_nindex = overlap->comm_data[ipulse].send_nindex;
790 recv_index0 = overlap->comm_data[ipulse].recv_index0;
791 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
792 recvptr = pme->pmegrid_recvbuf;
794 else
796 send_id = overlap->recv_id[ipulse];
797 recv_id = overlap->send_id[ipulse];
798 send_index0 = overlap->comm_data[ipulse].recv_index0;
799 send_nindex = overlap->comm_data[ipulse].recv_nindex;
800 recv_index0 = overlap->comm_data[ipulse].send_index0;
801 recv_nindex = overlap->comm_data[ipulse].send_nindex;
802 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
805 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
806 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
808 if (debug)
810 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
811 pme->nodeid,overlap->nodeid,send_id,
812 pme->pmegrid_start_ix,
813 send_index0-pme->pmegrid_start_ix,
814 send_index0-pme->pmegrid_start_ix+send_nindex);
815 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
816 pme->nodeid,overlap->nodeid,recv_id,
817 pme->pmegrid_start_ix,
818 recv_index0-pme->pmegrid_start_ix,
819 recv_index0-pme->pmegrid_start_ix+recv_nindex);
822 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
823 send_id,ipulse,
824 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
825 recv_id,ipulse,
826 overlap->mpi_comm,&stat);
828 /* ADD data from contiguous recv buffer */
829 if(direction==GMX_SUM_QGRID_FORWARD)
831 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
832 for(i=0;i<recv_nindex*datasize;i++)
834 p[i] += pme->pmegrid_recvbuf[i];
839 #endif
842 static int
843 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
845 ivec local_fft_ndata,local_fft_offset,local_fft_size;
846 ivec local_pme_size;
847 int i,ix,iy,iz;
848 int pmeidx,fftidx;
850 /* Dimensions should be identical for A/B grid, so we just use A here */
851 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
852 local_fft_ndata,
853 local_fft_offset,
854 local_fft_size);
856 local_pme_size[0] = pme->pmegrid_nx;
857 local_pme_size[1] = pme->pmegrid_ny;
858 local_pme_size[2] = pme->pmegrid_nz;
860 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
861 the offset is identical, and the PME grid always has more data (due to overlap)
864 #ifdef DEBUG_PME
865 FILE *fp,*fp2;
866 char fn[STRLEN],format[STRLEN];
867 real val;
868 sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
869 fp = ffopen(fn,"w");
870 sprintf(fn,"pmegrid%d.txt",pme->nodeid);
871 fp2 = ffopen(fn,"w");
872 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
873 #endif
874 for(ix=0;ix<local_fft_ndata[XX];ix++)
876 for(iy=0;iy<local_fft_ndata[YY];iy++)
878 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
880 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
881 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
882 fftgrid[fftidx] = pmegrid[pmeidx];
883 #ifdef DEBUG_PME
884 val = 100*pmegrid[pmeidx];
885 if (pmegrid[pmeidx] != 0)
886 fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
887 5.0*ix,5.0*iy,5.0*iz,1.0,val);
888 if (pmegrid[pmeidx] != 0)
889 fprintf(fp2,"%-12s %5d %5d %5d %12.5e\n",
890 "qgrid",
891 pme->pmegrid_start_ix + ix,
892 pme->pmegrid_start_iy + iy,
893 pme->pmegrid_start_iz + iz,
894 pmegrid[pmeidx]);
895 #endif
899 #ifdef DEBUG_PME
900 fclose(fp);
901 fclose(fp2);
902 #endif
904 return 0;
908 static int
909 copy_fftgrid_to_pmegrid(gmx_pme_t pme, real *fftgrid, real *pmegrid)
911 ivec local_fft_ndata,local_fft_offset,local_fft_size;
912 ivec local_pme_size;
913 int i,ix,iy,iz;
914 int pmeidx,fftidx;
916 /* Dimensions should be identical for A/B grid, so we just use A here */
917 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
918 local_fft_ndata,
919 local_fft_offset,
920 local_fft_size);
922 local_pme_size[0] = pme->pmegrid_nx;
923 local_pme_size[1] = pme->pmegrid_ny;
924 local_pme_size[2] = pme->pmegrid_nz;
926 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
927 the offset is identical, and the PME grid always has more data (due to overlap)
929 for(ix=0;ix<local_fft_ndata[XX];ix++)
931 for(iy=0;iy<local_fft_ndata[YY];iy++)
933 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
935 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
936 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
937 pmegrid[pmeidx] = fftgrid[fftidx];
941 return 0;
945 static void
946 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
948 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
950 nx = pme->nkx;
951 ny = pme->nky;
952 nz = pme->nkz;
954 pnx = pme->pmegrid_nx;
955 pny = pme->pmegrid_ny;
956 pnz = pme->pmegrid_nz;
958 overlap = pme->pme_order - 1;
960 /* Add periodic overlap in z */
961 for(ix=0; ix<pnx; ix++)
963 for(iy=0; iy<pny; iy++)
965 for(iz=0; iz<overlap; iz++)
967 pmegrid[(ix*pny+iy)*pnz+iz] +=
968 pmegrid[(ix*pny+iy)*pnz+nz+iz];
973 if (pme->nnodes_minor == 1)
975 for(ix=0; ix<pnx; ix++)
977 for(iy=0; iy<overlap; iy++)
979 for(iz=0; iz<nz; iz++)
981 pmegrid[(ix*pny+iy)*pnz+iz] +=
982 pmegrid[(ix*pny+ny+iy)*pnz+iz];
988 if (pme->nnodes_major == 1)
990 ny_x = (pme->nnodes_minor == 1 ? ny : pny);
992 for(ix=0; ix<overlap; ix++)
994 for(iy=0; iy<ny_x; iy++)
996 for(iz=0; iz<nz; iz++)
998 pmegrid[(ix*pny+iy)*pnz+iz] +=
999 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1007 static void
1008 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1010 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
1012 nx = pme->nkx;
1013 ny = pme->nky;
1014 nz = pme->nkz;
1016 pnx = pme->pmegrid_nx;
1017 pny = pme->pmegrid_ny;
1018 pnz = pme->pmegrid_nz;
1020 overlap = pme->pme_order - 1;
1022 if (pme->nnodes_major == 1)
1024 ny_x = (pme->nnodes_minor == 1 ? ny : pny);
1026 for(ix=0; ix<overlap; ix++)
1028 for(iy=0; iy<ny_x; iy++)
1030 for(iz=0; iz<nz; iz++)
1032 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1033 pmegrid[(ix*pny+iy)*pnz+iz];
1039 if (pme->nnodes_minor == 1)
1041 for(ix=0; ix<pnx; ix++)
1043 for(iy=0; iy<overlap; iy++)
1045 for(iz=0; iz<nz; iz++)
1047 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1048 pmegrid[(ix*pny+iy)*pnz+iz];
1054 /* Copy periodic overlap in z */
1055 for(ix=0; ix<pnx; ix++)
1057 for(iy=0; iy<pny; iy++)
1059 for(iz=0; iz<overlap; iz++)
1061 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1062 pmegrid[(ix*pny+iy)*pnz+iz];
1069 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1070 #define DO_BSPLINE(order) \
1071 for(ithx=0; (ithx<order); ithx++) \
1073 index_x = (i0+ithx)*pny*pnz; \
1074 valx = qn*thx[ithx]; \
1076 for(ithy=0; (ithy<order); ithy++) \
1078 valxy = valx*thy[ithy]; \
1079 index_xy = index_x+(j0+ithy)*pnz; \
1081 for(ithz=0; (ithz<order); ithz++) \
1083 index_xyz = index_xy+(k0+ithz); \
1084 grid[index_xyz] += valxy*thz[ithz]; \
1090 static void spread_q_bsplines(gmx_pme_t pme, pme_atomcomm_t *atc,
1091 real *grid)
1094 /* spread charges from home atoms to local grid */
1095 pme_overlap_t *ol;
1096 int b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
1097 int * idxptr;
1098 int order,norder,index_x,index_xy,index_xyz;
1099 real valx,valxy,qn;
1100 real *thx,*thy,*thz;
1101 int localsize, bndsize;
1103 int pnx,pny,pnz,ndatatot;
1105 pnx = pme->pmegrid_nx;
1106 pny = pme->pmegrid_ny;
1107 pnz = pme->pmegrid_nz;
1108 ndatatot = pnx*pny*pnz;
1110 for(i=0;i<ndatatot;i++)
1112 grid[i] = 0;
1115 order = pme->pme_order;
1117 for(nn=0; (nn<atc->n);nn++)
1119 n = nn;
1120 qn = atc->q[n];
1122 if (qn != 0)
1124 idxptr = atc->idx[n];
1125 norder = n*order;
1127 i0 = idxptr[XX];
1128 j0 = idxptr[YY];
1129 k0 = idxptr[ZZ];
1130 thx = atc->theta[XX] + norder;
1131 thy = atc->theta[YY] + norder;
1132 thz = atc->theta[ZZ] + norder;
1134 switch (order) {
1135 case 4: DO_BSPLINE(4); break;
1136 case 5: DO_BSPLINE(5); break;
1137 default: DO_BSPLINE(order); break;
1144 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
1145 /* Calculate exponentials through SSE in float precision */
1146 #define CALC_EXPONENTIALS(start,end,r_aligned) \
1148 __m128 tmp_sse; \
1149 for(kx=0; kx<end; kx+=4) \
1151 tmp_sse = _mm_load_ps(r_aligned+kx); \
1152 tmp_sse = gmx_mm_exp_ps(tmp_sse); \
1153 _mm_store_ps(r_aligned+kx,tmp_sse); \
1156 #else
1157 #define CALC_EXPONENTIALS(start,end,r) \
1158 for(kx=start; kx<end; kx++) \
1160 r[kx] = exp(r[kx]); \
1162 #endif
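/* Note on the SSE variant above (illustrative, not from the original
 * source): it ignores the start argument, always runs kx from 0 in steps of
 * four and uses aligned loads/stores, so the array it is applied to must be
 * 16-byte aligned and padded beyond 'end'. That is exactly what gmx_pme_init
 * arranges for work_tmp1 (allocated with 8 extra elements and aligned to a
 * 16-byte boundary), the only array this macro is used on in solve_pme_yzx.
 */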
1165 static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
1166 real ewaldcoeff,real vol,
1167 gmx_bool bEnerVir,real *mesh_energy,matrix vir)
1169 /* do recip sum over local cells in grid */
1170 /* y major, z middle, x minor or continuous */
1171 t_complex *p0;
1172 int kx,ky,kz,maxkx,maxky,maxkz;
1173 int nx,ny,nz,iy,iz,kxstart,kxend;
1174 real mx,my,mz;
1175 real factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1176 real ets2,struct2,vfactor,ets2vf;
1177 real eterm,d1,d2,energy=0;
1178 real by,bz;
1179 real virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
1180 real rxx,ryx,ryy,rzx,rzy,rzz;
1181 real *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*m2inv;
1182 real mhxk,mhyk,mhzk,m2k;
1183 real corner_fac;
1184 ivec complex_order;
1185 ivec local_ndata,local_offset,local_size;
1187 nx = pme->nkx;
1188 ny = pme->nky;
1189 nz = pme->nkz;
1191 /* Dimensions should be identical for A/B grid, so we just use A here */
1192 gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
1193 complex_order,
1194 local_ndata,
1195 local_offset,
1196 local_size);
1198 rxx = pme->recipbox[XX][XX];
1199 ryx = pme->recipbox[YY][XX];
1200 ryy = pme->recipbox[YY][YY];
1201 rzx = pme->recipbox[ZZ][XX];
1202 rzy = pme->recipbox[ZZ][YY];
1203 rzz = pme->recipbox[ZZ][ZZ];
1205 maxkx = (nx+1)/2;
1206 maxky = (ny+1)/2;
1207 maxkz = nz/2+1;
1209 mhx = pme->work_mhx;
1210 mhy = pme->work_mhy;
1211 mhz = pme->work_mhz;
1212 m2 = pme->work_m2;
1213 denom = pme->work_denom;
1214 tmp1 = pme->work_tmp1;
1215 m2inv = pme->work_m2inv;
1217 for(iy=0;iy<local_ndata[YY];iy++)
1219 ky = iy + local_offset[YY];
1221 if (ky < maxky)
1223 my = ky;
1225 else
1227 my = (ky - ny);
1230 by = M_PI*vol*pme->bsp_mod[YY][ky];
1232 for(iz=0;iz<local_ndata[ZZ];iz++)
1234 kz = iz + local_offset[ZZ];
1236 mz = kz;
1238 bz = pme->bsp_mod[ZZ][kz];
1240 /* 0.5 correction for corner points */
1241 corner_fac = 1;
1242 if (kz == 0)
1243 corner_fac = 0.5;
1244 if (kz == (nz+1)/2)
1245 corner_fac = 0.5;
1247 p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
1249 /* We should skip the k-space point (0,0,0) */
1250 if (local_offset[XX] > 0 ||
1251 local_offset[YY] > 0 || ky > 0 ||
1252 kz > 0)
1254 kxstart = local_offset[XX];
1256 else
1258 kxstart = local_offset[XX] + 1;
1259 p0++;
1261 kxend = local_offset[XX] + local_ndata[XX];
1263 if (bEnerVir)
1265 /* More expensive inner loop, especially because of the storage
1266 * of the mh elements in arrays.
1267 * Because x is the minor grid index, all mh elements
1268 * depend on kx for triclinic unit cells.
1271 /* Two explicit loops to avoid a conditional inside the loop */
1272 for(kx=kxstart; kx<maxkx; kx++)
1274 mx = kx;
1276 mhxk = mx * rxx;
1277 mhyk = mx * ryx + my * ryy;
1278 mhzk = mx * rzx + my * rzy + mz * rzz;
1279 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1280 mhx[kx] = mhxk;
1281 mhy[kx] = mhyk;
1282 mhz[kx] = mhzk;
1283 m2[kx] = m2k;
1284 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1285 tmp1[kx] = -factor*m2k;
1288 for(kx=maxkx; kx<kxend; kx++)
1290 mx = (kx - nx);
1292 mhxk = mx * rxx;
1293 mhyk = mx * ryx + my * ryy;
1294 mhzk = mx * rzx + my * rzy + mz * rzz;
1295 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1296 mhx[kx] = mhxk;
1297 mhy[kx] = mhyk;
1298 mhz[kx] = mhzk;
1299 m2[kx] = m2k;
1300 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1301 tmp1[kx] = -factor*m2k;
1304 for(kx=kxstart; kx<kxend; kx++)
1306 m2inv[kx] = 1.0/m2[kx];
1308 for(kx=kxstart; kx<kxend; kx++)
1310 denom[kx] = 1.0/denom[kx];
1313 CALC_EXPONENTIALS(kxstart,kxend,tmp1);
1315 for(kx=kxstart; kx<kxend; kx++,p0++)
1317 d1 = p0->re;
1318 d2 = p0->im;
1320 eterm = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
1322 p0->re = d1*eterm;
1323 p0->im = d2*eterm;
1325 struct2 = 2.0*(d1*d1+d2*d2);
1327 tmp1[kx] = eterm*struct2;
1330 for(kx=kxstart; kx<kxend; kx++)
1332 ets2 = corner_fac*tmp1[kx];
1333 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
1334 energy += ets2;
1336 ets2vf = ets2*vfactor;
1337 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
1338 virxy += ets2vf*mhx[kx]*mhy[kx];
1339 virxz += ets2vf*mhx[kx]*mhz[kx];
1340 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
1341 viryz += ets2vf*mhy[kx]*mhz[kx];
1342 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
1345 else
1347 /* We don't need to calculate the energy and the virial.
1348 * In this case the triclinic overhead is small.
1351 /* Two explicit loops to avoid a conditional inside the loop */
1353 for(kx=kxstart; kx<maxkx; kx++)
1355 mx = kx;
1357 mhxk = mx * rxx;
1358 mhyk = mx * ryx + my * ryy;
1359 mhzk = mx * rzx + my * rzy + mz * rzz;
1360 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1361 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1362 tmp1[kx] = -factor*m2k;
1365 for(kx=maxkx; kx<kxend; kx++)
1367 mx = (kx - nx);
1369 mhxk = mx * rxx;
1370 mhyk = mx * ryx + my * ryy;
1371 mhzk = mx * rzx + my * rzy + mz * rzz;
1372 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1373 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1374 tmp1[kx] = -factor*m2k;
1377 for(kx=kxstart; kx<kxend; kx++)
1379 denom[kx] = 1.0/denom[kx];
1382 CALC_EXPONENTIALS(kxstart,kxend,tmp1);
1384 for(kx=kxstart; kx<kxend; kx++,p0++)
1386 d1 = p0->re;
1387 d2 = p0->im;
1389 eterm = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
1391 p0->re = d1*eterm;
1392 p0->im = d2*eterm;
1398 if (bEnerVir)
1400 /* Update virial with local values.
1401 * The virial is symmetric by definition.
1402 * this virial seems ok for isotropic scaling, but I'm
1403 * experiencing problems on semiisotropic membranes.
1404 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
1406 vir[XX][XX] = 0.25*virxx;
1407 vir[YY][YY] = 0.25*viryy;
1408 vir[ZZ][ZZ] = 0.25*virzz;
1409 vir[XX][YY] = vir[YY][XX] = 0.25*virxy;
1410 vir[XX][ZZ] = vir[ZZ][XX] = 0.25*virxz;
1411 vir[YY][ZZ] = vir[ZZ][YY] = 0.25*viryz;
1413 /* This energy should be corrected for a charged system */
1414 *mesh_energy = 0.5*energy;
1417 /* Return the loop count */
1418 return local_ndata[YY]*local_ndata[ZZ]*local_ndata[XX];
1422 #define DO_FSPLINE(order) \
1423 for(ithx=0; (ithx<order); ithx++) \
1425 index_x = (i0+ithx)*pny*pnz; \
1426 tx = thx[ithx]; \
1427 dx = dthx[ithx]; \
1429 for(ithy=0; (ithy<order); ithy++) \
1431 index_xy = index_x+(j0+ithy)*pnz; \
1432 ty = thy[ithy]; \
1433 dy = dthy[ithy]; \
1434 fxy1 = fz1 = 0; \
1436 for(ithz=0; (ithz<order); ithz++) \
1438 gval = grid[index_xy+(k0+ithz)]; \
1439 fxy1 += thz[ithz]*gval; \
1440 fz1 += dthz[ithz]*gval; \
1442 fx += dx*ty*fxy1; \
1443 fy += tx*dy*fxy1; \
1444 fz += tx*ty*fz1; \
1449 void gather_f_bsplines(gmx_pme_t pme,real *grid,
1450 gmx_bool bClearF,pme_atomcomm_t *atc,real scale)
1452 /* sum forces for local particles */
1453 int nn,n,ithx,ithy,ithz,i0,j0,k0;
1454 int index_x,index_xy;
1455 int nx,ny,nz,pnx,pny,pnz;
1456 int * idxptr;
1457 real tx,ty,dx,dy,qn;
1458 real fx,fy,fz,gval;
1459 real fxy1,fz1;
1460 real *thx,*thy,*thz,*dthx,*dthy,*dthz;
1461 int norder;
1462 real rxx,ryx,ryy,rzx,rzy,rzz;
1463 int order;
1465 order = pme->pme_order;
1466 thx = atc->theta[XX];
1467 thy = atc->theta[YY];
1468 thz = atc->theta[ZZ];
1469 dthx = atc->dtheta[XX];
1470 dthy = atc->dtheta[YY];
1471 dthz = atc->dtheta[ZZ];
1472 nx = pme->nkx;
1473 ny = pme->nky;
1474 nz = pme->nkz;
1475 pnx = pme->pmegrid_nx;
1476 pny = pme->pmegrid_ny;
1477 pnz = pme->pmegrid_nz;
1479 rxx = pme->recipbox[XX][XX];
1480 ryx = pme->recipbox[YY][XX];
1481 ryy = pme->recipbox[YY][YY];
1482 rzx = pme->recipbox[ZZ][XX];
1483 rzy = pme->recipbox[ZZ][YY];
1484 rzz = pme->recipbox[ZZ][ZZ];
1486 for(nn=0; (nn<atc->n); nn++)
1488 n = nn;
1489 qn = scale*atc->q[n];
1491 if (bClearF)
1493 atc->f[n][XX] = 0;
1494 atc->f[n][YY] = 0;
1495 atc->f[n][ZZ] = 0;
1497 if (qn != 0)
1499 fx = 0;
1500 fy = 0;
1501 fz = 0;
1502 idxptr = atc->idx[n];
1503 norder = n*order;
1505 i0 = idxptr[XX];
1506 j0 = idxptr[YY];
1507 k0 = idxptr[ZZ];
1509 /* Pointer arithmetic alert, next six statements */
1510 thx = atc->theta[XX] + norder;
1511 thy = atc->theta[YY] + norder;
1512 thz = atc->theta[ZZ] + norder;
1513 dthx = atc->dtheta[XX] + norder;
1514 dthy = atc->dtheta[YY] + norder;
1515 dthz = atc->dtheta[ZZ] + norder;
1517 switch (order) {
1518 case 4: DO_FSPLINE(4); break;
1519 case 5: DO_FSPLINE(5); break;
1520 default: DO_FSPLINE(order); break;
1523 atc->f[n][XX] += -qn*( fx*nx*rxx );
1524 atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
1525 atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
1528 /* Since the energy and not the forces are interpolated
1529 * the net force might not be exactly zero.
1530 * This can be solved by also interpolating F, but
1531 * that comes at a cost.
1532 * A better hack is to remove the net force every
1533 * step, but that must be done at a higher level
1534 * since this routine doesn't see all atoms if running
1535 * in parallel. Don't know how important it is? EL 990726
1539 static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
1540 pme_atomcomm_t *atc)
1542 int n,ithx,ithy,ithz,i0,j0,k0;
1543 int index_x,index_xy;
1544 int * idxptr;
1545 real energy,pot,tx,ty,qn,gval;
1546 real *thx,*thy,*thz;
1547 int norder;
1548 int order;
1551 order = pme->pme_order;
1553 energy = 0;
1554 for(n=0; (n<atc->n); n++) {
1555 qn = atc->q[n];
1557 if (qn != 0) {
1558 idxptr = atc->idx[n];
1559 norder = n*order;
1561 i0 = idxptr[XX];
1562 j0 = idxptr[YY];
1563 k0 = idxptr[ZZ];
1565 /* Pointer arithmetic alert, next three statements */
1566 thx = atc->theta[XX] + norder;
1567 thy = atc->theta[YY] + norder;
1568 thz = atc->theta[ZZ] + norder;
1570 pot = 0;
1571 for(ithx=0; (ithx<order); ithx++)
1573 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
1574 tx = thx[ithx];
1576 for(ithy=0; (ithy<order); ithy++)
1578 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
1579 ty = thy[ithy];
1581 for(ithz=0; (ithz<order); ithz++)
1583 gval = grid[index_xy+(k0+ithz)];
1584 pot += tx*ty*thz[ithz]*gval;
1590 energy += pot*qn;
1594 return energy;
1597 void make_bsplines(splinevec theta,splinevec dtheta,int order,
1598 rvec fractx[],int nr,real charge[],
1599 gmx_bool bFreeEnergy)
1601 /* construct splines for local atoms */
1602 int i,j,k,l;
1603 real dr,div;
1604 real *data,*ddata,*xptr;
1606 for(i=0; (i<nr); i++) {
1607 /* With free energy we do not use the charge check.
1608 * In most cases this will be more efficient than calling make_bsplines
1609 * twice, since usually more than half the particles have charges.
1611 if (bFreeEnergy || charge[i] != 0.0) {
1612 xptr = fractx[i];
1613 for(j=0; (j<DIM); j++) {
1614 dr = xptr[j];
1616 /* dr is relative offset from lower cell limit */
1617 data=&(theta[j][i*order]);
1618 data[order-1]=0;
1619 data[1]=dr;
1620 data[0]=1-dr;
1622 for(k=3; (k<order); k++) {
1623 div=1.0/(k-1.0);
1624 data[k-1]=div*dr*data[k-2];
1625 for(l=1; (l<(k-1)); l++) {
1626 data[k-l-1]=div*((dr+l)*data[k-l-2]+(k-l-dr)*
1627 data[k-l-1]);
1629 data[0]=div*(1-dr)*data[0];
1631 /* differentiate */
1632 ddata = &(dtheta[j][i*order]);
1633 ddata[0] = -data[0];
1634 for(k=1; (k<order); k++) {
1635 ddata[k]=data[k-1]-data[k];
1638 div=1.0/(order-1);
1639 data[order-1]=div*dr*data[order-2];
1640 for(l=1; (l<(order-1)); l++) {
1641 data[order-l-1]=div*((dr+l)*data[order-l-2]+
1642 (order-l-dr)*data[order-l-1]);
1644 data[0]=div*(1-dr)*data[0];
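/* Worked example (illustrative): for order = 4 and dr = 0.5 the recursion
 * above gives theta = {1/48, 23/48, 23/48, 1/48} and
 * dtheta = {-1/8, -5/8, 5/8, 1/8}; the weights sum to 1 and the derivative
 * weights sum to 0, a quick sanity check when modifying this code.
 */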
1651 void make_dft_mod(real *mod,real *data,int ndata)
1653 int i,j;
1654 real sc,ss,arg;
1656 for(i=0;i<ndata;i++) {
1657 sc=ss=0;
1658 for(j=0;j<ndata;j++) {
1659 arg=(2.0*M_PI*i*j)/ndata;
1660 sc+=data[j]*cos(arg);
1661 ss+=data[j]*sin(arg);
1663 mod[i]=sc*sc+ss*ss;
1665 for(i=0;i<ndata;i++)
1666 if(mod[i]<1e-7)
1667 mod[i]=(mod[i-1]+mod[i+1])*0.5;
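/* In formula form (illustrative): this computes
 *   mod[i] = | sum_j data[j] * exp(2*pi*I*i*j/ndata) |^2 ,
 * the squared modulus of the DFT of the B-spline values; entries that are
 * (nearly) zero are replaced by the average of their neighbours so that the
 * 1/denom factors in solve_pme_yzx, which contain these bsp_mod values,
 * never divide by zero.
 */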
1672 void make_bspline_moduli(splinevec bsp_mod,int nx,int ny,int nz,int order)
1674 int nmax=max(nx,max(ny,nz));
1675 real *data,*ddata,*bsp_data;
1676 int i,k,l;
1677 real div;
1679 snew(data,order);
1680 snew(ddata,order);
1681 snew(bsp_data,nmax);
1683 data[order-1]=0;
1684 data[1]=0;
1685 data[0]=1;
1687 for(k=3;k<order;k++) {
1688 div=1.0/(k-1.0);
1689 data[k-1]=0;
1690 for(l=1;l<(k-1);l++)
1691 data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
1692 data[0]=div*data[0];
1694 /* differentiate */
1695 ddata[0]=-data[0];
1696 for(k=1;k<order;k++)
1697 ddata[k]=data[k-1]-data[k];
1698 div=1.0/(order-1);
1699 data[order-1]=0;
1700 for(l=1;l<(order-1);l++)
1701 data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
1702 data[0]=div*data[0];
1704 for(i=0;i<nmax;i++)
1705 bsp_data[i]=0;
1706 for(i=1;i<=order;i++)
1707 bsp_data[i]=data[i-1];
1709 make_dft_mod(bsp_mod[XX],bsp_data,nx);
1710 make_dft_mod(bsp_mod[YY],bsp_data,ny);
1711 make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
1713 sfree(data);
1714 sfree(ddata);
1715 sfree(bsp_data);
1718 static void setup_coordinate_communication(pme_atomcomm_t *atc)
1720 int nslab,n,i;
1721 int fw,bw;
1723 nslab = atc->nslab;
1725 n = 0;
1726 for(i=1; i<=nslab/2; i++) {
1727 fw = (atc->nodeid + i) % nslab;
1728 bw = (atc->nodeid - i + nslab) % nslab;
1729 if (n < nslab - 1) {
1730 atc->node_dest[n] = fw;
1731 atc->node_src[n] = bw;
1732 n++;
1734 if (n < nslab - 1) {
1735 atc->node_dest[n] = bw;
1736 atc->node_src[n] = fw;
1737 n++;
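/* Worked example (illustrative): for nslab = 4 and nodeid = 0 the loop above
 * fills node_dest = {1, 3, 2} and node_src = {3, 1, 2}: forward and backward
 * partners are interleaved so the nearest slabs come first, and only
 * nslab-1 entries are used since a node never communicates with itself.
 */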
1742 int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
1744 if(NULL != log)
1746 fprintf(log,"Destroying PME data structures.\n");
1749 sfree((*pmedata)->nnx);
1750 sfree((*pmedata)->nny);
1751 sfree((*pmedata)->nnz);
1753 sfree((*pmedata)->pmegridA);
1754 sfree((*pmedata)->fftgridA);
1755 sfree((*pmedata)->cfftgridA);
1756 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
1758 if((*pmedata)->pmegridB)
1760 sfree((*pmedata)->pmegridB);
1761 sfree((*pmedata)->fftgridB);
1762 sfree((*pmedata)->cfftgridB);
1763 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
1765 sfree((*pmedata)->work_mhz);
1766 sfree((*pmedata)->work_m2);
1767 sfree((*pmedata)->work_denom);
1768 sfree((*pmedata)->work_tmp1_alloc);
1769 sfree((*pmedata)->work_m2inv);
1771 sfree(*pmedata);
1772 *pmedata = NULL;
1774 return 0;
1777 static int mult_up(int n,int f)
1779 return ((n + f - 1)/f)*f;
1783 static double pme_load_imbalance(gmx_pme_t pme)
1785 int nma,nmi;
1786 double n1,n2,n3;
1788 nma = pme->nnodes_major;
1789 nmi = pme->nnodes_minor;
1791 n1 = mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
1792 n2 = mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
1793 n3 = mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;
1795 /* pme_solve is roughly double the cost of an fft */
1797 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
1800 static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
1801 int dimind,gmx_bool bSpread)
1803 int nk,k,s;
1805 atc->dimind = dimind;
1806 atc->nslab = 1;
1807 atc->nodeid = 0;
1808 atc->pd_nalloc = 0;
1809 #ifdef GMX_MPI
1810 if (pme->nnodes > 1)
1812 atc->mpi_comm = pme->mpi_comm_d[dimind];
1813 MPI_Comm_size(atc->mpi_comm,&atc->nslab);
1814 MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
1816 if (debug)
1818 fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);
1820 #endif
1822 atc->bSpread = bSpread;
1823 atc->pme_order = pme->pme_order;
1825 if (atc->nslab > 1)
1827 /* These three allocations are not required for particle decomp. */
1828 snew(atc->node_dest,atc->nslab);
1829 snew(atc->node_src,atc->nslab);
1830 setup_coordinate_communication(atc);
1832 snew(atc->count,atc->nslab);
1833 snew(atc->rcount,atc->nslab);
1834 snew(atc->buf_index,atc->nslab);
1838 static void
1839 init_overlap_comm(pme_overlap_t * ol,
1840 int norder,
1841 #ifdef GMX_MPI
1842 MPI_Comm comm,
1843 #endif
1844 int nnodes,
1845 int nodeid,
1846 int ndata)
1848 int lbnd,rbnd,maxlr,b,i;
1849 int exten;
1850 int nn,nk;
1851 pme_grid_comm_t *pgc;
1852 gmx_bool bCont;
1853 int fft_start,fft_end,send_index1,recv_index1;
1855 #ifdef GMX_MPI
1856 ol->mpi_comm = comm;
1857 #endif
1859 ol->nnodes = nnodes;
1860 ol->nodeid = nodeid;
1862 /* Linear translation of the PME grid won't affect reciprocal space
1863 * calculations, so to optimize we only interpolate "upwards",
1864 * which also means we only have to consider overlap in one direction.
1865 * I.e., particles on this node might also be spread to grid indices
1866 * that belong to higher nodes (modulo nnodes)
1869 snew(ol->s2g0,ol->nnodes+1);
1870 snew(ol->s2g1,ol->nnodes);
1871 if (debug) { fprintf(debug,"PME slab boundaries:"); }
1872 for(i=0; i<nnodes; i++)
1874 /* s2g0 the local interpolation grid start.
1875 * s2g1 the local interpolation grid end.
1876 * Because grid overlap communication only goes forward,
1877 * the grid boundaries of the slabs for the FFTs should be rounded down.
1879 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
1880 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
1882 if (debug)
1884 fprintf(debug," %3d %3d",ol->s2g0[i],ol->s2g1[i]);
1887 ol->s2g0[nnodes] = ndata;
1888 if (debug) { fprintf(debug,"\n"); }
1890 /* Determine with how many nodes we need to communicate the grid overlap */
1891 b = 0;
1894 b++;
1895 bCont = FALSE;
1896 for(i=0; i<nnodes; i++)
1898 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
1899 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
1901 bCont = TRUE;
1905 while (bCont && b < nnodes);
1906 ol->noverlap_nodes = b - 1;
1908 snew(ol->send_id,ol->noverlap_nodes);
1909 snew(ol->recv_id,ol->noverlap_nodes);
1910 for(b=0; b<ol->noverlap_nodes; b++)
1912 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
1913 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
1915 snew(ol->comm_data, ol->noverlap_nodes);
1917 for(b=0; b<ol->noverlap_nodes; b++)
1919 pgc = &ol->comm_data[b];
1920 /* Send */
1921 fft_start = ol->s2g0[ol->send_id[b]];
1922 fft_end = ol->s2g0[ol->send_id[b]+1];
1923 if (ol->send_id[b] < nodeid)
1925 fft_start += ndata;
1926 fft_end += ndata;
1928 send_index1 = ol->s2g1[nodeid];
1929 send_index1 = min(send_index1,fft_end);
1930 pgc->send_index0 = fft_start;
1931 pgc->send_nindex = max(0,send_index1 - pgc->send_index0);
1933 /* We always start receiving to the first index of our slab */
1934 fft_start = ol->s2g0[ol->nodeid];
1935 fft_end = ol->s2g0[ol->nodeid+1];
1936 recv_index1 = ol->s2g1[ol->recv_id[b]];
1937 if (ol->recv_id[b] > nodeid)
1939 recv_index1 -= ndata;
1941 recv_index1 = min(recv_index1,fft_end);
1942 pgc->recv_index0 = fft_start;
1943 pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
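/* Worked example (illustrative): with ndata = 27 grid lines, nnodes = 4 and
 * norder = pme_order = 4 the loop above gives
 *   s2g0 = {0, 6, 13, 20, 27} and s2g1 = {10, 17, 24, 30},
 * i.e. each interpolation slab reaches pme_order grid lines (pme_order-1 for
 * a divisible grid) past the start of the next node's FFT slab, and here the
 * overlap only has to be pulsed forward to one neighbour (noverlap_nodes = 1).
 */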
1947 static void
1948 make_gridindex5_to_localindex(int n,int local_start,int local_range,
1949 int **global_to_local,
1950 real **fraction_shift)
1952 int i;
1953 int * gtl;
1954 real * fsh;
1956 snew(gtl,5*n);
1957 snew(fsh,5*n);
1958 for(i=0; (i<5*n); i++)
1960 /* Determine the global to local grid index */
1961 gtl[i] = (i - local_start + n) % n;
1962 /* For coordinates that fall within the local grid the fraction
1963 * is correct, we don't need to shift it.
1965 fsh[i] = 0;
1966 if (local_range < n)
1968 /* Due to rounding issues i could be 1 beyond the lower or
1969 * upper boundary of the local grid. Correct the index for this.
1970 * If we shift the index, we need to shift the fraction by
1971 * the same amount in the other direction to not affect
1972 * the weights.
1973 * Note that due to this shifting the weights at the end of
1974 * the spline might change, but that will only involve values
1975 * between zero and values close to the precision of a real,
1976 * which is anyhow the accuracy of the whole mesh calculation.
1978 /* With local_range=0 we should not change i=local_start */
1979 if (i % n != local_start)
1981 if (gtl[i] == n-1)
1983 gtl[i] = 0;
1984 fsh[i] = -1;
1986 else if (gtl[i] == local_range)
1988 gtl[i] = local_range - 1;
1989 fsh[i] = 1;
1995 *global_to_local = gtl;
1996 *fraction_shift = fsh;
1999 static void
2000 gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
2002 int nk_new;
2004 if (*nk % nnodes != 0)
2006 nk_new = nnodes*(*nk/nnodes + 1);
2008 if (2*nk_new >= 3*(*nk))
2010 gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
2011 dim,*nk,dim,nnodes,dim);
2014 if (fplog != NULL)
2016 fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
2017 dim,*nk,dim,nnodes,dim,nk_new,dim);
2020 *nk = nk_new;
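/* Worked example (illustrative): nk = 25 with nnodes = 4 is rounded up to
 * nk_new = 4*(25/4 + 1) = 28; 2*28 < 3*25, so the new size is accepted and
 * the note above is printed. nk = 5 with nnodes = 4 would round up to 8,
 * and since 2*8 >= 3*5 the more-than-50% increase triggers gmx_fatal instead.
 */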
2024 int gmx_pme_init(gmx_pme_t * pmedata,
2025 t_commrec * cr,
2026 int nnodes_major,
2027 int nnodes_minor,
2028 t_inputrec * ir,
2029 int homenr,
2030 gmx_bool bFreeEnergy,
2031 gmx_bool bReproducible)
2033 gmx_pme_t pme=NULL;
2035 pme_atomcomm_t *atc;
2036 int bufsizex,bufsizey,bufsize;
2037 ivec ndata;
2039 if (debug)
2040 fprintf(debug,"Creating PME data structures.\n");
2041 snew(pme,1);
2043 pme->redist_init = FALSE;
2044 pme->sum_qgrid_tmp = NULL;
2045 pme->sum_qgrid_dd_tmp = NULL;
2046 pme->buf_nalloc = 0;
2047 pme->redist_buf_nalloc = 0;
2049 pme->nnodes = 1;
2050 pme->bPPnode = TRUE;
2052 pme->nnodes_major = nnodes_major;
2053 pme->nnodes_minor = nnodes_minor;
2055 #ifdef GMX_MPI
2056 if (nnodes_major*nnodes_minor > 1 && PAR(cr))
2058 pme->mpi_comm = cr->mpi_comm_mygroup;
2060 MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
2061 MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
2062 if (pme->nnodes != nnodes_major*nnodes_minor)
2064 gmx_incons("PME node count mismatch");
2067 #endif
2069 if (pme->nnodes == 1)
2071 pme->ndecompdim = 0;
2072 pme->nodeid_major = 0;
2073 pme->nodeid_minor = 0;
2075 else
2077 if (nnodes_minor == 1)
2079 #ifdef GMX_MPI
2080 pme->mpi_comm_d[0] = pme->mpi_comm;
2081 pme->mpi_comm_d[1] = NULL;
2082 #endif
2083 pme->ndecompdim = 1;
2084 pme->nodeid_major = pme->nodeid;
2085 pme->nodeid_minor = 0;
2088 else if (nnodes_major == 1)
2090 #ifdef GMX_MPI
2091 pme->mpi_comm_d[0] = NULL;
2092 pme->mpi_comm_d[1] = pme->mpi_comm;
2093 #endif
2094 pme->ndecompdim = 1;
2095 pme->nodeid_major = 0;
2096 pme->nodeid_minor = pme->nodeid;
2098 else
2100 if (pme->nnodes % nnodes_major != 0)
2102 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
2104 pme->ndecompdim = 2;
2106 #ifdef GMX_MPI
2107 MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
2108 pme->nodeid,&pme->mpi_comm_d[0]); /* My communicator along major dimension */
2109 MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
2110 pme->nodeid,&pme->mpi_comm_d[1]); /* My communicator along minor dimension */
2112 MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
2113 MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
2114 MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
2115 MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
2116 #endif
2118 pme->bPPnode = (cr->duty & DUTY_PP);
2121 if (ir->ePBC == epbcSCREW)
2123 gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
2126 pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
2127 pme->nkx = ir->nkx;
2128 pme->nky = ir->nky;
2129 pme->nkz = ir->nkz;
2130 pme->pme_order = ir->pme_order;
2131 pme->epsilon_r = ir->epsilon_r;
2133 /* Currently pme.c supports only the fft5d FFT code.
2134 * Therefore the grid always needs to be divisible by nnodes.
2135 * When the old 1D code is also supported again, change this check.
2137 * This check should be done before calling gmx_pme_init
2138 * and fplog should be passed instead of stderr.
2140 if (pme->ndecompdim >= 2)
2142 if (pme->ndecompdim >= 1)
2145 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
2146 'x',nnodes_major,&pme->nkx);
2147 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
2148 'y',nnodes_minor,&pme->nky);
2152 if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
2153 pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
2154 pme->nkz <= pme->pme_order)
2156 gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_order for x and/or y",pme->pme_order);
2159 if (pme->nnodes > 1) {
2160 double imbal;
2162 #ifdef GMX_MPI
2163 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
2164 MPI_Type_commit(&(pme->rvec_mpi));
2165 #endif
2167 /* Note that the charge spreading and force gathering, which usually
2168 * takes about the same amount of time as FFT+solve_pme,
2169 * is always fully load balanced
2170 * (unless the charge distribution is inhomogeneous).
2173 imbal = pme_load_imbalance(pme);
2174 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
2176 fprintf(stderr,
2177 "\n"
2178 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
2179 " For optimal PME load balancing\n"
2180 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
2181 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
2182 "\n",
2183 (int)((imbal-1)*100 + 0.5),
2184 pme->nkx,pme->nky,pme->nnodes_major,
2185 pme->nky,pme->nkz,pme->nnodes_minor);
2189 init_overlap_comm(&pme->overlap[0],pme->pme_order,
2190 #ifdef GMX_MPI
2191 pme->mpi_comm_d[0],
2192 #endif
2193 pme->nnodes_major,pme->nodeid_major,pme->nkx);
2195 init_overlap_comm(&pme->overlap[1],pme->pme_order,
2196 #ifdef GMX_MPI
2197 pme->mpi_comm_d[1],
2198 #endif
2199 pme->nnodes_minor,pme->nodeid_minor,pme->nky);
2201 snew(pme->bsp_mod[XX],pme->nkx);
2202 snew(pme->bsp_mod[YY],pme->nky);
2203 snew(pme->bsp_mod[ZZ],pme->nkz);
2205 /* Allocate data for the interpolation grid, including overlap */
2206 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
2207 pme->overlap[0].s2g0[pme->nodeid_major];
2208 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
2209 pme->overlap[1].s2g0[pme->nodeid_minor];
2210 pme->pmegrid_nz = pme->nkz + pme->pme_order - 1;
2212 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
2213 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
2214 pme->pmegrid_start_iz = 0;
2216 make_gridindex5_to_localindex(pme->nkx,
2217 pme->pmegrid_start_ix,
2218 pme->pmegrid_nx - (pme->pme_order-1),
2219 &pme->nnx,&pme->fshx);
2220 make_gridindex5_to_localindex(pme->nky,
2221 pme->pmegrid_start_iy,
2222 pme->pmegrid_ny - (pme->pme_order-1),
2223 &pme->nny,&pme->fshy);
2224 make_gridindex5_to_localindex(pme->nkz,
2225 pme->pmegrid_start_iz,
2226 pme->pmegrid_nz - (pme->pme_order-1),
2227 &pme->nnz,&pme->fshz);
2229 snew(pme->pmegridA,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
2231 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
2232 /* x overlap is copied in place: take padding into account.
2233 * y is always copied through a buffer: we don't need padding in z,
2234 * but we do need the overlap in x because of the communication order.
2236 bufsizex = pme->pme_order*pme->pmegrid_ny*pme->pmegrid_nz;
2237 bufsizey = pme->pme_order*pme->pmegrid_nx*pme->nkz;
2238 bufsize = (bufsizex>bufsizey) ? bufsizex : bufsizey;
2240 snew(pme->pmegrid_sendbuf,bufsize);
2241 snew(pme->pmegrid_recvbuf,bufsize);
2243 ndata[0] = pme->nkx;
2244 ndata[1] = pme->nky;
2245 ndata[2] = pme->nkz;
2247 /* This routine will allocate the grid data to fit the FFTs */
2248 gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
2249 &pme->fftgridA,&pme->cfftgridA,
2250 pme->mpi_comm_d,
2251 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
2252 bReproducible);
2254 if (bFreeEnergy)
2256 snew(pme->pmegridB,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
2257 gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
2258 &pme->fftgridB,&pme->cfftgridB,
2259 pme->mpi_comm_d,
2260 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
2261 bReproducible);
2262 } else
2264 pme->pmegridB = NULL;
2265 pme->fftgridB = NULL;
2266 pme->cfftgridB = NULL;
2269 make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
2271 /* Use atc[0] for spreading */
2272 init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
2273 if (pme->ndecompdim >= 2)
2275 init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
2278 if (pme->nnodes == 1) {
2279 pme->atc[0].n = homenr;
2280 pme_realloc_atomcomm_things(&pme->atc[0]);
2283 /* Use fft5d; after the FFT the data are ordered with y major, then z, then x minor */
2284 pme->work_nalloc = pme->nkx;
2285 snew(pme->work_mhx,pme->work_nalloc);
2286 snew(pme->work_mhy,pme->work_nalloc);
2287 snew(pme->work_mhz,pme->work_nalloc);
2288 snew(pme->work_m2,pme->work_nalloc);
2289 snew(pme->work_denom,pme->work_nalloc);
2290 /* Allocate an aligned pointer for SSE operations, including 3 extra
2291 * elements at the end since SSE operates on 4 elements at a time. */
2293 snew(pme->work_tmp1_alloc,pme->work_nalloc+8);
2294 pme->work_tmp1 = (real *) (((size_t) pme->work_tmp1_alloc + 16) & (~((size_t) 15)));
2295 snew(pme->work_m2inv,pme->work_nalloc);
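/* Added, illustrative only: the pointer arithmetic used for work_tmp1
 * above advances the allocation to a 16-byte boundary at most 16 bytes
 * past its start; together with the 8 extra reals allocated this leaves
 * room for that shift plus the trailing elements an SSE load of width 4
 * may touch.  A minimal sketch of the same idiom, not compiled and not
 * part of the original code:
 */
#if 0
{
    char  buffer[64];
    real *aligned;

    /* Round the address up to the next 16-byte boundary */
    aligned = (real *) (((size_t) buffer + 16) & (~((size_t) 15)));

    /* ((size_t) aligned) % 16 is now 0, and aligned lies at most
     * 16 bytes past the start of buffer.
     */
    (void) aligned;
}
#endif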
2297 *pmedata = pme;
2299 return 0;
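/* Added note: spread_on_grid() below separates the calculation of the
 * B-spline coefficients (bCalcSplines) from the actual charge spreading
 * (bSpread).  This split is what lets gmx_pme_calc_energy() further down
 * evaluate splines for a few extra atoms and interpolate from an existing
 * grid without modifying it, as used for test particle insertion (TPI).
 */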
2302 static void spread_on_grid(gmx_pme_t pme,
2303 pme_atomcomm_t *atc,real *grid,
2304 gmx_bool bCalcSplines,gmx_bool bSpread)
2306 if (bCalcSplines)
2309 /* Compute fftgrid index for all atoms,
2310 * with help of some extra variables. */
2312 calc_interpolation_idx(pme,atc);
2314 /* make local bsplines */
2315 make_bsplines(atc->theta,atc->dtheta,pme->pme_order,
2316 atc->fractx,atc->n,atc->q,pme->bFEP);
2319 if (bSpread)
2321 /* put local atoms on grid. */
2322 spread_q_bsplines(pme,atc,grid);
2326 void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
2328 pme_atomcomm_t *atc;
2329 real *grid;
2331 if (pme->nnodes > 1)
2333 gmx_incons("gmx_pme_calc_energy called in parallel");
2335 if (pme->bFEP > 1)
2337 gmx_incons("gmx_pme_calc_energy with free energy");
2340 atc = &pme->atc_energy;
2341 atc->nslab = 1;
2342 atc->bSpread = TRUE;
2343 atc->pme_order = pme->pme_order;
2344 atc->n = n;
2345 pme_realloc_atomcomm_things(atc);
2346 atc->x = x;
2347 atc->q = q;
2349 /* We only use the A-charges grid */
2350 grid = pme->pmegridA;
2352 spread_on_grid(pme,atc,NULL,TRUE,FALSE);
2354 *V = gather_energy_bsplines(pme,grid,atc);
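/* Added, illustrative only: a sketch of how a TPI-style caller might use
 * gmx_pme_calc_energy().  The names n_ins, x_ins and q_ins are
 * placeholders, not identifiers from this file, and the main PME grid
 * must already hold the potential from a preceding gmx_pme_do() call
 * that requested it (see the GMX_PME_CALC_POT flag below).
 */
#if 0
{
    real V_ins = 0;

    gmx_pme_calc_energy(pme, n_ins, x_ins, q_ins, &V_ins);
    /* V_ins now contains the reciprocal-space interaction of the
     * inserted atoms with the grid, using the A charges only.
     */
}
#endif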
2358 static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
2359 t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
2361 /* Reset all the counters related to performance over the run */
2362 wallcycle_stop(wcycle,ewcRUN);
2363 wallcycle_reset_all(wcycle);
2364 init_nrnb(nrnb);
2365 ir->init_step += step_rel;
2366 ir->nsteps -= step_rel;
2367 wallcycle_start(wcycle,ewcRUN);
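/* Added note: shifting init_step and nsteps by step_rel means the timing
 * report at the end of the run only covers the steps performed after this
 * reset, consistent with the cycle and flop counters cleared above.
 */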
2371 int gmx_pmeonly(gmx_pme_t pme,
2372 t_commrec *cr, t_nrnb *nrnb,
2373 gmx_wallcycle_t wcycle,
2374 real ewaldcoeff, gmx_bool bGatherOnly,
2375 t_inputrec *ir)
2377 gmx_pme_pp_t pme_pp;
2378 int natoms;
2379 matrix box;
2380 rvec *x_pp=NULL,*f_pp=NULL;
2381 real *chargeA=NULL,*chargeB=NULL;
2382 real lambda=0;
2383 int maxshift_x=0,maxshift_y=0;
2384 real energy,dvdlambda;
2385 matrix vir;
2386 float cycles;
2387 int count;
2388 gmx_bool bEnerVir;
2389 gmx_large_int_t step,step_rel;
2392 pme_pp = gmx_pme_pp_init(cr);
2394 init_nrnb(nrnb);
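/* Added note: the loop below is the service loop of a PME-only node.
 * Each iteration receives coordinates, charges, the box and the step
 * number from the PP nodes, runs gmx_pme_do(), and sends forces, virial
 * and energy back; a received atom count of -1 is the termination signal.
 */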
2396 count = 0;
2397 do /****** this is a quasi-loop over time steps! */
2399 /* Domain decomposition */
2400 natoms = gmx_pme_recv_q_x(pme_pp,
2401 &chargeA,&chargeB,box,&x_pp,&f_pp,
2402 &maxshift_x,&maxshift_y,
2403 &pme->bFEP,&lambda,
2404 &bEnerVir,
2405 &step);
2407 if (natoms == -1) {
2408 /* We should stop: break out of the loop */
2409 break;
2412 step_rel = step - ir->init_step;
2414 if (count == 0)
2415 wallcycle_start(wcycle,ewcRUN);
2417 wallcycle_start(wcycle,ewcPMEMESH);
2419 dvdlambda = 0;
2420 clear_mat(vir);
2421 gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
2422 cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
2423 &energy,lambda,&dvdlambda,
2424 GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
2426 cycles = wallcycle_stop(wcycle,ewcPMEMESH);
2428 gmx_pme_send_force_vir_ener(pme_pp,
2429 f_pp,vir,energy,dvdlambda,
2430 cycles);
2432 count++;
2434 if (step_rel == wcycle_get_reset_counters(wcycle))
2436 /* Reset all the counters related to performance over the run */
2437 reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
2438 wcycle_set_reset_counters(wcycle, 0);
2441 } /***** end of quasi-loop, we stop with the break above */
2442 while (TRUE);
2444 return 0;
2447 int gmx_pme_do(gmx_pme_t pme,
2448 int start, int homenr,
2449 rvec x[], rvec f[],
2450 real *chargeA, real *chargeB,
2451 matrix box, t_commrec *cr,
2452 int maxshift_x, int maxshift_y,
2453 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
2454 matrix vir, real ewaldcoeff,
2455 real *energy, real lambda,
2456 real *dvdlambda, int flags)
2458 int q,d,i,j,ntot,npme;
2459 int nx,ny,nz;
2460 int n_d,local_ny;
2461 int loop_count;
2462 pme_atomcomm_t *atc=NULL;
2463 real * grid=NULL;
2464 real *ptr;
2465 rvec *x_d,*f_d;
2466 real *charge=NULL,*q_d,vol;
2467 real energy_AB[2];
2468 matrix vir_AB[2];
2469 gmx_bool bClearF;
2470 gmx_parallel_3dfft_t pfft_setup;
2471 real * fftgrid;
2472 t_complex * cfftgrid;
2474 if (pme->nnodes > 1) {
2475 atc = &pme->atc[0];
2476 atc->npd = homenr;
2477 if (atc->npd > atc->pd_nalloc) {
2478 atc->pd_nalloc = over_alloc_dd(atc->npd);
2479 srenew(atc->pd,atc->pd_nalloc);
2481 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
2483 else
2485 /* This could be necessary for TPI */
2486 pme->atc[0].n = homenr;
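/* Added note: the loop below runs once over the A charges and, when
 * free-energy perturbation is active (pme->bFEP), a second time over the
 * B charges using the B grids; the two energies and virials are combined
 * with lambda after the loop.
 */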
2489 for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
2490 if (q == 0) {
2491 grid = pme->pmegridA;
2492 fftgrid = pme->fftgridA;
2493 cfftgrid = pme->cfftgridA;
2494 pfft_setup = pme->pfft_setupA;
2495 charge = chargeA+start;
2496 } else {
2497 grid = pme->pmegridB;
2498 fftgrid = pme->fftgridB;
2499 cfftgrid = pme->cfftgridB;
2500 pfft_setup = pme->pfft_setupB;
2501 charge = chargeB+start;
2503 /* Unpack structure */
2504 if (debug) {
2505 fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
2506 cr->nnodes,cr->nodeid);
2507 fprintf(debug,"Grid = %p\n",(void*)grid);
2508 if (grid == NULL)
2509 gmx_fatal(FARGS,"No grid!");
2511 where();
2513 m_inv_ur0(box,pme->recipbox);
2515 if (pme->nnodes == 1) {
2516 atc = &pme->atc[0];
2517 if (DOMAINDECOMP(cr)) {
2518 atc->n = homenr;
2519 pme_realloc_atomcomm_things(atc);
2521 atc->x = x;
2522 atc->q = charge;
2523 atc->f = f;
2524 } else {
2525 wallcycle_start(wcycle,ewcPME_REDISTXF);
2526 for(d=pme->ndecompdim-1; d>=0; d--)
2528 if (d == pme->ndecompdim-1)
2530 n_d = homenr;
2531 x_d = x + start;
2532 q_d = charge;
2534 else
2536 n_d = pme->atc[d+1].n;
2537 x_d = atc->x;
2538 q_d = atc->q;
2540 atc = &pme->atc[d];
2541 atc->npd = n_d;
2542 if (atc->npd > atc->pd_nalloc) {
2543 atc->pd_nalloc = over_alloc_dd(atc->npd);
2544 srenew(atc->pd,atc->pd_nalloc);
2546 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
2547 pme_calc_pidx(n_d,pme->recipbox,x_d,atc);
2548 where();
2550 GMX_BARRIER(cr->mpi_comm_mygroup);
2551 /* Redistribute x (only once) and qA or qB */
2552 if (DOMAINDECOMP(cr)) {
2553 dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
2554 } else {
2555 pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
2558 where();
2560 wallcycle_stop(wcycle,ewcPME_REDISTXF);
2563 if (debug)
2564 fprintf(debug,"Node= %6d, pme local particles=%6d\n",
2565 cr->nodeid,atc->n);
2567 if (flags & GMX_PME_SPREAD_Q)
2569 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
2571 /* Spread the charges on a grid */
2572 GMX_MPE_LOG(ev_spread_on_grid_start);
2574 /* Spread the charges of the local atoms onto the local grid */
2575 spread_on_grid(pme,&pme->atc[0],grid,q==0,TRUE);
2576 GMX_MPE_LOG(ev_spread_on_grid_finish);
2578 if (q == 0)
2580 inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
2582 inc_nrnb(nrnb,eNR_SPREADQBSP,
2583 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
2585 wrap_periodic_pmegrid(pme,grid);
2587 /* sum contributions to local grid from other nodes */
2588 #ifdef GMX_MPI
2589 if (pme->nnodes > 1) {
2590 GMX_BARRIER(cr->mpi_comm_mygroup);
2591 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
2592 where();
2594 #endif
2595 where();
2597 copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
2599 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
2602 if (flags & GMX_PME_SOLVE)
2604 /* do 3d-fft */
2605 GMX_BARRIER(cr->mpi_comm_mygroup);
2606 GMX_MPE_LOG(ev_gmxfft3d_start);
2607 wallcycle_start(wcycle,ewcPME_FFT);
2608 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,fftgrid,cfftgrid);
2609 wallcycle_stop(wcycle,ewcPME_FFT);
2610 GMX_MPE_LOG(ev_gmxfft3d_finish);
2611 where();
2613 /* solve in k-space for our local cells */
2614 vol = det(box);
2615 GMX_BARRIER(cr->mpi_comm_mygroup);
2616 GMX_MPE_LOG(ev_solve_pme_start);
2617 wallcycle_start(wcycle,ewcPME_SOLVE);
2618 loop_count =
2619 solve_pme_yzx(pme,cfftgrid,ewaldcoeff,vol,
2620 flags & GMX_PME_CALC_ENER_VIR,
2621 &energy_AB[q],vir_AB[q]);
2622 wallcycle_stop(wcycle,ewcPME_SOLVE);
2623 where();
2624 GMX_MPE_LOG(ev_solve_pme_finish);
2625 inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
2628 if ((flags & GMX_PME_CALC_F) ||
2629 (flags & GMX_PME_CALC_POT))
2632 /* do 3d-invfft */
2633 GMX_BARRIER(cr->mpi_comm_mygroup);
2634 GMX_MPE_LOG(ev_gmxfft3d_start);
2635 where();
2636 wallcycle_start(wcycle,ewcPME_FFT);
2637 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,cfftgrid,fftgrid);
2638 wallcycle_stop(wcycle,ewcPME_FFT);
2640 where();
2641 GMX_MPE_LOG(ev_gmxfft3d_finish);
2643 if (pme->nodeid == 0)
2645 ntot = pme->nkx*pme->nky*pme->nkz;
2646 npme = ntot*log((real)ntot)/log(2.0);
2647 inc_nrnb(nrnb,eNR_FFT,2*npme);
2650 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
2652 copy_fftgrid_to_pmegrid(pme,fftgrid,grid);
2654 /* distribute local grid to all nodes */
2655 #ifdef GMX_MPI
2656 if (pme->nnodes > 1) {
2657 GMX_BARRIER(cr->mpi_comm_mygroup);
2658 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
2660 #endif
2661 where();
2663 unwrap_periodic_pmegrid(pme,grid);
2666 if (flags & GMX_PME_CALC_F)
2668 /* interpolate forces for our local atoms */
2669 GMX_BARRIER(cr->mpi_comm_mygroup);
2670 GMX_MPE_LOG(ev_gather_f_bsplines_start);
2672 where();
2674 /* If we are running without parallelization,
2675 * atc->f is the actual force array, not a buffer,
2676 * therefore we should not clear it. */
2678 bClearF = (q == 0 && PAR(cr));
2679 gather_f_bsplines(pme,grid,bClearF,&pme->atc[0],
2680 pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
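/* Added note: with free energy the forces are accumulated over the two
 * passes of the q-loop as (1-lambda)*F_A + lambda*F_B, which is why the
 * scale factor above depends on q and the force buffer is only cleared
 * on the first pass.
 */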
2681 where();
2683 GMX_MPE_LOG(ev_gather_f_bsplines_finish);
2685 inc_nrnb(nrnb,eNR_GATHERFBSP,
2686 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
2687 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
2689 } /* of q-loop */
2691 if ((flags & GMX_PME_CALC_F) && pme->nnodes > 1) {
2692 wallcycle_start(wcycle,ewcPME_REDISTXF);
2693 for(d=0; d<pme->ndecompdim; d++)
2695 atc = &pme->atc[d];
2696 if (d == pme->ndecompdim - 1)
2698 n_d = homenr;
2699 f_d = f + start;
2701 else
2703 n_d = pme->atc[d+1].n;
2704 f_d = pme->atc[d+1].f;
2706 GMX_BARRIER(cr->mpi_comm_mygroup);
2707 if (DOMAINDECOMP(cr)) {
2708 dd_pmeredist_f(pme,atc,n_d,f_d,
2709 d==pme->ndecompdim-1 && pme->bPPnode);
2710 } else {
2711 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
2715 wallcycle_stop(wcycle,ewcPME_REDISTXF);
2717 where();
2719 if (!pme->bFEP) {
2720 *energy = energy_AB[0];
2721 m_add(vir,vir_AB[0],vir);
2722 } else {
2723 *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
2724 *dvdlambda += energy_AB[1] - energy_AB[0];
2725 for(i=0; i<DIM; i++)
2726 for(j=0; j<DIM; j++)
2727 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] + lambda*vir_AB[1][i][j];
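/* Added note: this is the standard linear lambda mixing,
 * E = (1-lambda)*E_A + lambda*E_B, so dE/dlambda is simply E_B - E_A,
 * and the virial is mixed with the same weights.
 */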
2730 if (debug)
2731 fprintf(debug,"PME mesh energy: %g\n",*energy);
2733 return 0;