1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 *
4 * This source code is part of
5 *
6 * G R O M A C S
7 *
8 * GROningen MAchine for Chemical Simulations
9 *
10 * VERSION 3.2.0
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
33 * And Hey:
34 * GROwing Monsters And Cloning Shrimps
36 /* IMPORTANT FOR DEVELOPERS:
38 * Triclinic pme stuff isn't entirely trivial, and we've experienced
39 * some bugs during development (many of them due to me). To avoid
40 * this in the future, please check the following things if you make
41 * changes in this file:
43 * 1. You should obtain identical (at least to the PME precision)
44 * energies, forces, and virial for
45 * a rectangular box and a triclinic one where the z (or y) axis is
46 * tilted a whole box side. For instance you could use these boxes:
48 * rectangular triclinic
49 * 2 0 0 2 0 0
50 * 0 2 0 0 2 0
51 * 0 0 6 2 2 6
53 * 2. You should check the energy conservation in a triclinic box.
55 * It might seem like overkill, but better safe than sorry.
56 * /Erik 001109
57 */
59 #ifdef HAVE_CONFIG_H
60 #include <config.h>
61 #endif
63 #ifdef GMX_LIB_MPI
64 #include <mpi.h>
65 #endif
66 #ifdef GMX_THREADS
67 #include "tmpi.h"
68 #endif
71 #include <stdio.h>
72 #include <string.h>
73 #include <math.h>
74 #include "typedefs.h"
75 #include "txtdump.h"
76 #include "vec.h"
77 #include "gmxcomplex.h"
78 #include "smalloc.h"
79 #include "futil.h"
80 #include "coulomb.h"
81 #include "gmx_fatal.h"
82 #include "pme.h"
83 #include "network.h"
84 #include "physics.h"
85 #include "nrnb.h"
86 #include "copyrite.h"
87 #include "gmx_wallcycle.h"
88 #include "gmx_parallel_3dfft.h"
89 #include "pdbio.h"
91 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
92 #include "gmx_sse2_single.h"
93 #endif
95 #include "mpelogging.h"
97 #define DFT_TOL 1e-7
98 /* #define PRT_FORCE */
99 /* conditions for on the fly time-measurement */
100 /* #define TAKETIME (step > 1 && timesteps < 10) */
101 #define TAKETIME FALSE
103 #ifdef GMX_DOUBLE
104 #define mpi_type MPI_DOUBLE
105 #else
106 #define mpi_type MPI_FLOAT
107 #endif
109 /* Internal datastructures */
110 typedef struct {
111 int send_index0;
112 int send_nindex;
113 int recv_index0;
114 int recv_nindex;
115 } pme_grid_comm_t;
117 typedef struct {
118 #ifdef GMX_MPI
119 MPI_Comm mpi_comm;
120 #endif
121 int nnodes,nodeid;
122 int ndata;
123 int *s2g0;
124 int *s2g1;
125 int noverlap_nodes;
126 int *send_id,*recv_id;
127 pme_grid_comm_t *comm_data;
128 } pme_overlap_t;
130 typedef struct {
131 int dimind; /* The index of the dimension, 0=x, 1=y */
132 int nslab;
133 int nodeid;
134 #ifdef GMX_MPI
135 MPI_Comm mpi_comm;
136 #endif
138 int *node_dest; /* The nodes to send x and q to with DD */
139 int *node_src; /* The nodes to receive x and q from with DD */
140 int *buf_index; /* Index for commnode into the buffers */
142 int maxshift;
144 int npd;
145 int pd_nalloc;
146 int *pd;
147 int *count; /* The number of atoms to send to each node */
148 int *rcount; /* The number of atoms to receive */
150 int n;
151 int nalloc;
152 rvec *x;
153 real *q;
154 rvec *f;
155 bool bSpread; /* These coordinates are used for spreading */
156 int pme_order;
157 splinevec theta,dtheta;
158 ivec *idx;
159 rvec *fractx; /* Fractional coordinate relative to the
160 * lower cell boundary
162 } pme_atomcomm_t;
164 typedef struct gmx_pme {
165 int ndecompdim; /* The number of decomposition dimensions */
166 int nodeid; /* Our nodeid in mpi->mpi_comm */
167 int nodeid_major;
168 int nodeid_minor;
169 int nnodes; /* The number of nodes doing PME */
170 int nnodes_major;
171 int nnodes_minor;
173 MPI_Comm mpi_comm;
174 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
175 #ifdef GMX_MPI
176 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
177 #endif
179 bool bPPnode; /* Node also does particle-particle forces */
180 bool bFEP; /* Compute Free energy contribution */
181 int nkx,nky,nkz; /* Grid dimensions */
182 int pme_order;
183 real epsilon_r;
185 real * pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
186 real * pmegridB;
187 int pmegrid_nx,pmegrid_ny,pmegrid_nz;
188 int pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
190 real * pmegrid_sendbuf;
191 real * pmegrid_recvbuf;
193 real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
194 real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
195 int fftgrid_nx,fftgrid_ny,fftgrid_nz;
197 t_complex *cfftgridA; /* Grids for complex FFT data */
198 t_complex *cfftgridB;
199 int cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
201 gmx_parallel_3dfft_t pfft_setupA;
202 gmx_parallel_3dfft_t pfft_setupB;
204 int *nnx,*nny,*nnz;
205 real *fshx,*fshy,*fshz;
207 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
208 matrix recipbox;
209 splinevec bsp_mod;
211 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
214 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
216 rvec *bufv; /* Communication buffer */
217 real *bufr; /* Communication buffer */
218 int buf_nalloc; /* The communication buffer size */
220 /* work data for solve_pme */
221 int work_nalloc;
222 real * work_mhx;
223 real * work_mhy;
224 real * work_mhz;
225 real * work_m2;
226 real * work_denom;
227 real * work_tmp1_alloc;
228 real * work_tmp1;
229 real * work_m2inv;
231 /* Work data for PME_redist */
232 bool redist_init;
233 int * scounts;
234 int * rcounts;
235 int * sdispls;
236 int * rdispls;
237 int * sidx;
238 int * idxa;
239 real * redist_buf;
240 int redist_buf_nalloc;
242 /* Work data for sum_qgrid */
243 real * sum_qgrid_tmp;
244 real * sum_qgrid_dd_tmp;
245 } t_gmx_pme;
248 static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc)
250 int i;
251 int *idxptr,tix,tiy,tiz;
252 real *xptr,*fptr,tx,ty,tz;
253 real rxx,ryx,ryy,rzx,rzy,rzz;
254 int nx,ny,nz;
255 int start_ix,start_iy,start_iz;
257 nx = pme->nkx;
258 ny = pme->nky;
259 nz = pme->nkz;
261 start_ix = pme->pmegrid_start_ix;
262 start_iy = pme->pmegrid_start_iy;
263 start_iz = pme->pmegrid_start_iz;
265 rxx = pme->recipbox[XX][XX];
266 ryx = pme->recipbox[YY][XX];
267 ryy = pme->recipbox[YY][YY];
268 rzx = pme->recipbox[ZZ][XX];
269 rzy = pme->recipbox[ZZ][YY];
270 rzz = pme->recipbox[ZZ][ZZ];
272 for(i=0; (i<atc->n); i++) {
273 xptr = atc->x[i];
274 idxptr = atc->idx[i];
275 fptr = atc->fractx[i];
277 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
278 tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
279 ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
280 tz = nz * ( xptr[ZZ] * rzz + 2.0 );
282 tix = (int)(tx);
283 tiy = (int)(ty);
284 tiz = (int)(tz);
286 /* Because decomposition only occurs in x and y,
287 * we never have a fraction correction in z.
289 fptr[XX] = tx - tix + pme->fshx[tix];
290 fptr[YY] = ty - tiy + pme->fshy[tiy];
291 fptr[ZZ] = tz - tiz;
293 idxptr[XX] = pme->nnx[tix];
294 idxptr[YY] = pme->nny[tiy];
295 idxptr[ZZ] = pme->nnz[tiz];
297 #ifdef DEBUG
298 range_check(idxptr[XX],0,pme->pmegrid_nx);
299 range_check(idxptr[YY],0,pme->pmegrid_ny);
300 range_check(idxptr[ZZ],0,pme->pmegrid_nz);
301 #endif
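/* Illustrative sketch (not part of the original source): the arithmetic used
 * above in calc_interpolation_idx for one particle and the x grid direction.
 * recip is assumed to hold the reciprocal of a lower-triangular GROMACS box
 * (filled in elsewhere by the caller); the function and its names are
 * hypothetical and only make the integer-index/fraction split explicit.
 */
static void example_grid_index_x(rvec x, matrix recip, int nx,
                                 int *index, real *fraction)
{
    /* Fractional coordinate along the first box vector, times nx; the +2.0
     * keeps it positive for triclinic boxes, as in calc_interpolation_idx. */
    real tx = nx*(x[XX]*recip[XX][XX] + x[YY]*recip[YY][XX] + x[ZZ]*recip[ZZ][XX] + 2.0);
    int  ti = (int)tx;

    *index    = ti % nx;    /* the real code uses the precomputed nnx[] table */
    *fraction = tx - ti;    /* plus the fshx[] shift that corrects slab rounding */
}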
305 static void pme_calc_pidx(int natoms, matrix recipbox, rvec x[],
306 pme_atomcomm_t *atc)
308 int nslab,i;
309 int si;
310 real *xptr,s;
311 real rxx,ryx,rzx,ryy,rzy;
312 int *pd,*count;
314 /* Calculate PME task index (pidx) for each grid index.
315 * Here we always assign equally sized slabs to each node
316 * for load balancing reasons (the PME grid spacing is not used).
319 nslab = atc->nslab;
320 pd = atc->pd;
321 count = atc->count;
323 /* Reset the count */
324 for(i=0; i<nslab; i++)
326 count[i] = 0;
329 if (atc->dimind == 0)
331 rxx = recipbox[XX][XX];
332 ryx = recipbox[YY][XX];
333 rzx = recipbox[ZZ][XX];
334 /* Calculate the node index in x-dimension */
335 for(i=0; (i<natoms); i++)
337 xptr = x[i];
338 /* Fractional coordinates along box vectors */
339 s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
340 si = (int)(s + 2*nslab) % nslab;
341 pd[i] = si;
342 count[si]++;
345 else
347 ryy = recipbox[YY][YY];
348 rzy = recipbox[ZZ][YY];
349 /* Calculate the node index in y-dimension */
350 for(i=0; (i<natoms); i++)
352 xptr = x[i];
353 /* Fractional coordinates along box vectors */
354 s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
355 si = (int)(s + 2*nslab) % nslab;
356 pd[i] = si;
357 count[si]++;
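/* Illustrative sketch (not part of the original source): why pme_calc_pidx
 * adds 2*nslab before taking the modulo. The fractional slab coordinate can be
 * slightly negative (or slightly above nslab) for atoms just outside the unit
 * box, and C's % operator is only well behaved for non-negative arguments, so
 * the shift guarantees a slab index in [0,nslab).
 */
static int example_slab_index(real s, int nslab)
{
    /* s is nslab times the fractional coordinate along the decomposed
     * dimension, as computed in pme_calc_pidx above */
    return (int)(s + 2*nslab) % nslab;
}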
362 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
364 int nalloc_old,i;
366 if (atc->n > atc->nalloc) {
367 nalloc_old = atc->nalloc;
368 atc->nalloc = over_alloc_dd(atc->n);
370 if (atc->nslab > 1) {
371 srenew(atc->x,atc->nalloc);
372 srenew(atc->q,atc->nalloc);
373 srenew(atc->f,atc->nalloc);
374 for(i=nalloc_old; i<atc->nalloc; i++)
376 clear_rvec(atc->f[i]);
379 if (atc->bSpread) {
380 for(i=0;i<DIM;i++) {
381 srenew(atc->theta[i] ,atc->pme_order*atc->nalloc);
382 srenew(atc->dtheta[i],atc->pme_order*atc->nalloc);
384 srenew(atc->fractx,atc->nalloc);
385 srenew(atc->idx ,atc->nalloc);
390 static void pmeredist_pd(gmx_pme_t pme, bool forw,
391 int n, bool bXF, rvec *x_f, real *charge,
392 pme_atomcomm_t *atc)
393 /* Redistribute particle data for PME calculation */
394 /* domain decomposition by x coordinate */
396 int *idxa;
397 int i, ii;
399 if(FALSE == pme->redist_init) {
400 snew(pme->scounts,atc->nslab);
401 snew(pme->rcounts,atc->nslab);
402 snew(pme->sdispls,atc->nslab);
403 snew(pme->rdispls,atc->nslab);
404 snew(pme->sidx,atc->nslab);
405 pme->redist_init = TRUE;
407 if (n > pme->redist_buf_nalloc) {
408 pme->redist_buf_nalloc = over_alloc_dd(n);
409 srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
412 pme->idxa = atc->pd;
414 #ifdef GMX_MPI
415 if (forw && bXF) {
416 /* forward, redistribution from pp to pme */
418 /* Calculate send counts and exchange them with other nodes */
419 for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
420 for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
421 MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
423 /* Calculate send and receive displacements and index into send
424 buffer */
425 pme->sdispls[0]=0;
426 pme->rdispls[0]=0;
427 pme->sidx[0]=0;
428 for(i=1; i<atc->nslab; i++) {
429 pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
430 pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
431 pme->sidx[i]=pme->sdispls[i];
433 /* Total # of particles to be received */
434 atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
436 pme_realloc_atomcomm_things(atc);
438 /* Copy particle coordinates into send buffer and exchange*/
439 for(i=0; (i<n); i++) {
440 ii=DIM*pme->sidx[pme->idxa[i]];
441 pme->sidx[pme->idxa[i]]++;
442 pme->redist_buf[ii+XX]=x_f[i][XX];
443 pme->redist_buf[ii+YY]=x_f[i][YY];
444 pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
446 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
447 pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
448 pme->rvec_mpi, atc->mpi_comm);
450 if (forw) {
451 /* Copy charge into send buffer and exchange*/
452 for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
453 for(i=0; (i<n); i++) {
454 ii=pme->sidx[pme->idxa[i]];
455 pme->sidx[pme->idxa[i]]++;
456 pme->redist_buf[ii]=charge[i];
458 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
459 atc->q, pme->rcounts, pme->rdispls, mpi_type,
460 atc->mpi_comm);
462 else { /* backward, redistribution from pme to pp */
463 MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
464 pme->redist_buf, pme->scounts, pme->sdispls,
465 pme->rvec_mpi, atc->mpi_comm);
467 /* Copy data from receive buffer */
468 for(i=0; i<atc->nslab; i++)
469 pme->sidx[i] = pme->sdispls[i];
470 for(i=0; (i<n); i++) {
471 ii = DIM*pme->sidx[pme->idxa[i]];
472 x_f[i][XX] += pme->redist_buf[ii+XX];
473 x_f[i][YY] += pme->redist_buf[ii+YY];
474 x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
475 pme->sidx[pme->idxa[i]]++;
478 #endif
481 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
482 bool bBackward,int shift,
483 void *buf_s,int nbyte_s,
484 void *buf_r,int nbyte_r)
486 #ifdef GMX_MPI
487 int dest,src;
488 MPI_Status stat;
490 if (bBackward == FALSE) {
491 dest = atc->node_dest[shift];
492 src = atc->node_src[shift];
493 } else {
494 dest = atc->node_src[shift];
495 src = atc->node_dest[shift];
498 if (nbyte_s > 0 && nbyte_r > 0) {
499 MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
500 dest,shift,
501 buf_r,nbyte_r,MPI_BYTE,
502 src,shift,
503 atc->mpi_comm,&stat);
504 } else if (nbyte_s > 0) {
505 MPI_Send(buf_s,nbyte_s,MPI_BYTE,
506 dest,shift,
507 atc->mpi_comm);
508 } else if (nbyte_r > 0) {
509 MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
510 src,shift,
511 atc->mpi_comm,&stat);
513 #endif
516 static void dd_pmeredist_x_q(gmx_pme_t pme,
517 int n, bool bX, rvec *x, real *charge,
518 pme_atomcomm_t *atc)
520 int *commnode,*buf_index;
521 int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
523 commnode = atc->node_dest;
524 buf_index = atc->buf_index;
526 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
528 nsend = 0;
529 for(i=0; i<nnodes_comm; i++) {
530 buf_index[commnode[i]] = nsend;
531 nsend += atc->count[commnode[i]];
533 if (bX) {
534 if (atc->count[atc->nodeid] + nsend != n)
535 gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
536 "This usually means that your system is not well equilibrated.",
537 n - (atc->count[atc->nodeid] + nsend),
538 pme->nodeid,'x'+atc->dimind);
540 if (nsend > pme->buf_nalloc) {
541 pme->buf_nalloc = over_alloc_dd(nsend);
542 srenew(pme->bufv,pme->buf_nalloc);
543 srenew(pme->bufr,pme->buf_nalloc);
546 atc->n = atc->count[atc->nodeid];
547 for(i=0; i<nnodes_comm; i++) {
548 scount = atc->count[commnode[i]];
549 /* Communicate the count */
550 if (debug)
551 fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
552 atc->dimind,atc->nodeid,commnode[i],scount);
553 pme_dd_sendrecv(atc,FALSE,i,
554 &scount,sizeof(int),
555 &atc->rcount[i],sizeof(int));
556 atc->n += atc->rcount[i];
559 pme_realloc_atomcomm_things(atc);
562 local_pos = 0;
563 for(i=0; i<n; i++) {
564 node = atc->pd[i];
565 if (node == atc->nodeid) {
566 /* Copy direct to the receive buffer */
567 if (bX) {
568 copy_rvec(x[i],atc->x[local_pos]);
570 atc->q[local_pos] = charge[i];
571 local_pos++;
572 } else {
573 /* Copy to the send buffer */
574 if (bX) {
575 copy_rvec(x[i],pme->bufv[buf_index[node]]);
577 pme->bufr[buf_index[node]] = charge[i];
578 buf_index[node]++;
582 buf_pos = 0;
583 for(i=0; i<nnodes_comm; i++) {
584 scount = atc->count[commnode[i]];
585 rcount = atc->rcount[i];
586 if (scount > 0 || rcount > 0) {
587 if (bX) {
588 /* Communicate the coordinates */
589 pme_dd_sendrecv(atc,FALSE,i,
590 pme->bufv[buf_pos],scount*sizeof(rvec),
591 atc->x[local_pos],rcount*sizeof(rvec));
593 /* Communicate the charges */
594 pme_dd_sendrecv(atc,FALSE,i,
595 pme->bufr+buf_pos,scount*sizeof(real),
596 atc->q+local_pos,rcount*sizeof(real));
597 buf_pos += scount;
598 local_pos += atc->rcount[i];
603 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
604 int n, rvec *f,
605 bool bAddF)
607 int *commnode,*buf_index;
608 int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node;
610 commnode = atc->node_dest;
611 buf_index = atc->buf_index;
613 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
615 local_pos = atc->count[atc->nodeid];
616 buf_pos = 0;
617 for(i=0; i<nnodes_comm; i++) {
618 scount = atc->rcount[i];
619 rcount = atc->count[commnode[i]];
620 if (scount > 0 || rcount > 0) {
621 /* Communicate the forces */
622 pme_dd_sendrecv(atc,TRUE,i,
623 atc->f[local_pos],scount*sizeof(rvec),
624 pme->bufv[buf_pos],rcount*sizeof(rvec));
625 local_pos += scount;
627 buf_index[commnode[i]] = buf_pos;
628 buf_pos += rcount;
631 local_pos = 0;
632 if (bAddF)
634 for(i=0; i<n; i++)
636 node = atc->pd[i];
637 if (node == atc->nodeid)
639 /* Add from the local force array */
640 rvec_inc(f[i],atc->f[local_pos]);
641 local_pos++;
643 else
645 /* Add from the receive buffer */
646 rvec_inc(f[i],pme->bufv[buf_index[node]]);
647 buf_index[node]++;
651 else
653 for(i=0; i<n; i++)
655 node = atc->pd[i];
656 if (node == atc->nodeid)
658 /* Copy from the local force array */
659 copy_rvec(atc->f[local_pos],f[i]);
660 local_pos++;
662 else
664 /* Copy from the receive buffer */
665 copy_rvec(pme->bufv[buf_index[node]],f[i]);
666 buf_index[node]++;
672 #ifdef GMX_MPI
673 static void
674 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
676 pme_overlap_t *overlap;
677 int send_index0,send_nindex;
678 int recv_index0,recv_nindex;
679 MPI_Status stat;
680 int i,j,k,ix,iy,iz,icnt;
681 int ipulse,send_id,recv_id,datasize;
682 real *p;
683 real *sendptr,*recvptr;
685 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
686 overlap = &pme->overlap[1];
688 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
690 /* Since we have already (un)wrapped the overlap in the z-dimension,
691 * we only have to communicate 0 to nkz (not pmegrid_nz).
693 if (direction==GMX_SUM_QGRID_FORWARD)
695 send_id = overlap->send_id[ipulse];
696 recv_id = overlap->recv_id[ipulse];
697 send_index0 = overlap->comm_data[ipulse].send_index0;
698 send_nindex = overlap->comm_data[ipulse].send_nindex;
699 recv_index0 = overlap->comm_data[ipulse].recv_index0;
700 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
702 else
704 send_id = overlap->recv_id[ipulse];
705 recv_id = overlap->send_id[ipulse];
706 send_index0 = overlap->comm_data[ipulse].recv_index0;
707 send_nindex = overlap->comm_data[ipulse].recv_nindex;
708 recv_index0 = overlap->comm_data[ipulse].send_index0;
709 recv_nindex = overlap->comm_data[ipulse].send_nindex;
712 /* Copy data to contiguous send buffer */
713 if (debug)
715 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
716 pme->nodeid,overlap->nodeid,send_id,
717 pme->pmegrid_start_iy,
718 send_index0-pme->pmegrid_start_iy,
719 send_index0-pme->pmegrid_start_iy+send_nindex);
721 icnt = 0;
722 for(i=0;i<pme->pmegrid_nx;i++)
724 ix = i;
725 for(j=0;j<send_nindex;j++)
727 iy = j + send_index0 - pme->pmegrid_start_iy;
728 for(k=0;k<pme->nkz;k++)
730 iz = k;
731 pme->pmegrid_sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
736 datasize = pme->pmegrid_nx * pme->nkz;
738 MPI_Sendrecv(pme->pmegrid_sendbuf,send_nindex*datasize,GMX_MPI_REAL,
739 send_id,ipulse,
740 pme->pmegrid_recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
741 recv_id,ipulse,
742 overlap->mpi_comm,&stat);
744 /* Get data from contiguous recv buffer */
745 if (debug)
747 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
748 pme->nodeid,overlap->nodeid,recv_id,
749 pme->pmegrid_start_iy,
750 recv_index0-pme->pmegrid_start_iy,
751 recv_index0-pme->pmegrid_start_iy+recv_nindex);
753 icnt = 0;
754 for(i=0;i<pme->pmegrid_nx;i++)
756 ix = i;
757 for(j=0;j<recv_nindex;j++)
759 iy = j + recv_index0 - pme->pmegrid_start_iy;
760 for(k=0;k<pme->nkz;k++)
762 iz = k;
763 if(direction==GMX_SUM_QGRID_FORWARD)
765 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += pme->pmegrid_recvbuf[icnt++];
767 else
769 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = pme->pmegrid_recvbuf[icnt++];
776 /* Major dimension is easier, no copying required,
777 * but we might have to sum to separate array.
778 * Since we don't copy, we have to communicate up to pmegrid_nz,
779 * not nkz as for the minor direction.
781 overlap = &pme->overlap[0];
783 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
785 if(direction==GMX_SUM_QGRID_FORWARD)
787 send_id = overlap->send_id[ipulse];
788 recv_id = overlap->recv_id[ipulse];
789 send_index0 = overlap->comm_data[ipulse].send_index0;
790 send_nindex = overlap->comm_data[ipulse].send_nindex;
791 recv_index0 = overlap->comm_data[ipulse].recv_index0;
792 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
793 recvptr = pme->pmegrid_recvbuf;
795 else
797 send_id = overlap->recv_id[ipulse];
798 recv_id = overlap->send_id[ipulse];
799 send_index0 = overlap->comm_data[ipulse].recv_index0;
800 send_nindex = overlap->comm_data[ipulse].recv_nindex;
801 recv_index0 = overlap->comm_data[ipulse].send_index0;
802 recv_nindex = overlap->comm_data[ipulse].send_nindex;
803 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
806 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
807 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
809 if (debug)
811 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
812 pme->nodeid,overlap->nodeid,send_id,
813 pme->pmegrid_start_ix,
814 send_index0-pme->pmegrid_start_ix,
815 send_index0-pme->pmegrid_start_ix+send_nindex);
816 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
817 pme->nodeid,overlap->nodeid,recv_id,
818 pme->pmegrid_start_ix,
819 recv_index0-pme->pmegrid_start_ix,
820 recv_index0-pme->pmegrid_start_ix+recv_nindex);
823 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
824 send_id,ipulse,
825 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
826 recv_id,ipulse,
827 overlap->mpi_comm,&stat);
829 /* ADD data from contiguous recv buffer */
830 if(direction==GMX_SUM_QGRID_FORWARD)
832 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
833 for(i=0;i<recv_nindex*datasize;i++)
835 p[i] += pme->pmegrid_recvbuf[i];
840 #endif
843 static int
844 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
846 ivec local_fft_ndata,local_fft_offset,local_fft_size;
847 ivec local_pme_size;
848 int i,ix,iy,iz;
849 int pmeidx,fftidx;
851 /* Dimensions should be identical for A/B grid, so we just use A here */
852 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
853 local_fft_ndata,
854 local_fft_offset,
855 local_fft_size);
857 local_pme_size[0] = pme->pmegrid_nx;
858 local_pme_size[1] = pme->pmegrid_ny;
859 local_pme_size[2] = pme->pmegrid_nz;
861 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
862 the offset is identical, and the PME grid always has more data (due to overlap)
865 #ifdef DEBUG_PME
866 FILE *fp,*fp2;
867 char fn[STRLEN],format[STRLEN];
868 real val;
869 sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
870 fp = ffopen(fn,"w");
871 sprintf(fn,"pmegrid%d.txt",pme->nodeid);
872 fp2 = ffopen(fn,"w");
873 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
874 #endif
875 for(ix=0;ix<local_fft_ndata[XX];ix++)
877 for(iy=0;iy<local_fft_ndata[YY];iy++)
879 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
881 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
882 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
883 fftgrid[fftidx] = pmegrid[pmeidx];
884 #ifdef DEBUG_PME
885 val = 100*pmegrid[pmeidx];
886 if (pmegrid[pmeidx] != 0)
887 fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
888 5.0*ix,5.0*iy,5.0*iz,1.0,val);
889 if (pmegrid[pmeidx] != 0)
890 fprintf(fp2,"%-12s %5d %5d %5d %12.5e\n",
891 "qgrid",
892 pme->pmegrid_start_ix + ix,
893 pme->pmegrid_start_iy + iy,
894 pme->pmegrid_start_iz + iz,
895 pmegrid[pmeidx]);
896 #endif
900 #ifdef DEBUG_PME
901 fclose(fp);
902 fclose(fp2);
903 #endif
905 return 0;
909 static int
910 copy_fftgrid_to_pmegrid(gmx_pme_t pme, real *fftgrid, real *pmegrid)
912 ivec local_fft_ndata,local_fft_offset,local_fft_size;
913 ivec local_pme_size;
914 int i,ix,iy,iz;
915 int pmeidx,fftidx;
917 /* Dimensions should be identical for A/B grid, so we just use A here */
918 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
919 local_fft_ndata,
920 local_fft_offset,
921 local_fft_size);
923 local_pme_size[0] = pme->pmegrid_nx;
924 local_pme_size[1] = pme->pmegrid_ny;
925 local_pme_size[2] = pme->pmegrid_nz;
927 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
928 the offset is identical, and the PME grid always has more data (due to overlap)
930 for(ix=0;ix<local_fft_ndata[XX];ix++)
932 for(iy=0;iy<local_fft_ndata[YY];iy++)
934 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
936 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
937 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
938 pmegrid[pmeidx] = fftgrid[fftidx];
942 return 0;
946 static void
947 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
949 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
951 nx = pme->nkx;
952 ny = pme->nky;
953 nz = pme->nkz;
955 pnx = pme->pmegrid_nx;
956 pny = pme->pmegrid_ny;
957 pnz = pme->pmegrid_nz;
959 overlap = pme->pme_order - 1;
961 /* Add periodic overlap in z */
962 for(ix=0; ix<pnx; ix++)
964 for(iy=0; iy<pny; iy++)
966 for(iz=0; iz<overlap; iz++)
968 pmegrid[(ix*pny+iy)*pnz+iz] +=
969 pmegrid[(ix*pny+iy)*pnz+nz+iz];
974 if (pme->nnodes_minor == 1)
976 for(ix=0; ix<pnx; ix++)
978 for(iy=0; iy<overlap; iy++)
980 for(iz=0; iz<nz; iz++)
982 pmegrid[(ix*pny+iy)*pnz+iz] +=
983 pmegrid[(ix*pny+ny+iy)*pnz+iz];
989 if (pme->nnodes_major == 1)
991 ny_x = (pme->nnodes_minor == 1 ? ny : pny);
993 for(ix=0; ix<overlap; ix++)
995 for(iy=0; iy<ny_x; iy++)
997 for(iz=0; iz<nz; iz++)
999 pmegrid[(ix*pny+iy)*pnz+iz] +=
1000 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1008 static void
1009 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1011 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
1013 nx = pme->nkx;
1014 ny = pme->nky;
1015 nz = pme->nkz;
1017 pnx = pme->pmegrid_nx;
1018 pny = pme->pmegrid_ny;
1019 pnz = pme->pmegrid_nz;
1021 overlap = pme->pme_order - 1;
1023 if (pme->nnodes_major == 1)
1025 ny_x = (pme->nnodes_minor == 1 ? ny : pny);
1027 for(ix=0; ix<overlap; ix++)
1029 for(iy=0; iy<ny_x; iy++)
1031 for(iz=0; iz<nz; iz++)
1033 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1034 pmegrid[(ix*pny+iy)*pnz+iz];
1040 if (pme->nnodes_minor == 1)
1042 for(ix=0; ix<pnx; ix++)
1044 for(iy=0; iy<overlap; iy++)
1046 for(iz=0; iz<nz; iz++)
1048 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1049 pmegrid[(ix*pny+iy)*pnz+iz];
1055 /* Copy periodic overlap in z */
1056 for(ix=0; ix<pnx; ix++)
1058 for(iy=0; iy<pny; iy++)
1060 for(iz=0; iz<overlap; iz++)
1062 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1063 pmegrid[(ix*pny+iy)*pnz+iz];
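/* Illustrative sketch (not part of the original source): the wrap/unwrap pair
 * above reduced to one dimension. A charge spread with pme_order points near
 * the upper edge writes into the (pme_order-1) extra overlap cells beyond nz;
 * wrapping folds those contributions back onto the first cells, and unwrapping
 * copies them out again before force interpolation. The function name and the
 * flat "line" array are hypothetical.
 */
static void example_wrap_unwrap_1d(real *line, int n, int overlap)
{
    int i;

    for (i = 0; i < overlap; i++)
    {
        line[i] += line[n + i];    /* wrap: fold the overlap cells back     */
    }
    for (i = 0; i < overlap; i++)
    {
        line[n + i] = line[i];     /* unwrap: duplicate them again          */
    }
}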
1070 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1071 #define DO_BSPLINE(order) \
1072 for(ithx=0; (ithx<order); ithx++) \
1074 index_x = (i0+ithx)*pny*pnz; \
1075 valx = qn*thx[ithx]; \
1077 for(ithy=0; (ithy<order); ithy++) \
1079 valxy = valx*thy[ithy]; \
1080 index_xy = index_x+(j0+ithy)*pnz; \
1082 for(ithz=0; (ithz<order); ithz++) \
1084 index_xyz = index_xy+(k0+ithz); \
1085 grid[index_xyz] += valxy*thz[ithz]; \
1091 static void spread_q_bsplines(gmx_pme_t pme, pme_atomcomm_t *atc,
1092 real *grid)
1095 /* spread charges from home atoms to local grid */
1096 pme_overlap_t *ol;
1097 int b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
1098 int * idxptr;
1099 int order,norder,index_x,index_xy,index_xyz;
1100 real valx,valxy,qn;
1101 real *thx,*thy,*thz;
1102 int localsize, bndsize;
1104 int pnx,pny,pnz,ndatatot;
1106 pnx = pme->pmegrid_nx;
1107 pny = pme->pmegrid_ny;
1108 pnz = pme->pmegrid_nz;
1109 ndatatot = pnx*pny*pnz;
1111 for(i=0;i<ndatatot;i++)
1113 grid[i] = 0;
1116 order = pme->pme_order;
1118 for(nn=0; (nn<atc->n);nn++)
1120 n = nn;
1121 qn = atc->q[n];
1123 if (qn != 0)
1125 idxptr = atc->idx[n];
1126 norder = n*order;
1128 i0 = idxptr[XX];
1129 j0 = idxptr[YY];
1130 k0 = idxptr[ZZ];
1131 thx = atc->theta[XX] + norder;
1132 thy = atc->theta[YY] + norder;
1133 thz = atc->theta[ZZ] + norder;
1135 switch (order) {
1136 case 4: DO_BSPLINE(4); break;
1137 case 5: DO_BSPLINE(5); break;
1138 default: DO_BSPLINE(order); break;
1145 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
1146 /* Calculate exponentials through SSE in float precision */
1147 #define CALC_EXPONENTIALS(start,end,r_aligned) \
1149 __m128 tmp_sse; \
1150 for(kx=0; kx<end; kx+=4) \
1152 tmp_sse = _mm_load_ps(r_aligned+kx); \
1153 tmp_sse = gmx_mm_exp_ps_lbc(tmp_sse); \
1154 _mm_store_ps(r_aligned+kx,tmp_sse); \
1157 #else
1158 #define CALC_EXPONENTIALS(start,end,r) \
1159 for(kx=start; kx<end; kx++) \
1161 r[kx] = exp(r[kx]); \
1163 #endif
1166 static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
1167 real ewaldcoeff,real vol,
1168 bool bEnerVir,real *mesh_energy,matrix vir)
1170 /* do recip sum over local cells in grid */
1171 /* y major, z middle, x minor or continuous */
1172 t_complex *p0;
1173 int kx,ky,kz,maxkx,maxky,maxkz;
1174 int nx,ny,nz,iy,iz,kxstart,kxend;
1175 real mx,my,mz;
1176 real factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1177 real ets2,struct2,vfactor,ets2vf;
1178 real eterm,d1,d2,energy=0;
1179 real by,bz;
1180 real virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
1181 real rxx,ryx,ryy,rzx,rzy,rzz;
1182 real *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*m2inv;
1183 real mhxk,mhyk,mhzk,m2k;
1184 real corner_fac;
1185 ivec complex_order;
1186 ivec local_ndata,local_offset,local_size;
1188 nx = pme->nkx;
1189 ny = pme->nky;
1190 nz = pme->nkz;
1192 /* Dimensions should be identical for A/B grid, so we just use A here */
1193 gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
1194 complex_order,
1195 local_ndata,
1196 local_offset,
1197 local_size);
1199 rxx = pme->recipbox[XX][XX];
1200 ryx = pme->recipbox[YY][XX];
1201 ryy = pme->recipbox[YY][YY];
1202 rzx = pme->recipbox[ZZ][XX];
1203 rzy = pme->recipbox[ZZ][YY];
1204 rzz = pme->recipbox[ZZ][ZZ];
1206 maxkx = (nx+1)/2;
1207 maxky = (ny+1)/2;
1208 maxkz = nz/2+1;
1210 mhx = pme->work_mhx;
1211 mhy = pme->work_mhy;
1212 mhz = pme->work_mhz;
1213 m2 = pme->work_m2;
1214 denom = pme->work_denom;
1215 tmp1 = pme->work_tmp1;
1216 m2inv = pme->work_m2inv;
1218 for(iy=0;iy<local_ndata[YY];iy++)
1220 ky = iy + local_offset[YY];
1222 if (ky < maxky)
1224 my = ky;
1226 else
1228 my = (ky - ny);
1231 by = M_PI*vol*pme->bsp_mod[YY][ky];
1233 for(iz=0;iz<local_ndata[ZZ];iz++)
1235 kz = iz + local_offset[ZZ];
1237 mz = kz;
1239 bz = pme->bsp_mod[ZZ][kz];
1241 /* 0.5 correction for corner points */
1242 corner_fac = 1;
1243 if (kz == 0)
1244 corner_fac = 0.5;
1245 if (kz == (nz+1)/2)
1246 corner_fac = 0.5;
1248 p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
1250 /* We should skip the k-space point (0,0,0) */
1251 if (local_offset[XX] > 0 ||
1252 local_offset[YY] > 0 || ky > 0 ||
1253 kz > 0)
1255 kxstart = local_offset[XX];
1257 else
1259 kxstart = local_offset[XX] + 1;
1260 p0++;
1262 kxend = local_offset[XX] + local_ndata[XX];
1264 if (bEnerVir)
1266 /* More expensive inner loop, especially because of the storage
1267 * of the mh elements in arrays.
1268 * Because x is the minor grid index, all mh elements
1269 * depend on kx for triclinic unit cells.
1272 /* Two explicit loops to avoid a conditional inside the loop */
1273 for(kx=kxstart; kx<maxkx; kx++)
1275 mx = kx;
1277 mhxk = mx * rxx;
1278 mhyk = mx * ryx + my * ryy;
1279 mhzk = mx * rzx + my * rzy + mz * rzz;
1280 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1281 mhx[kx] = mhxk;
1282 mhy[kx] = mhyk;
1283 mhz[kx] = mhzk;
1284 m2[kx] = m2k;
1285 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1286 tmp1[kx] = -factor*m2k;
1289 for(kx=maxkx; kx<kxend; kx++)
1291 mx = (kx - nx);
1293 mhxk = mx * rxx;
1294 mhyk = mx * ryx + my * ryy;
1295 mhzk = mx * rzx + my * rzy + mz * rzz;
1296 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1297 mhx[kx] = mhxk;
1298 mhy[kx] = mhyk;
1299 mhz[kx] = mhzk;
1300 m2[kx] = m2k;
1301 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1302 tmp1[kx] = -factor*m2k;
1305 for(kx=kxstart; kx<kxend; kx++)
1307 m2inv[kx] = 1.0/m2[kx];
1309 for(kx=kxstart; kx<kxend; kx++)
1311 denom[kx] = 1.0/denom[kx];
1314 CALC_EXPONENTIALS(kxstart,kxend,tmp1);
1316 for(kx=kxstart; kx<kxend; kx++,p0++)
1318 d1 = p0->re;
1319 d2 = p0->im;
1321 eterm = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
1323 p0->re = d1*eterm;
1324 p0->im = d2*eterm;
1326 struct2 = 2.0*(d1*d1+d2*d2);
1328 tmp1[kx] = eterm*struct2;
1331 for(kx=kxstart; kx<kxend; kx++)
1333 ets2 = corner_fac*tmp1[kx];
1334 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
1335 energy += ets2;
1337 ets2vf = ets2*vfactor;
1338 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
1339 virxy += ets2vf*mhx[kx]*mhy[kx];
1340 virxz += ets2vf*mhx[kx]*mhz[kx];
1341 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
1342 viryz += ets2vf*mhy[kx]*mhz[kx];
1343 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
1346 else
1348 /* We don't need to calculate the energy and the virial.
1349 * In this case the triclinic overhead is small.
1352 /* Two explicit loops to avoid a conditional inside the loop */
1354 for(kx=kxstart; kx<maxkx; kx++)
1356 mx = kx;
1358 mhxk = mx * rxx;
1359 mhyk = mx * ryx + my * ryy;
1360 mhzk = mx * rzx + my * rzy + mz * rzz;
1361 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1362 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1363 tmp1[kx] = -factor*m2k;
1366 for(kx=maxkx; kx<kxend; kx++)
1368 mx = (kx - nx);
1370 mhxk = mx * rxx;
1371 mhyk = mx * ryx + my * ryy;
1372 mhzk = mx * rzx + my * rzy + mz * rzz;
1373 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1374 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1375 tmp1[kx] = -factor*m2k;
1378 for(kx=kxstart; kx<kxend; kx++)
1380 denom[kx] = 1.0/denom[kx];
1383 CALC_EXPONENTIALS(kxstart,kxend,tmp1);
1385 for(kx=kxstart; kx<kxend; kx++,p0++)
1387 d1 = p0->re;
1388 d2 = p0->im;
1390 eterm = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
1392 p0->re = d1*eterm;
1393 p0->im = d2*eterm;
1399 if (bEnerVir)
1401 /* Update virial with local values.
1402 * The virial is symmetric by definition.
1403 * this virial seems ok for isotropic scaling, but I'm
1404 * experiencing problems on semiisotropic membranes.
1405 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
1407 vir[XX][XX] = 0.25*virxx;
1408 vir[YY][YY] = 0.25*viryy;
1409 vir[ZZ][ZZ] = 0.25*virzz;
1410 vir[XX][YY] = vir[YY][XX] = 0.25*virxy;
1411 vir[XX][ZZ] = vir[ZZ][XX] = 0.25*virxz;
1412 vir[YY][ZZ] = vir[ZZ][YY] = 0.25*viryz;
1414 /* This energy should be corrected for a charged system */
1415 *mesh_energy = 0.5*energy;
1418 /* Return the loop count */
1419 return local_ndata[YY]*local_ndata[ZZ]*local_ndata[XX];
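/* Illustrative sketch (not part of the original source): the per-k-vector
 * factor applied in solve_pme_yzx above. For a reciprocal vector of squared
 * length m2 the grid value is multiplied by
 *   coulomb * exp(-pi^2*m2/beta^2) / (pi * V * m2 * Bx*By*Bz)
 * where Bx,By,Bz are the B-spline moduli and coulomb stands for
 * ONE_4PI_EPS0/epsilon_r. All quantities are passed in explicitly so the
 * sketch stays self-contained; the function name is hypothetical.
 */
static real example_solve_factor(real coulomb, real ewaldcoeff, real vol,
                                 real m2, real bsp_mod_prod)
{
    real factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);

    /* eterm in solve_pme_yzx: exp(-factor*m2)/denom with
     * denom = m2 * (M_PI*vol*bsp_mod_y) * bsp_mod_z * bsp_mod_x */
    return coulomb*exp(-factor*m2)/(M_PI*vol*m2*bsp_mod_prod);
}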
1423 #define DO_FSPLINE(order) \
1424 for(ithx=0; (ithx<order); ithx++) \
1426 index_x = (i0+ithx)*pny*pnz; \
1427 tx = thx[ithx]; \
1428 dx = dthx[ithx]; \
1430 for(ithy=0; (ithy<order); ithy++) \
1432 index_xy = index_x+(j0+ithy)*pnz; \
1433 ty = thy[ithy]; \
1434 dy = dthy[ithy]; \
1435 fxy1 = fz1 = 0; \
1437 for(ithz=0; (ithz<order); ithz++) \
1439 gval = grid[index_xy+(k0+ithz)]; \
1440 fxy1 += thz[ithz]*gval; \
1441 fz1 += dthz[ithz]*gval; \
1443 fx += dx*ty*fxy1; \
1444 fy += tx*dy*fxy1; \
1445 fz += tx*ty*fz1; \
1450 void gather_f_bsplines(gmx_pme_t pme,real *grid,
1451 bool bClearF,pme_atomcomm_t *atc,real scale)
1453 /* sum forces for local particles */
1454 int nn,n,ithx,ithy,ithz,i0,j0,k0;
1455 int index_x,index_xy;
1456 int nx,ny,nz,pnx,pny,pnz;
1457 int * idxptr;
1458 real tx,ty,dx,dy,qn;
1459 real fx,fy,fz,gval;
1460 real fxy1,fz1;
1461 real *thx,*thy,*thz,*dthx,*dthy,*dthz;
1462 int norder;
1463 real rxx,ryx,ryy,rzx,rzy,rzz;
1464 int order;
1466 order = pme->pme_order;
1467 thx = atc->theta[XX];
1468 thy = atc->theta[YY];
1469 thz = atc->theta[ZZ];
1470 dthx = atc->dtheta[XX];
1471 dthy = atc->dtheta[YY];
1472 dthz = atc->dtheta[ZZ];
1473 nx = pme->nkx;
1474 ny = pme->nky;
1475 nz = pme->nkz;
1476 pnx = pme->pmegrid_nx;
1477 pny = pme->pmegrid_ny;
1478 pnz = pme->pmegrid_nz;
1480 rxx = pme->recipbox[XX][XX];
1481 ryx = pme->recipbox[YY][XX];
1482 ryy = pme->recipbox[YY][YY];
1483 rzx = pme->recipbox[ZZ][XX];
1484 rzy = pme->recipbox[ZZ][YY];
1485 rzz = pme->recipbox[ZZ][ZZ];
1487 for(nn=0; (nn<atc->n); nn++)
1489 n = nn;
1490 qn = scale*atc->q[n];
1492 if (bClearF)
1494 atc->f[n][XX] = 0;
1495 atc->f[n][YY] = 0;
1496 atc->f[n][ZZ] = 0;
1498 if (qn != 0)
1500 fx = 0;
1501 fy = 0;
1502 fz = 0;
1503 idxptr = atc->idx[n];
1504 norder = n*order;
1506 i0 = idxptr[XX];
1507 j0 = idxptr[YY];
1508 k0 = idxptr[ZZ];
1510 /* Pointer arithmetic alert, next six statements */
1511 thx = atc->theta[XX] + norder;
1512 thy = atc->theta[YY] + norder;
1513 thz = atc->theta[ZZ] + norder;
1514 dthx = atc->dtheta[XX] + norder;
1515 dthy = atc->dtheta[YY] + norder;
1516 dthz = atc->dtheta[ZZ] + norder;
1518 switch (order) {
1519 case 4: DO_FSPLINE(4); break;
1520 case 5: DO_FSPLINE(5); break;
1521 default: DO_FSPLINE(order); break;
1524 atc->f[n][XX] += -qn*( fx*nx*rxx );
1525 atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
1526 atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
1529 /* Since the energy and not forces are interpolated
1530 * the net force might not be exactly zero.
1531 * This can be solved by also interpolating F, but
1532 * that comes at a cost.
1533 * A better hack is to remove the net force every
1534 * step, but that must be done at a higher level
1535 * since this routine doesn't see all atoms if running
1536 * in parallel. Don't know how important it is? EL 990726
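/* Illustrative sketch (not part of the original source): the chain rule used
 * at the end of gather_f_bsplines above. fx,fy,fz are the derivatives of the
 * interpolated potential with respect to the (scaled) fractional grid
 * coordinates, so the Cartesian force on charge q follows from
 * d(fraction_d)/d(r_alpha) = recip[alpha][d] times the grid dimension n_d.
 * Terms with the upper-triangular recip elements vanish because GROMACS boxes
 * are lower triangular. The function name is hypothetical.
 */
static void example_force_from_spline_derivs(real q, real fx, real fy, real fz,
                                             int nx, int ny, int nz,
                                             matrix recip, rvec f)
{
    f[XX] += -q*( fx*nx*recip[XX][XX] );
    f[YY] += -q*( fx*nx*recip[YY][XX] + fy*ny*recip[YY][YY] );
    f[ZZ] += -q*( fx*nx*recip[ZZ][XX] + fy*ny*recip[ZZ][YY] + fz*nz*recip[ZZ][ZZ] );
}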
1540 static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
1541 pme_atomcomm_t *atc)
1543 int n,ithx,ithy,ithz,i0,j0,k0;
1544 int index_x,index_xy;
1545 int * idxptr;
1546 real energy,pot,tx,ty,qn,gval;
1547 real *thx,*thy,*thz;
1548 int norder;
1549 int order;
1552 order = pme->pme_order;
1553 thx = atc->theta[XX];
1554 thy = atc->theta[YY];
1555 thz = atc->theta[ZZ];
1557 energy = 0;
1558 for(n=0; (n<atc->n); n++) {
1559 qn = atc->q[n];
1561 if (qn != 0) {
1562 idxptr = atc->idx[n];
1563 norder = n*order;
1565 i0 = idxptr[XX];
1566 j0 = idxptr[YY];
1567 k0 = idxptr[ZZ];
1569 /* Pointer arithmetic alert, next three statements */
1570 thx = atc->theta[XX] + norder;
1571 thy = atc->theta[YY] + norder;
1572 thz = atc->theta[ZZ] + norder;
1574 pot = 0;
1575 for(ithx=0; (ithx<order); ithx++)
1577 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
1578 tx = thx[ithx];
1580 for(ithy=0; (ithy<order); ithy++)
1582 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
1583 ty = thy[ithy];
1585 for(ithz=0; (ithz<order); ithz++)
1587 gval = grid[index_xy+(k0+ithz)];
1588 pot += tx*ty*thz[ithz]*gval;
1594 energy += pot*qn;
1598 return energy;
1601 void make_bsplines(splinevec theta,splinevec dtheta,int order,
1602 rvec fractx[],int nr,real charge[],
1603 bool bFreeEnergy)
1605 /* construct splines for local atoms */
1606 int i,j,k,l;
1607 real dr,div;
1608 real *data,*ddata,*xptr;
1610 for(i=0; (i<nr); i++) {
1611 /* With free energy we do not use the charge check.
1612 * In most cases this will be more efficient than calling make_bsplines
1613 * twice, since usually more than half the particles have charges.
1615 if (bFreeEnergy || charge[i] != 0.0) {
1616 xptr = fractx[i];
1617 for(j=0; (j<DIM); j++) {
1618 dr = xptr[j];
1620 /* dr is relative offset from lower cell limit */
1621 data=&(theta[j][i*order]);
1622 data[order-1]=0;
1623 data[1]=dr;
1624 data[0]=1-dr;
1626 for(k=3; (k<order); k++) {
1627 div=1.0/(k-1.0);
1628 data[k-1]=div*dr*data[k-2];
1629 for(l=1; (l<(k-1)); l++) {
1630 data[k-l-1]=div*((dr+l)*data[k-l-2]+(k-l-dr)*
1631 data[k-l-1]);
1633 data[0]=div*(1-dr)*data[0];
1635 /* differentiate */
1636 ddata = &(dtheta[j][i*order]);
1637 ddata[0] = -data[0];
1638 for(k=1; (k<order); k++) {
1639 ddata[k]=data[k-1]-data[k];
1642 div=1.0/(order-1);
1643 data[order-1]=div*dr*data[order-2];
1644 for(l=1; (l<(order-1)); l++) {
1645 data[order-l-1]=div*((dr+l)*data[order-l-2]+
1646 (order-l-dr)*data[order-l-1]);
1648 data[0]=div*(1-dr)*data[0];
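/* Illustrative sketch (not part of the original source): the recursion used in
 * make_bsplines, collected into one routine. For a fractional offset dr it
 * produces the "order" cardinal B-spline weights (order >= 4 in actual use,
 * w must hold at least order elements). Running k up to and including "order"
 * is equivalent to the loop above plus the final step after differentiation.
 * Because the weights form a partition of unity they always sum to 1, which
 * is a convenient sanity check.
 */
static void example_bspline_weights(real dr, int order, real *w)
{
    int  k, l;
    real div;

    for (k = 0; k < order; k++)
    {
        w[k] = 0;
    }
    w[1] = dr;
    w[0] = 1 - dr;

    for (k = 3; k <= order; k++)
    {
        div    = 1.0/(k - 1.0);
        w[k-1] = div*dr*w[k-2];
        for (l = 1; l < k-1; l++)
        {
            w[k-l-1] = div*((dr + l)*w[k-l-2] + (k - l - dr)*w[k-l-1]);
        }
        w[0] = div*(1 - dr)*w[0];
    }
}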
1655 void make_dft_mod(real *mod,real *data,int ndata)
1657 int i,j;
1658 real sc,ss,arg;
1660 for(i=0;i<ndata;i++) {
1661 sc=ss=0;
1662 for(j=0;j<ndata;j++) {
1663 arg=(2.0*M_PI*i*j)/ndata;
1664 sc+=data[j]*cos(arg);
1665 ss+=data[j]*sin(arg);
1667 mod[i]=sc*sc+ss*ss;
1669 for(i=0;i<ndata;i++)
1670 if(mod[i]<1e-7)
1671 mod[i]=(mod[i-1]+mod[i+1])*0.5;
1676 void make_bspline_moduli(splinevec bsp_mod,int nx,int ny,int nz,int order)
1678 int nmax=max(nx,max(ny,nz));
1679 real *data,*ddata,*bsp_data;
1680 int i,k,l;
1681 real div;
1683 snew(data,order);
1684 snew(ddata,order);
1685 snew(bsp_data,nmax);
1687 data[order-1]=0;
1688 data[1]=0;
1689 data[0]=1;
1691 for(k=3;k<order;k++) {
1692 div=1.0/(k-1.0);
1693 data[k-1]=0;
1694 for(l=1;l<(k-1);l++)
1695 data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
1696 data[0]=div*data[0];
1698 /* differentiate */
1699 ddata[0]=-data[0];
1700 for(k=1;k<order;k++)
1701 ddata[k]=data[k-1]-data[k];
1702 div=1.0/(order-1);
1703 data[order-1]=0;
1704 for(l=1;l<(order-1);l++)
1705 data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
1706 data[0]=div*data[0];
1708 for(i=0;i<nmax;i++)
1709 bsp_data[i]=0;
1710 for(i=1;i<=order;i++)
1711 bsp_data[i]=data[i-1];
1713 make_dft_mod(bsp_mod[XX],bsp_data,nx);
1714 make_dft_mod(bsp_mod[YY],bsp_data,ny);
1715 make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
1717 sfree(data);
1718 sfree(ddata);
1719 sfree(bsp_data);
1722 static void setup_coordinate_communication(pme_atomcomm_t *atc)
1724 int nslab,n,i;
1725 int fw,bw;
1727 nslab = atc->nslab;
1729 n = 0;
1730 for(i=1; i<=nslab/2; i++) {
1731 fw = (atc->nodeid + i) % nslab;
1732 bw = (atc->nodeid - i + nslab) % nslab;
1733 if (n < nslab - 1) {
1734 atc->node_dest[n] = fw;
1735 atc->node_src[n] = bw;
1736 n++;
1738 if (n < nslab - 1) {
1739 atc->node_dest[n] = bw;
1740 atc->node_src[n] = fw;
1741 n++;
1746 int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
1748 if(NULL != log)
1750 fprintf(log,"Destroying PME data structures.\n");
1753 sfree((*pmedata)->nnx);
1754 sfree((*pmedata)->nny);
1755 sfree((*pmedata)->nnz);
1757 sfree((*pmedata)->pmegridA);
1758 sfree((*pmedata)->fftgridA);
1759 sfree((*pmedata)->cfftgridA);
1760 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
1762 if((*pmedata)->pmegridB)
1764 sfree((*pmedata)->pmegridB);
1765 sfree((*pmedata)->fftgridB);
1766 sfree((*pmedata)->cfftgridB);
1767 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
1769 sfree((*pmedata)->work_mhz);
1770 sfree((*pmedata)->work_m2);
1771 sfree((*pmedata)->work_denom);
1772 sfree((*pmedata)->work_tmp1_alloc);
1773 sfree((*pmedata)->work_m2inv);
1775 sfree(*pmedata);
1776 *pmedata = NULL;
1778 return 0;
1781 static int mult_up(int n,int f)
1783 return ((n + f - 1)/f)*f;
1787 static double pme_load_imbalance(gmx_pme_t pme)
1789 int nma,nmi;
1790 double n1,n2,n3;
1792 nma = pme->nnodes_major;
1793 nmi = pme->nnodes_minor;
1795 n1 = mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
1796 n2 = mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
1797 n3 = mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;
1799 /* pme_solve is roughly double the cost of an fft */
1801 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
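/* Illustrative sketch (not part of the original source): the padding that
 * drives pme_load_imbalance above. Each FFT/solve stage works on the grid
 * rounded up to multiples of the node counts, e.g. mult_up(25,4) = 28, so a
 * hypothetical 25x25x25 grid on 4x3 PME nodes is treated as 28x27x25 in the
 * first stage; the ratio of padded to real work is the reported imbalance.
 */
static double example_padded_volume(int nkx, int nky, int nkz,
                                    int nnodes_major, int nnodes_minor)
{
    /* grid volume after rounding the two decomposed dimensions up to
     * multiples of the node counts, using mult_up() defined above */
    return (double)mult_up(nkx, nnodes_major)*mult_up(nky, nnodes_minor)*nkz;
}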
1804 static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
1805 int dimind,bool bSpread)
1807 int nk,k,s;
1809 atc->dimind = dimind;
1810 atc->nslab = 1;
1811 atc->nodeid = 0;
1812 atc->pd_nalloc = 0;
1813 #ifdef GMX_MPI
1814 if (pme->nnodes > 1)
1816 atc->mpi_comm = pme->mpi_comm_d[dimind];
1817 MPI_Comm_size(atc->mpi_comm,&atc->nslab);
1818 MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
1820 if (debug)
1822 fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);
1824 #endif
1826 atc->bSpread = bSpread;
1827 atc->pme_order = pme->pme_order;
1829 if (atc->nslab > 1)
1831 /* These three allocations are not required for particle decomp. */
1832 snew(atc->node_dest,atc->nslab);
1833 snew(atc->node_src,atc->nslab);
1834 setup_coordinate_communication(atc);
1836 snew(atc->count,atc->nslab);
1837 snew(atc->rcount,atc->nslab);
1838 snew(atc->buf_index,atc->nslab);
1842 static void
1843 init_overlap_comm(pme_overlap_t * ol,
1844 int norder,
1845 #ifdef GMX_MPI
1846 MPI_Comm comm,
1847 #endif
1848 int nnodes,
1849 int nodeid,
1850 int ndata)
1852 int lbnd,rbnd,maxlr,b,i;
1853 int exten;
1854 int nn,nk;
1855 pme_grid_comm_t *pgc;
1856 bool bCont;
1857 int fft_start,fft_end,send_index1,recv_index1;
1859 #ifdef GMX_MPI
1860 ol->mpi_comm = comm;
1861 #endif
1863 ol->nnodes = nnodes;
1864 ol->nodeid = nodeid;
1866 ol->ndata = ndata;
1868 /* Linear translation of the PME grid won't affect reciprocal space
1869 * calculations, so to optimize we only interpolate "upwards",
1870 * which also means we only have to consider overlap in one direction.
1871 * I.e., particles on this node might also be spread to grid indices
1872 * that belong to higher nodes (modulo nnodes)
1875 snew(ol->s2g0,ol->nnodes+1);
1876 snew(ol->s2g1,ol->nnodes);
1877 if (debug) { fprintf(debug,"PME slab boundaries:"); }
1878 for(i=0; i<nnodes; i++)
1880 /* s2g0 the local interpolation grid start.
1881 * s2g1 the local interpolation grid end.
1882 * Because grid overlap communication only goes forward,
1883 * the grid slabs for the FFTs should be rounded down.
1885 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
1886 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
1888 if (debug)
1890 fprintf(debug," %3d %3d",ol->s2g0[i],ol->s2g1[i]);
1893 ol->s2g0[nnodes] = ndata;
1894 if (debug) { fprintf(debug,"\n"); }
1896 /* Determine with how many nodes we need to communicate the grid overlap */
1897 b = 0;
1900 b++;
1901 bCont = FALSE;
1902 for(i=0; i<nnodes; i++)
1904 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
1905 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
1907 bCont = TRUE;
1911 while (bCont && b < nnodes);
1912 ol->noverlap_nodes = b - 1;
1914 snew(ol->send_id,ol->noverlap_nodes);
1915 snew(ol->recv_id,ol->noverlap_nodes);
1916 for(b=0; b<ol->noverlap_nodes; b++)
1918 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
1919 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
1921 snew(ol->comm_data, ol->noverlap_nodes);
1923 for(b=0; b<ol->noverlap_nodes; b++)
1925 pgc = &ol->comm_data[b];
1926 /* Send */
1927 fft_start = ol->s2g0[ol->send_id[b]];
1928 fft_end = ol->s2g0[ol->send_id[b]+1];
1929 if (ol->send_id[b] < nodeid)
1931 fft_start += ol->ndata;
1932 fft_end += ol->ndata;
1934 send_index1 = ol->s2g1[nodeid];
1935 send_index1 = min(send_index1,fft_end);
1936 pgc->send_index0 = fft_start;
1937 pgc->send_nindex = max(0,send_index1 - pgc->send_index0);
1939 /* We always start receiving to the first index of our slab */
1940 fft_start = ol->s2g0[ol->nodeid];
1941 fft_end = ol->s2g0[ol->nodeid+1];
1942 recv_index1 = ol->s2g1[ol->recv_id[b]];
1943 if (ol->recv_id[b] > nodeid)
1945 recv_index1 -= ol->ndata;
1947 recv_index1 = min(recv_index1,fft_end);
1948 pgc->recv_index0 = fft_start;
1949 pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
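/* Illustrative sketch (not part of the original source): the slab boundaries
 * set up in init_overlap_comm above. s2g0[i] is the first grid line owned by
 * slab i and s2g1[i] the (exclusive) end of its interpolation grid, which
 * includes the norder-1 extra lines that must be communicated "upwards".
 * For example, ndata=30, nnodes=4, norder=4 gives s2g0 = {0,7,15,22} and
 * s2g1 = {11,18,26,33}, so a slab overlaps the next one by up to 4 lines.
 * s2g0 must hold nnodes+1 entries, s2g1 nnodes entries.
 */
static void example_slab_bounds(int ndata, int nnodes, int norder,
                                int *s2g0, int *s2g1)
{
    int i;

    for (i = 0; i < nnodes; i++)
    {
        s2g0[i] = ( i   *ndata             )/nnodes;
        s2g1[i] = ((i+1)*ndata + nnodes - 1)/nnodes + norder - 1;
    }
    s2g0[nnodes] = ndata;
}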
1953 static void
1954 make_gridindex5_to_localindex(int n,int local_start,int local_end,
1955 int **global_to_local,
1956 real **fraction_shift)
1958 int local_size,i;
1959 int * gtl;
1960 real * fsh;
1962 local_size = local_end - local_start;
1964 snew(gtl,5*n);
1965 snew(fsh,5*n);
1966 for(i=0; (i<5*n); i++) {
1967 /* Determine the global to local grid index */
1968 gtl[i] = (i - local_start + n) % n;
1969 /* For coordinates that fall within the local grid the fraction
1970 * is correct, we don't need to shift it.
1972 fsh[i] = 0;
1973 if (local_size < n)
1975 /* Due to rounding issues i could be 1 beyond the lower or
1976 * upper boundary of the local grid. Correct the index for this.
1977 * If we shift the index, we need to shift the fraction by
1978 * the same amount in the other direction to not affect
1979 * the weights.
1980 * Note that due to this shifting the weights at the end of
1981 * the spline might change, but that will only involve values
1982 * between zero and values close to the precision of a real,
1983 * which is anyhow the accuracy of the whole mesh calculation.
1985 /* With local_size=0 we should not change i=local_start */
1986 if (i % n != local_start)
1988 if (gtl[i] == n-1)
1990 gtl[i] = 0;
1991 fsh[i] = -1;
1993 else if (gtl[i] == local_size)
1995 gtl[i] = local_size - 1;
1996 fsh[i] = 1;
2002 *global_to_local = gtl;
2003 *fraction_shift = fsh;
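/* Illustrative sketch (not part of the original source): why the lookup tables
 * above have 5*n entries. calc_interpolation_idx adds 2.0 to the fractional
 * coordinate before scaling by n, so the raw integer index typically lands in
 * [2n,3n) and can drift slightly outside; a table covering [0,5n) safely holds
 * every such index. In the single-node case (local grid == full grid) the
 * table reduces to a plain modulo and the fraction shift stays zero.
 */
static int example_wraparound_index(int raw_index, int n)
{
    /* equivalent of the nnx/nny/nnz tables for local_start=0, local_size=n */
    return raw_index % n;
}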
2006 static void
2007 gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
2009 int nk_new;
2011 if (*nk % nnodes != 0)
2013 nk_new = nnodes*(*nk/nnodes + 1);
2015 if (2*nk_new >= 3*(*nk))
2017 gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
2018 dim,*nk,dim,nnodes,dim);
2021 if (fplog != NULL)
2023 fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
2024 dim,*nk,dim,nnodes,dim,nk_new,dim);
2027 *nk = nk_new;
2031 int gmx_pme_init(gmx_pme_t * pmedata,
2032 t_commrec * cr,
2033 int nnodes_major,
2034 int nnodes_minor,
2035 t_inputrec * ir,
2036 int homenr,
2037 bool bFreeEnergy,
2038 bool bReproducible)
2040 gmx_pme_t pme=NULL;
2042 pme_atomcomm_t *atc;
2043 int bufsizex,bufsizey,bufsize;
2044 ivec ndata;
2046 if (debug)
2047 fprintf(debug,"Creating PME data structures.\n");
2048 snew(pme,1);
2050 pme->redist_init = FALSE;
2051 pme->sum_qgrid_tmp = NULL;
2052 pme->sum_qgrid_dd_tmp = NULL;
2053 pme->buf_nalloc = 0;
2054 pme->redist_buf_nalloc = 0;
2056 pme->nnodes = 1;
2057 pme->bPPnode = TRUE;
2059 pme->nnodes_major = nnodes_major;
2060 pme->nnodes_minor = nnodes_minor;
2062 #ifdef GMX_MPI
2063 if (PAR(cr))
2065 pme->mpi_comm = cr->mpi_comm_mygroup;
2067 MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
2068 MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
2070 #endif
2072 if (pme->nnodes == 1)
2074 pme->ndecompdim = 0;
2075 pme->nodeid_major = 0;
2076 pme->nodeid_minor = 0;
2078 else
2080 if (nnodes_minor == 1)
2082 #ifdef GMX_MPI
2083 pme->mpi_comm_d[0] = pme->mpi_comm;
2084 pme->mpi_comm_d[1] = NULL;
2085 #endif
2086 pme->ndecompdim = 1;
2087 pme->nodeid_major = pme->nodeid;
2088 pme->nodeid_minor = 0;
2091 else if (nnodes_major == 1)
2093 #ifdef GMX_MPI
2094 pme->mpi_comm_d[0] = NULL;
2095 pme->mpi_comm_d[1] = pme->mpi_comm;
2096 #endif
2097 pme->ndecompdim = 1;
2098 pme->nodeid_major = 0;
2099 pme->nodeid_minor = pme->nodeid;
2101 else
2103 if (pme->nnodes % nnodes_major != 0)
2105 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
2107 pme->ndecompdim = 2;
2109 #ifdef GMX_MPI
2110 MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
2111 pme->nodeid,&pme->mpi_comm_d[0]); /* My communicator along major dimension */
2112 MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
2113 pme->nodeid,&pme->mpi_comm_d[1]); /* My communicator along minor dimension */
2115 MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
2116 MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
2117 MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
2118 MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
2119 #endif
2121 pme->bPPnode = (cr->duty & DUTY_PP);
2124 if (ir->ePBC == epbcSCREW)
2126 gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
2129 pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
2130 pme->nkx = ir->nkx;
2131 pme->nky = ir->nky;
2132 pme->nkz = ir->nkz;
2133 pme->pme_order = ir->pme_order;
2134 pme->epsilon_r = ir->epsilon_r;
2136 /* Currently pme.c supports only the fft5d FFT code.
2137 * Therefore the grid always needs to be divisible by nnodes.
2138 * When the old 1D code is also supported again, change this check.
2140 * This check should be done before calling gmx_pme_init
2141 * and fplog should be passed instead of stderr.
2143 if (pme->ndecompdim >= 2)
2145 if (pme->ndecompdim >= 1)
2148 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
2149 'x',nnodes_major,&pme->nkx);
2150 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
2151 'y',nnodes_minor,&pme->nky);
2155 if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
2156 pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
2157 pme->nkz <= pme->pme_order)
2159 gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_order for x and/or y",pme->pme_order);
2162 if (pme->nnodes > 1) {
2163 double imbal;
2165 #ifdef GMX_MPI
2166 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
2167 MPI_Type_commit(&(pme->rvec_mpi));
2168 #endif
2170 /* Note that the charge spreading and force gathering, which usually
2171 * takes about the same amount of time as FFT+solve_pme,
2172 * is always fully load balanced
2173 * (unless the charge distribution is inhomogeneous).
2176 imbal = pme_load_imbalance(pme);
2177 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
2179 fprintf(stderr,
2180 "\n"
2181 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
2182 " For optimal PME load balancing\n"
2183 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
2184 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
2185 "\n",
2186 (int)((imbal-1)*100 + 0.5),
2187 pme->nkx,pme->nky,pme->nnodes_major,
2188 pme->nky,pme->nkz,pme->nnodes_minor);
2192 init_overlap_comm(&pme->overlap[0],pme->pme_order,
2193 #ifdef GMX_MPI
2194 pme->mpi_comm_d[0],
2195 #endif
2196 pme->nnodes_major,pme->nodeid_major,pme->nkx);
2198 init_overlap_comm(&pme->overlap[1],pme->pme_order,
2199 #ifdef GMX_MPI
2200 pme->mpi_comm_d[1],
2201 #endif
2202 pme->nnodes_minor,pme->nodeid_minor,pme->nky);
2204 snew(pme->bsp_mod[XX],pme->nkx);
2205 snew(pme->bsp_mod[YY],pme->nky);
2206 snew(pme->bsp_mod[ZZ],pme->nkz);
2208 /* Allocate data for the interpolation grid, including overlap */
2209 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
2210 pme->overlap[0].s2g0[pme->nodeid_major];
2211 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
2212 pme->overlap[1].s2g0[pme->nodeid_minor];
2213 pme->pmegrid_nz = pme->nkz + pme->pme_order - 1;
2215 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
2216 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
2217 pme->pmegrid_start_iz = 0;
2219 make_gridindex5_to_localindex(pme->nkx,
2220 pme->pmegrid_start_ix,
2221 pme->overlap[0].s2g0[pme->nodeid_major+1],
2222 &pme->nnx,&pme->fshx);
2223 make_gridindex5_to_localindex(pme->nky,
2224 pme->pmegrid_start_iy,
2225 pme->overlap[1].s2g0[pme->nodeid_minor+1],
2226 &pme->nny,&pme->fshy);
2227 make_gridindex5_to_localindex(pme->nkz,
2228 pme->pmegrid_start_iz,
2229 pme->nkz,
2230 &pme->nnz,&pme->fshz);
2232 snew(pme->pmegridA,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
2234 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
2235 bufsizex = pme->pme_order*pme->pmegrid_ny*pme->pmegrid_nz;
2236 bufsizey = pme->pmegrid_nx*pme->pme_order*pme->nkz;
2237 bufsize = (bufsizex>bufsizey) ? bufsizex : bufsizey;
2239 snew(pme->pmegrid_sendbuf,bufsize);
2240 snew(pme->pmegrid_recvbuf,bufsize);
2242 ndata[0] = pme->nkx;
2243 ndata[1] = pme->nky;
2244 ndata[2] = pme->nkz;
2246 /* This routine will allocate the grid data to fit the FFTs */
2247 gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
2248 &pme->fftgridA,&pme->cfftgridA,
2249 pme->mpi_comm_d,
2250 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
2251 bReproducible);
2253 if (bFreeEnergy)
2255 snew(pme->pmegridB,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
2256 gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
2257 &pme->fftgridB,&pme->cfftgridB,
2258 pme->mpi_comm_d,
2259 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
2260 bReproducible);
2261 } else
2263 pme->pmegridB = NULL;
2264 pme->fftgridB = NULL;
2265 pme->cfftgridB = NULL;
2268 make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
2270 /* Use atc[0] for spreading */
2271 init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
2272 if (pme->ndecompdim >= 2)
2274 init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
2277 if (pme->nnodes == 1) {
2278 pme->atc[0].n = homenr;
2279 pme_realloc_atomcomm_things(&pme->atc[0]);
2282 /* Use fft5d, order after FFT is y major, z, x minor */
2283 pme->work_nalloc = pme->nkx;
2284 snew(pme->work_mhx,pme->work_nalloc);
2285 snew(pme->work_mhy,pme->work_nalloc);
2286 snew(pme->work_mhz,pme->work_nalloc);
2287 snew(pme->work_m2,pme->work_nalloc);
2288 snew(pme->work_denom,pme->work_nalloc);
2289 /* Allocate an aligned pointer for SSE operations, including 3 extra
2290 * elements at the end since SSE operates on 4 elements at a time.
2292 snew(pme->work_tmp1_alloc,pme->work_nalloc+8);
2293 pme->work_tmp1 = (real *) (((size_t) pme->work_tmp1_alloc + 16) & (~((size_t) 15)));
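    /* Example of the alignment arithmetic: if work_tmp1_alloc starts at
     * address 0x...1238, adding 16 gives 0x...1248 and clearing the low four
     * bits yields 0x...1240, the next 16-byte (SSE register width) boundary.
     * The 8 extra reals allocated above cover both this shift (at most 16
     * bytes) and the up-to-3-element overrun of the 4-wide SSE loops, in
     * single as well as double precision.
     */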
2294 snew(pme->work_m2inv,pme->work_nalloc);
2296 *pmedata = pme;
2298 return 0;
2301 static void spread_on_grid(gmx_pme_t pme,
2302 pme_atomcomm_t *atc,real *grid,
2303 bool bCalcSplines,bool bSpread)
2305 if (bCalcSplines)
2308 /* Compute fftgrid index for all atoms,
2309 * with the help of some extra variables.
2311 calc_interpolation_idx(pme,atc);
2313 /* make local bsplines */
2314 make_bsplines(atc->theta,atc->dtheta,pme->pme_order,
2315 atc->fractx,atc->n,atc->q,pme->bFEP);
2318 if (bSpread)
2320 /* put local atoms on grid. */
2321 spread_q_bsplines(pme,atc,grid);
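/* Minimal sketch of the cardinal B-spline recursion that make_bsplines()
 * evaluates for every atom and dimension (weights only; the production code
 * also fills dtheta with the derivatives and has an SSE2 variant).
 * bspline_weights() is a hypothetical stand-alone helper, for illustration
 * only.
 */
static void bspline_weights(double dr, int order, double *w)
{
    /* dr is the fractional grid coordinate in [0,1); on return w[0..order-1]
     * hold the spreading weights, which sum to 1.
     */
    int    j, k;
    double div;

    w[order-1] = 0.0;
    w[1]       = dr;
    w[0]       = 1.0 - dr;
    for (k = 3; k <= order; k++)
    {
        div    = 1.0/(k - 1);
        w[k-1] = div*dr*w[k-2];
        for (j = 1; j < k-1; j++)
        {
            w[k-1-j] = div*((dr + j)*w[k-2-j] + (k - j - dr)*w[k-1-j]);
        }
        w[0] = div*(1.0 - dr)*w[0];
    }
    /* e.g. order=4, dr=0.5 gives 1/48, 23/48, 23/48, 1/48 */
}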
2325 void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
2327 pme_atomcomm_t *atc;
2328 real *grid;
2330 if (pme->nnodes > 1)
2332 gmx_incons("gmx_pme_calc_energy called in parallel");
2334 if (pme->bFEP)
2336 gmx_incons("gmx_pme_calc_energy with free energy");
2339 atc = &pme->atc_energy;
2340 atc->nslab = 1;
2341 atc->bSpread = TRUE;
2342 atc->pme_order = pme->pme_order;
2343 atc->n = n;
2344 pme_realloc_atomcomm_things(atc);
2345 atc->x = x;
2346 atc->q = q;
2348 /* We only use the A-charges grid */
2349 grid = pme->pmegridA;
2351 spread_on_grid(pme,atc,grid,TRUE,FALSE);
2353 *V = gather_energy_bsplines(pme,grid,atc);
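/* Usage sketch (single node only, illustrative variable names): with a pme
 * handle for which gmx_pme_do() has just filled the A grid,
 *     real V;
 *     gmx_pme_calc_energy(pme, n_test, x_test, q_test, &V);
 * computes B-spline coefficients for the n_test extra charges (note that
 * spread_on_grid() is called with bSpread=FALSE, so the grid itself is left
 * untouched) and returns in V their energy gathered from that grid.
 */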
2357 static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
2358 t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
2360 /* Reset all the counters related to performance over the run */
2361 wallcycle_stop(wcycle,ewcRUN);
2362 wallcycle_reset_all(wcycle);
2363 init_nrnb(nrnb);
2364 ir->init_step += step_rel;
2365 ir->nsteps -= step_rel;
2366 wallcycle_start(wcycle,ewcRUN);
2370 int gmx_pmeonly(gmx_pme_t pme,
2371 t_commrec *cr, t_nrnb *nrnb,
2372 gmx_wallcycle_t wcycle,
2373 real ewaldcoeff, bool bGatherOnly,
2374 t_inputrec *ir)
2376 gmx_pme_pp_t pme_pp;
2377 int natoms;
2378 matrix box;
2379 rvec *x_pp=NULL,*f_pp=NULL;
2380 real *chargeA=NULL,*chargeB=NULL;
2381 real lambda=0;
2382 int maxshift0=0,maxshift1=0;
2383 real energy,dvdlambda;
2384 matrix vir;
2385 float cycles;
2386 int count;
2387 bool bEnerVir;
2388 gmx_large_int_t step,step_rel;
2391 pme_pp = gmx_pme_pp_init(cr);
2393 init_nrnb(nrnb);
2395 count = 0;
2396 do /****** this is a quasi-loop over time steps! */
2398 /* Domain decomposition */
2399 natoms = gmx_pme_recv_q_x(pme_pp,
2400 &chargeA,&chargeB,box,&x_pp,&f_pp,
2401 &maxshift0,&maxshift1,
2402 &pme->bFEP,&lambda,
2403 &bEnerVir,
2404 &step);
2406 if (natoms == -1) {
2407 /* We should stop: break out of the loop */
2408 break;
2411 step_rel = step - ir->init_step;
2413 if (count == 0)
2414 wallcycle_start(wcycle,ewcRUN);
2416 wallcycle_start(wcycle,ewcPMEMESH);
2418 dvdlambda = 0;
2419 clear_mat(vir);
2420 gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
2421 cr,maxshift0,maxshift1,nrnb,wcycle,vir,ewaldcoeff,
2422 &energy,lambda,&dvdlambda,
2423 GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
2425 cycles = wallcycle_stop(wcycle,ewcPMEMESH);
2427 gmx_pme_send_force_vir_ener(pme_pp,
2428 f_pp,vir,energy,dvdlambda,
2429 cycles);
2431 count++;
2433 if (step_rel == wcycle_get_reset_counters(wcycle))
2435 /* Reset all the counters related to performance over the run */
2436 reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
2437 wcycle_set_reset_counters(wcycle, 0);
2440 } /***** end of quasi-loop, we stop with the break above */
2441 while (TRUE);
2443 return 0;
2446 int gmx_pme_do(gmx_pme_t pme,
2447 int start, int homenr,
2448 rvec x[], rvec f[],
2449 real *chargeA, real *chargeB,
2450 matrix box, t_commrec *cr,
2451 int maxshift0, int maxshift1,
2452 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
2453 matrix vir, real ewaldcoeff,
2454 real *energy, real lambda,
2455 real *dvdlambda, int flags)
2457 int q,d,i,j,ntot,npme;
2458 int nx,ny,nz;
2459 int n_d,local_ny;
2460 int loop_count;
2461 pme_atomcomm_t *atc=NULL;
2462 real * grid=NULL;
2463 real *ptr;
2464 rvec *x_d,*f_d;
2465 real *charge=NULL,*q_d,vol;
2466 real energy_AB[2];
2467 matrix vir_AB[2];
2468 bool bClearF;
2469 gmx_parallel_3dfft_t pfft_setup;
2470 real * fftgrid;
2471 t_complex * cfftgrid;
2473 if (pme->nnodes > 1) {
2474 atc = &pme->atc[0];
2475 atc->npd = homenr;
2476 if (atc->npd > atc->pd_nalloc) {
2477 atc->pd_nalloc = over_alloc_dd(atc->npd);
2478 srenew(atc->pd,atc->pd_nalloc);
2480 atc->maxshift = (atc->dimind==0 ? maxshift0 : maxshift1);
2483 for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
2484 if (q == 0) {
2485 grid = pme->pmegridA;
2486 fftgrid = pme->fftgridA;
2487 cfftgrid = pme->cfftgridA;
2488 pfft_setup = pme->pfft_setupA;
2489 charge = chargeA+start;
2490 } else {
2491 grid = pme->pmegridB;
2492 fftgrid = pme->fftgridB;
2493 cfftgrid = pme->cfftgridB;
2494 pfft_setup = pme->pfft_setupB;
2495 charge = chargeB+start;
2497 /* Unpack structure */
2498 if (debug) {
2499 fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
2500 cr->nnodes,cr->nodeid);
2501 fprintf(debug,"Grid = %p\n",(void*)grid);
2502 if (grid == NULL)
2503 gmx_fatal(FARGS,"No grid!");
2505 where();
2507 m_inv_ur0(box,pme->recipbox);
2509 if (pme->nnodes == 1) {
2510 atc = &pme->atc[0];
2511 if (DOMAINDECOMP(cr)) {
2512 atc->n = homenr;
2513 pme_realloc_atomcomm_things(atc);
2515 atc->x = x;
2516 atc->q = charge;
2517 atc->f = f;
2518 } else {
2519 wallcycle_start(wcycle,ewcPME_REDISTXF);
2520 for(d=pme->ndecompdim-1; d>=0; d--)
2522 if (d == pme->ndecompdim-1)
2524 n_d = homenr;
2525 x_d = x + start;
2526 q_d = charge;
2528 else
2530 n_d = pme->atc[d+1].n;
2531 x_d = atc->x;
2532 q_d = atc->q;
2534 atc = &pme->atc[d];
2535 atc->npd = n_d;
2536 if (atc->npd > atc->pd_nalloc) {
2537 atc->pd_nalloc = over_alloc_dd(atc->npd);
2538 srenew(atc->pd,atc->pd_nalloc);
2540 atc->maxshift = (atc->dimind==0 ? maxshift0 : maxshift1);
2541 pme_calc_pidx(n_d,pme->recipbox,x_d,atc);
2542 where();
2544 GMX_BARRIER(cr->mpi_comm_mygroup);
2545 /* Redistribute x (only once) and qA or qB */
2546 if (DOMAINDECOMP(cr)) {
2547 dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
2548 } else {
2549 pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
2552 where();
2554 wallcycle_stop(wcycle,ewcPME_REDISTXF);
2557 if (debug)
2558 fprintf(debug,"Node= %6d, pme local particles=%6d\n",
2559 cr->nodeid,atc->n);
2561 if (flags & GMX_PME_SPREAD_Q)
2563 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
2565 /* Spread the charges on a grid */
2566 GMX_MPE_LOG(ev_spread_on_grid_start);
2568 /* Spread the charges on a grid */
2569 spread_on_grid(pme,&pme->atc[0],grid,q==0,TRUE);
2570 GMX_MPE_LOG(ev_spread_on_grid_finish);
2572 if (q == 0)
2574 inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
2576 inc_nrnb(nrnb,eNR_SPREADQBSP,
2577 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
2579 wrap_periodic_pmegrid(pme,grid);
2581 /* sum contributions to local grid from other nodes */
2582 #ifdef GMX_MPI
2583 if (pme->nnodes > 1) {
2584 GMX_BARRIER(cr->mpi_comm_mygroup);
2585 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
2586 where();
2588 #endif
2589 where();
2591 copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
2593 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
2596 if (flags & GMX_PME_SOLVE)
2598 /* do 3d-fft */
2599 GMX_BARRIER(cr->mpi_comm_mygroup);
2600 GMX_MPE_LOG(ev_gmxfft3d_start);
2601 wallcycle_start(wcycle,ewcPME_FFT);
2602 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,fftgrid,cfftgrid);
2603 wallcycle_stop(wcycle,ewcPME_FFT);
2604 GMX_MPE_LOG(ev_gmxfft3d_finish);
2605 where();
2607 /* solve in k-space for our local cells */
2608 vol = det(box);
2609 GMX_BARRIER(cr->mpi_comm_mygroup);
2610 GMX_MPE_LOG(ev_solve_pme_start);
2611 wallcycle_start(wcycle,ewcPME_SOLVE);
2612 loop_count =
2613 solve_pme_yzx(pme,cfftgrid,ewaldcoeff,vol,
2614 flags & GMX_PME_CALC_ENER_VIR,
2615 &energy_AB[q],vir_AB[q]);
2616 wallcycle_stop(wcycle,ewcPME_SOLVE);
2617 where();
2618 GMX_MPE_LOG(ev_solve_pme_finish);
2619 inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
2622 if (flags & GMX_PME_CALC_F)
2625 /* do 3d-invfft */
2626 GMX_BARRIER(cr->mpi_comm_mygroup);
2627 GMX_MPE_LOG(ev_gmxfft3d_start);
2628 where();
2629 wallcycle_start(wcycle,ewcPME_FFT);
2630 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,cfftgrid,fftgrid);
2631 wallcycle_stop(wcycle,ewcPME_FFT);
2633 where();
2634 GMX_MPE_LOG(ev_gmxfft3d_finish);
2636 if (pme->nodeid == 0)
2638 ntot = pme->nkx*pme->nky*pme->nkz;
2639 npme = ntot*log((real)ntot)/log(2.0);
2640 inc_nrnb(nrnb,eNR_FFT,2*npme);
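    /* Example: a 64x64x64 grid has ntot = 262144 points and log2(ntot) = 18,
     * so npme = 4718592 and 2*npme = 9437184 flop-count units are registered,
     * presumably covering the forward and the backward transform.
     */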
2643 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
2645 copy_fftgrid_to_pmegrid(pme,fftgrid,grid);
2647 /* distribute local grid to all nodes */
2648 #ifdef GMX_MPI
2649 if (pme->nnodes > 1) {
2650 GMX_BARRIER(cr->mpi_comm_mygroup);
2651 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
2653 #endif
2654 where();
2656 unwrap_periodic_pmegrid(pme,grid);
2658 /* interpolate forces for our local atoms */
2659 GMX_BARRIER(cr->mpi_comm_mygroup);
2660 GMX_MPE_LOG(ev_gather_f_bsplines_start);
2662 where();
2664 /* If we are running without parallelization,
2665 * atc->f is the actual force array, not a buffer,
2666 * therefore we should not clear it.
2668 bClearF = (q == 0 && PAR(cr));
2669 gather_f_bsplines(pme,grid,bClearF,&pme->atc[0],
2670 pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
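    /* With free energy, e.g. lambda = 0.25, the forces gathered from the
     * A grid enter with weight 0.75 and those from the B grid with weight
     * 0.25, matching the (1-lambda)/lambda mixing of the energies and the
     * virial further down.
     */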
2671 where();
2673 GMX_MPE_LOG(ev_gather_f_bsplines_finish);
2675 inc_nrnb(nrnb,eNR_GATHERFBSP,
2676 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
2677 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
2679 } /* of q-loop */
2681 if ((flags & GMX_PME_CALC_F) && pme->nnodes > 1) {
2682 wallcycle_start(wcycle,ewcPME_REDISTXF);
2683 for(d=0; d<pme->ndecompdim; d++)
2685 atc = &pme->atc[d];
2686 if (d == pme->ndecompdim - 1)
2688 n_d = homenr;
2689 f_d = f + start;
2691 else
2693 n_d = pme->atc[d+1].n;
2694 f_d = pme->atc[d+1].f;
2696 GMX_BARRIER(cr->mpi_comm_mygroup);
2697 if (DOMAINDECOMP(cr)) {
2698 dd_pmeredist_f(pme,atc,n_d,f_d,
2699 d==pme->ndecompdim-1 && pme->bPPnode);
2700 } else {
2701 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
2705 wallcycle_stop(wcycle,ewcPME_REDISTXF);
2707 where();
2709 if (!pme->bFEP) {
2710 *energy = energy_AB[0];
2711 m_add(vir,vir_AB[0],vir);
2712 } else {
2713 *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
2714 *dvdlambda += energy_AB[1] - energy_AB[0];
2715 for(i=0; i<DIM; i++)
2716 for(j=0; j<DIM; j++)
2717 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] + lambda*vir_AB[1][i][j];
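    /* Worked example: lambda = 0.25, energy_AB[0] = -100, energy_AB[1] = -90
     * (arbitrary units) gives *energy = 0.75*(-100) + 0.25*(-90) = -97.5 and
     * a dV/dlambda contribution of -90 - (-100) = 10.
     */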
2720 if (debug)
2721 fprintf(debug,"PME mesh energy: %g\n",*energy);
2723 return 0;