1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 *
4 * This source code is part of
5 *
6 * G R O M A C S
7 *
8 * GROningen MAchine for Chemical Simulations
9 *
10 * VERSION 3.2.0
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
33 * And Hey:
34 * GROwing Monsters And Cloning Shrimps
36 /* IMPORTANT FOR DEVELOPERS:
38 * Triclinic pme stuff isn't entirely trivial, and we've experienced
39 * some bugs during development (many of them due to me). To avoid
40 * this in the future, please check the following things if you make
41 * changes in this file:
43 * 1. You should obtain identical (at least to the PME precision)
44 * energies, forces, and virial for
45 * a rectangular box and a triclinic one where the z (or y) axis is
46 * tilted a whole box side. For instance you could use these boxes:
48 * rectangular triclinic
49 * 2 0 0 2 0 0
50 * 0 2 0 0 2 0
51 * 0 0 6 2 2 6
53 * 2. You should check the energy conservation in a triclinic box.
55 * It might seem like overkill, but better safe than sorry.
56 * /Erik 001109
57 */
59 #ifdef HAVE_CONFIG_H
60 #include <config.h>
61 #endif
63 #ifdef GMX_LIB_MPI
64 #include <mpi.h>
65 #endif
66 #ifdef GMX_THREADS
67 #include "tmpi.h"
68 #endif
71 #include <stdio.h>
72 #include <string.h>
73 #include <math.h>
74 #include "typedefs.h"
75 #include "txtdump.h"
76 #include "vec.h"
77 #include "gmxcomplex.h"
78 #include "smalloc.h"
79 #include "futil.h"
80 #include "coulomb.h"
81 #include "gmx_fatal.h"
82 #include "pme.h"
83 #include "network.h"
84 #include "physics.h"
85 #include "nrnb.h"
86 #include "copyrite.h"
87 #include "gmx_wallcycle.h"
88 #include "gmx_parallel_3dfft.h"
89 #include "pdbio.h"
91 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
92 #include "gmx_sse2_single.h"
93 #endif
95 #include "mpelogging.h"
97 #define DFT_TOL 1e-7
98 /* #define PRT_FORCE */
99 /* conditions for on the fly time-measurement */
100 /* #define TAKETIME (step > 1 && timesteps < 10) */
101 #define TAKETIME FALSE
103 #ifdef GMX_DOUBLE
104 #define mpi_type MPI_DOUBLE
105 #else
106 #define mpi_type MPI_FLOAT
107 #endif
109 /* Internal datastructures */
110 typedef struct {
111 int send_index0;
112 int send_nindex;
113 int recv_index0;
114 int recv_nindex;
115 } pme_grid_comm_t;
117 typedef struct {
118 #ifdef GMX_MPI
119 MPI_Comm mpi_comm;
120 #endif
121 int nnodes,nodeid;
122 int ndata;
123 int *s2g0;
124 int *s2g1;
125 int noverlap_nodes;
126 int *send_id,*recv_id;
127 pme_grid_comm_t *comm_data;
128 } pme_overlap_t;
130 typedef struct {
131 int dimind; /* The index of the dimension, 0=x, 1=y */
132 int nslab;
133 int nodeid;
134 #ifdef GMX_MPI
135 MPI_Comm mpi_comm;
136 #endif
138 int *node_dest; /* The nodes to send x and q to with DD */
139 int *node_src; /* The nodes to receive x and q from with DD */
140 int *buf_index; /* Index for commnode into the buffers */
142 int maxshift;
144 int npd;
145 int pd_nalloc;
146 int *pd;
147 int *count; /* The number of atoms to send to each node */
148 int *rcount; /* The number of atoms to receive */
150 int n;
151 int nalloc;
152 rvec *x;
153 real *q;
154 rvec *f;
155 bool bSpread; /* These coordinates are used for spreading */
156 int pme_order;
157 splinevec theta,dtheta;
158 ivec *idx;
159 rvec *fractx; /* Fractional coordinate relative to the
160 * lower cell boundary
162 } pme_atomcomm_t;
164 typedef struct gmx_pme {
165 int ndecompdim; /* The number of decomposition dimensions */
166 int nodeid; /* Our nodeid in mpi->mpi_comm */
167 int nodeid_major;
168 int nodeid_minor;
169 int nnodes; /* The number of nodes doing PME */
170 int nnodes_major;
171 int nnodes_minor;
173 MPI_Comm mpi_comm;
174 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
175 #ifdef GMX_MPI
176 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
177 #endif
179 bool bPPnode; /* Node also does particle-particle forces */
180 bool bFEP; /* Compute Free energy contribution */
181 int nkx,nky,nkz; /* Grid dimensions */
182 int pme_order;
183 real epsilon_r;
185 real * pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
186 real * pmegridB;
187 int pmegrid_nx,pmegrid_ny,pmegrid_nz;
188 int pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
190 real * pmegrid_sendbuf;
191 real * pmegrid_recvbuf;
193 real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
194 real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
195 int fftgrid_nx,fftgrid_ny,fftgrid_nz;
197 t_complex *cfftgridA; /* Grids for complex FFT data */
198 t_complex *cfftgridB;
199 int cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
201 gmx_parallel_3dfft_t pfft_setupA;
202 gmx_parallel_3dfft_t pfft_setupB;
204 int *nnx,*nny,*nnz;
205 real *fshx,*fshy,*fshz;
207 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
208 matrix recipbox;
209 splinevec bsp_mod;
211 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
214 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
216 rvec *bufv; /* Communication buffer */
217 real *bufr; /* Communication buffer */
218 int buf_nalloc; /* The communication buffer size */
220 /* work data for solve_pme */
221 int work_nalloc;
222 real * work_mhx;
223 real * work_mhy;
224 real * work_mhz;
225 real * work_m2;
226 real * work_denom;
227 real * work_tmp1_alloc;
228 real * work_tmp1;
229 real * work_m2inv;
231 /* Work data for PME_redist */
232 bool redist_init;
233 int * scounts;
234 int * rcounts;
235 int * sdispls;
236 int * rdispls;
237 int * sidx;
238 int * idxa;
239 real * redist_buf;
240 int redist_buf_nalloc;
242 /* Work data for sum_qgrid */
243 real * sum_qgrid_tmp;
244 real * sum_qgrid_dd_tmp;
245 } t_gmx_pme;
248 static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc)
250 int i;
251 int *idxptr,tix,tiy,tiz;
252 real *xptr,*fptr,tx,ty,tz;
253 real rxx,ryx,ryy,rzx,rzy,rzz;
254 int nx,ny,nz;
255 int start_ix,start_iy,start_iz;
257 nx = pme->nkx;
258 ny = pme->nky;
259 nz = pme->nkz;
261 start_ix = pme->pmegrid_start_ix;
262 start_iy = pme->pmegrid_start_iy;
263 start_iz = pme->pmegrid_start_iz;
265 rxx = pme->recipbox[XX][XX];
266 ryx = pme->recipbox[YY][XX];
267 ryy = pme->recipbox[YY][YY];
268 rzx = pme->recipbox[ZZ][XX];
269 rzy = pme->recipbox[ZZ][YY];
270 rzz = pme->recipbox[ZZ][ZZ];
272 for(i=0; (i<atc->n); i++) {
273 xptr = atc->x[i];
274 idxptr = atc->idx[i];
275 fptr = atc->fractx[i];
277 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
278 tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
279 ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
280 tz = nz * ( xptr[ZZ] * rzz + 2.0 );
282 tix = (int)(tx);
283 tiy = (int)(ty);
284 tiz = (int)(tz);
286 /* Because decomposition only occurs in x and y,
287 * we never have a fraction correction in z.
289 fptr[XX] = tx - tix + pme->fshx[tix];
290 fptr[YY] = ty - tiy + pme->fshy[tiy];
291 fptr[ZZ] = tz - tiz;
293 idxptr[XX] = pme->nnx[tix];
294 idxptr[YY] = pme->nny[tiy];
295 idxptr[ZZ] = pme->nnz[tiz];
297 #ifdef DEBUG
298 range_check(idxptr[XX],0,pme->pmegrid_nx);
299 range_check(idxptr[YY],0,pme->pmegrid_ny);
300 range_check(idxptr[ZZ],0,pme->pmegrid_nz);
301 #endif
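/* Illustrative sketch (not part of the original source): the arithmetic used
 * above in calc_interpolation_idx for one particle and the x grid direction.
 * recip is assumed to hold the reciprocal of a lower-triangular GROMACS box
 * (filled in elsewhere by the caller); the function and its names are
 * hypothetical and only make the integer-index/fraction split explicit.
 */
static void example_grid_index_x(rvec x, matrix recip, int nx,
                                 int *index, real *fraction)
{
    /* Fractional coordinate along the first box vector, times nx; the +2.0
     * keeps it positive for triclinic boxes, as in calc_interpolation_idx. */
    real tx = nx*(x[XX]*recip[XX][XX] + x[YY]*recip[YY][XX] + x[ZZ]*recip[ZZ][XX] + 2.0);
    int  ti = (int)tx;

    *index    = ti % nx;    /* the real code uses the precomputed nnx[] table */
    *fraction = tx - ti;    /* plus the fshx[] shift that corrects slab rounding */
}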
305 static void pme_calc_pidx(int natoms, matrix recipbox, rvec x[],
306 pme_atomcomm_t *atc)
308 int nslab,i;
309 int si;
310 real *xptr,s;
311 real rxx,ryx,rzx,ryy,rzy;
312 int *pd,*count;
314 /* Calculate PME task index (pidx) for each grid index.
315 * Here we always assign equally sized slabs to each node
316 * for load balancing reasons (the PME grid spacing is not used).
319 nslab = atc->nslab;
320 pd = atc->pd;
321 count = atc->count;
323 /* Reset the count */
324 for(i=0; i<nslab; i++)
326 count[i] = 0;
329 if (atc->dimind == 0)
331 rxx = recipbox[XX][XX];
332 ryx = recipbox[YY][XX];
333 rzx = recipbox[ZZ][XX];
334 /* Calculate the node index in x-dimension */
335 for(i=0; (i<natoms); i++)
337 xptr = x[i];
338 /* Fractional coordinates along box vectors */
339 s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
340 si = (int)(s + 2*nslab) % nslab;
341 pd[i] = si;
342 count[si]++;
345 else
347 ryy = recipbox[YY][YY];
348 rzy = recipbox[ZZ][YY];
349 /* Calculate the node index in y-dimension */
350 for(i=0; (i<natoms); i++)
352 xptr = x[i];
353 /* Fractional coordinates along box vectors */
354 s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
355 si = (int)(s + 2*nslab) % nslab;
356 pd[i] = si;
357 count[si]++;
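/* Illustrative sketch (not part of the original source): why pme_calc_pidx
 * adds 2*nslab before taking the modulo. The fractional slab coordinate can be
 * slightly negative (or slightly above nslab) for atoms just outside the unit
 * box, and C's % operator is only well behaved for non-negative arguments, so
 * the shift guarantees a slab index in [0,nslab).
 */
static int example_slab_index(real s, int nslab)
{
    /* s is nslab times the fractional coordinate along the decomposed
     * dimension, as computed in pme_calc_pidx above */
    return (int)(s + 2*nslab) % nslab;
}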
362 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
364 int nalloc_old,i;
366 if (atc->n > atc->nalloc) {
367 nalloc_old = atc->nalloc;
368 atc->nalloc = over_alloc_dd(atc->n);
370 if (atc->nslab > 1) {
371 srenew(atc->x,atc->nalloc);
372 srenew(atc->q,atc->nalloc);
373 srenew(atc->f,atc->nalloc);
374 for(i=nalloc_old; i<atc->nalloc; i++)
376 clear_rvec(atc->f[i]);
379 if (atc->bSpread) {
380 for(i=0;i<DIM;i++) {
381 srenew(atc->theta[i] ,atc->pme_order*atc->nalloc);
382 srenew(atc->dtheta[i],atc->pme_order*atc->nalloc);
384 srenew(atc->fractx,atc->nalloc);
385 srenew(atc->idx ,atc->nalloc);
390 static void pmeredist_pd(gmx_pme_t pme, bool forw,
391 int n, bool bXF, rvec *x_f, real *charge,
392 pme_atomcomm_t *atc)
393 /* Redistribute particle data for PME calculation */
394 /* domain decomposition by x coordinate */
396 int *idxa;
397 int i, ii;
399 if(FALSE == pme->redist_init) {
400 snew(pme->scounts,atc->nslab);
401 snew(pme->rcounts,atc->nslab);
402 snew(pme->sdispls,atc->nslab);
403 snew(pme->rdispls,atc->nslab);
404 snew(pme->sidx,atc->nslab);
405 pme->redist_init = TRUE;
407 if (n > pme->redist_buf_nalloc) {
408 pme->redist_buf_nalloc = over_alloc_dd(n);
409 srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
412 pme->idxa = atc->pd;
414 #ifdef GMX_MPI
415 if (forw && bXF) {
416 /* forward, redistribution from pp to pme */
418 /* Calculate send counts and exchange them with other nodes */
419 for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
420 for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
421 MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
423 /* Calculate send and receive displacements and index into send
424 buffer */
425 pme->sdispls[0]=0;
426 pme->rdispls[0]=0;
427 pme->sidx[0]=0;
428 for(i=1; i<atc->nslab; i++) {
429 pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
430 pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
431 pme->sidx[i]=pme->sdispls[i];
433 /* Total # of particles to be received */
434 atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
436 pme_realloc_atomcomm_things(atc);
438 /* Copy particle coordinates into send buffer and exchange*/
439 for(i=0; (i<n); i++) {
440 ii=DIM*pme->sidx[pme->idxa[i]];
441 pme->sidx[pme->idxa[i]]++;
442 pme->redist_buf[ii+XX]=x_f[i][XX];
443 pme->redist_buf[ii+YY]=x_f[i][YY];
444 pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
446 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
447 pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
448 pme->rvec_mpi, atc->mpi_comm);
450 if (forw) {
451 /* Copy charge into send buffer and exchange*/
452 for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
453 for(i=0; (i<n); i++) {
454 ii=pme->sidx[pme->idxa[i]];
455 pme->sidx[pme->idxa[i]]++;
456 pme->redist_buf[ii]=charge[i];
458 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
459 atc->q, pme->rcounts, pme->rdispls, mpi_type,
460 atc->mpi_comm);
462 else { /* backward, redistribution from pme to pp */
463 MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
464 pme->redist_buf, pme->scounts, pme->sdispls,
465 pme->rvec_mpi, atc->mpi_comm);
467 /* Copy data from receive buffer */
468 for(i=0; i<atc->nslab; i++)
469 pme->sidx[i] = pme->sdispls[i];
470 for(i=0; (i<n); i++) {
471 ii = DIM*pme->sidx[pme->idxa[i]];
472 x_f[i][XX] += pme->redist_buf[ii+XX];
473 x_f[i][YY] += pme->redist_buf[ii+YY];
474 x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
475 pme->sidx[pme->idxa[i]]++;
478 #endif
481 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
482 bool bBackward,int shift,
483 void *buf_s,int nbyte_s,
484 void *buf_r,int nbyte_r)
486 #ifdef GMX_MPI
487 int dest,src;
488 MPI_Status stat;
490 if (bBackward == FALSE) {
491 dest = atc->node_dest[shift];
492 src = atc->node_src[shift];
493 } else {
494 dest = atc->node_src[shift];
495 src = atc->node_dest[shift];
498 if (nbyte_s > 0 && nbyte_r > 0) {
499 MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
500 dest,shift,
501 buf_r,nbyte_r,MPI_BYTE,
502 src,shift,
503 atc->mpi_comm,&stat);
504 } else if (nbyte_s > 0) {
505 MPI_Send(buf_s,nbyte_s,MPI_BYTE,
506 dest,shift,
507 atc->mpi_comm);
508 } else if (nbyte_r > 0) {
509 MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
510 src,shift,
511 atc->mpi_comm,&stat);
513 #endif
516 static void dd_pmeredist_x_q(gmx_pme_t pme,
517 int n, bool bX, rvec *x, real *charge,
518 pme_atomcomm_t *atc)
520 int *commnode,*buf_index;
521 int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
523 commnode = atc->node_dest;
524 buf_index = atc->buf_index;
526 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
528 nsend = 0;
529 for(i=0; i<nnodes_comm; i++) {
530 buf_index[commnode[i]] = nsend;
531 nsend += atc->count[commnode[i]];
533 if (bX) {
534 if (atc->count[atc->nodeid] + nsend != n)
535 gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
536 "This usually means that your system is not well equilibrated.",
537 n - (atc->count[atc->nodeid] + nsend),
538 pme->nodeid,'x'+atc->dimind);
540 if (nsend > pme->buf_nalloc) {
541 pme->buf_nalloc = over_alloc_dd(nsend);
542 srenew(pme->bufv,pme->buf_nalloc);
543 srenew(pme->bufr,pme->buf_nalloc);
546 atc->n = atc->count[atc->nodeid];
547 for(i=0; i<nnodes_comm; i++) {
548 scount = atc->count[commnode[i]];
549 /* Communicate the count */
550 if (debug)
551 fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
552 atc->dimind,atc->nodeid,commnode[i],scount);
553 pme_dd_sendrecv(atc,FALSE,i,
554 &scount,sizeof(int),
555 &atc->rcount[i],sizeof(int));
556 atc->n += atc->rcount[i];
559 pme_realloc_atomcomm_things(atc);
562 local_pos = 0;
563 for(i=0; i<n; i++) {
564 node = atc->pd[i];
565 if (node == atc->nodeid) {
566 /* Copy direct to the receive buffer */
567 if (bX) {
568 copy_rvec(x[i],atc->x[local_pos]);
570 atc->q[local_pos] = charge[i];
571 local_pos++;
572 } else {
573 /* Copy to the send buffer */
574 if (bX) {
575 copy_rvec(x[i],pme->bufv[buf_index[node]]);
577 pme->bufr[buf_index[node]] = charge[i];
578 buf_index[node]++;
582 buf_pos = 0;
583 for(i=0; i<nnodes_comm; i++) {
584 scount = atc->count[commnode[i]];
585 rcount = atc->rcount[i];
586 if (scount > 0 || rcount > 0) {
587 if (bX) {
588 /* Communicate the coordinates */
589 pme_dd_sendrecv(atc,FALSE,i,
590 pme->bufv[buf_pos],scount*sizeof(rvec),
591 atc->x[local_pos],rcount*sizeof(rvec));
593 /* Communicate the charges */
594 pme_dd_sendrecv(atc,FALSE,i,
595 pme->bufr+buf_pos,scount*sizeof(real),
596 atc->q+local_pos,rcount*sizeof(real));
597 buf_pos += scount;
598 local_pos += atc->rcount[i];
603 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
604 int n, rvec *f,
605 bool bAddF)
607 int *commnode,*buf_index;
608 int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node;
610 commnode = atc->node_dest;
611 buf_index = atc->buf_index;
613 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
615 local_pos = atc->count[atc->nodeid];
616 buf_pos = 0;
617 for(i=0; i<nnodes_comm; i++) {
618 scount = atc->rcount[i];
619 rcount = atc->count[commnode[i]];
620 if (scount > 0 || rcount > 0) {
621 /* Communicate the forces */
622 pme_dd_sendrecv(atc,TRUE,i,
623 atc->f[local_pos],scount*sizeof(rvec),
624 pme->bufv[buf_pos],rcount*sizeof(rvec));
625 local_pos += scount;
627 buf_index[commnode[i]] = buf_pos;
628 buf_pos += rcount;
631 local_pos = 0;
632 if (bAddF)
634 for(i=0; i<n; i++)
636 node = atc->pd[i];
637 if (node == atc->nodeid)
639 /* Add from the local force array */
640 rvec_inc(f[i],atc->f[local_pos]);
641 local_pos++;
643 else
645 /* Add from the receive buffer */
646 rvec_inc(f[i],pme->bufv[buf_index[node]]);
647 buf_index[node]++;
651 else
653 for(i=0; i<n; i++)
655 node = atc->pd[i];
656 if (node == atc->nodeid)
658 /* Copy from the local force array */
659 copy_rvec(atc->f[local_pos],f[i]);
660 local_pos++;
662 else
664 /* Copy from the receive buffer */
665 copy_rvec(pme->bufv[buf_index[node]],f[i]);
666 buf_index[node]++;
672 #ifdef GMX_MPI
673 static void
674 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
676 pme_overlap_t *overlap;
677 int send_index0,send_nindex;
678 int recv_index0,recv_nindex;
679 MPI_Status stat;
680 int i,j,k,ix,iy,iz,icnt;
681 int ipulse,send_id,recv_id,datasize;
682 real *p;
683 real *sendptr,*recvptr;
685 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
686 overlap = &pme->overlap[1];
688 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
690 /* Since we have already (un)wrapped the overlap in the z-dimension,
691 * we only have to communicate 0 to nkz (not pmegrid_nz).
693 if (direction==GMX_SUM_QGRID_FORWARD)
695 send_id = overlap->send_id[ipulse];
696 recv_id = overlap->recv_id[ipulse];
697 send_index0 = overlap->comm_data[ipulse].send_index0;
698 send_nindex = overlap->comm_data[ipulse].send_nindex;
699 recv_index0 = overlap->comm_data[ipulse].recv_index0;
700 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
702 else
704 send_id = overlap->recv_id[ipulse];
705 recv_id = overlap->send_id[ipulse];
706 send_index0 = overlap->comm_data[ipulse].recv_index0;
707 send_nindex = overlap->comm_data[ipulse].recv_nindex;
708 recv_index0 = overlap->comm_data[ipulse].send_index0;
709 recv_nindex = overlap->comm_data[ipulse].send_nindex;
712 /* Copy data to contiguous send buffer */
713 if (debug)
715 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
716 pme->nodeid,overlap->nodeid,send_id,
717 pme->pmegrid_start_iy,
718 send_index0-pme->pmegrid_start_iy,
719 send_index0-pme->pmegrid_start_iy+send_nindex);
721 icnt = 0;
722 for(i=0;i<pme->pmegrid_nx;i++)
724 ix = i;
725 for(j=0;j<send_nindex;j++)
727 iy = j + send_index0 - pme->pmegrid_start_iy;
728 for(k=0;k<pme->nkz;k++)
730 iz = k;
731 pme->pmegrid_sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
736 datasize = pme->pmegrid_nx * pme->nkz;
738 MPI_Sendrecv(pme->pmegrid_sendbuf,send_nindex*datasize,GMX_MPI_REAL,
739 send_id,ipulse,
740 pme->pmegrid_recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
741 recv_id,ipulse,
742 overlap->mpi_comm,&stat);
744 /* Get data from contiguous recv buffer */
745 if (debug)
747 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
748 pme->nodeid,overlap->nodeid,recv_id,
749 pme->pmegrid_start_iy,
750 recv_index0-pme->pmegrid_start_iy,
751 recv_index0-pme->pmegrid_start_iy+recv_nindex);
753 icnt = 0;
754 for(i=0;i<pme->pmegrid_nx;i++)
756 ix = i;
757 for(j=0;j<recv_nindex;j++)
759 iy = j + recv_index0 - pme->pmegrid_start_iy;
760 for(k=0;k<pme->nkz;k++)
762 iz = k;
763 if(direction==GMX_SUM_QGRID_FORWARD)
765 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += pme->pmegrid_recvbuf[icnt++];
767 else
769 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = pme->pmegrid_recvbuf[icnt++];
776 /* Major dimension is easier, no copying required,
777 * but we might have to sum to separate array.
778 * Since we don't copy, we have to communicate up to pmegrid_nz,
779 * not nkz as for the minor direction.
781 overlap = &pme->overlap[0];
783 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
785 if(direction==GMX_SUM_QGRID_FORWARD)
787 send_id = overlap->send_id[ipulse];
788 recv_id = overlap->recv_id[ipulse];
789 send_index0 = overlap->comm_data[ipulse].send_index0;
790 send_nindex = overlap->comm_data[ipulse].send_nindex;
791 recv_index0 = overlap->comm_data[ipulse].recv_index0;
792 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
793 recvptr = pme->pmegrid_recvbuf;
795 else
797 send_id = overlap->recv_id[ipulse];
798 recv_id = overlap->send_id[ipulse];
799 send_index0 = overlap->comm_data[ipulse].recv_index0;
800 send_nindex = overlap->comm_data[ipulse].recv_nindex;
801 recv_index0 = overlap->comm_data[ipulse].send_index0;
802 recv_nindex = overlap->comm_data[ipulse].send_nindex;
803 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
806 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
807 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
809 if (debug)
811 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
812 pme->nodeid,overlap->nodeid,send_id,
813 pme->pmegrid_start_ix,
814 send_index0-pme->pmegrid_start_ix,
815 send_index0-pme->pmegrid_start_ix+send_nindex);
816 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
817 pme->nodeid,overlap->nodeid,recv_id,
818 pme->pmegrid_start_ix,
819 recv_index0-pme->pmegrid_start_ix,
820 recv_index0-pme->pmegrid_start_ix+recv_nindex);
823 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
824 send_id,ipulse,
825 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
826 recv_id,ipulse,
827 overlap->mpi_comm,&stat);
829 /* ADD data from contiguous recv buffer */
830 if(direction==GMX_SUM_QGRID_FORWARD)
832 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
833 for(i=0;i<recv_nindex*datasize;i++)
835 p[i] += pme->pmegrid_recvbuf[i];
840 #endif
843 static int
844 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
846 ivec local_fft_ndata,local_fft_offset,local_fft_size;
847 ivec local_pme_size;
848 int i,ix,iy,iz;
849 int pmeidx,fftidx;
851 /* Dimensions should be identical for A/B grid, so we just use A here */
852 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
853 local_fft_ndata,
854 local_fft_offset,
855 local_fft_size);
857 local_pme_size[0] = pme->pmegrid_nx;
858 local_pme_size[1] = pme->pmegrid_ny;
859 local_pme_size[2] = pme->pmegrid_nz;
861 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
862 the offset is identical, and the PME grid always has more data (due to overlap)
865 #ifdef DEBUG_PME
866 FILE *fp,*fp2;
867 char fn[STRLEN],format[STRLEN];
868 real val;
869 sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
870 fp = ffopen(fn,"w");
871 sprintf(fn,"pmegrid%d.txt",pme->nodeid);
872 fp2 = ffopen(fn,"w");
873 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
874 #endif
875 for(ix=0;ix<local_fft_ndata[XX];ix++)
877 for(iy=0;iy<local_fft_ndata[YY];iy++)
879 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
881 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
882 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
883 fftgrid[fftidx] = pmegrid[pmeidx];
884 #ifdef DEBUG_PME
885 val = 100*pmegrid[pmeidx];
886 if (pmegrid[pmeidx] != 0)
887 fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
888 5.0*ix,5.0*iy,5.0*iz,1.0,val);
889 if (pmegrid[pmeidx] != 0)
890 fprintf(fp2,"%-12s %5d %5d %5d %12.5e\n",
891 "qgrid",
892 pme->pmegrid_start_ix + ix,
893 pme->pmegrid_start_iy + iy,
894 pme->pmegrid_start_iz + iz,
895 pmegrid[pmeidx]);
896 #endif
900 #ifdef DEBUG_PME
901 fclose(fp);
902 fclose(fp2);
903 #endif
905 return 0;
909 static int
910 copy_fftgrid_to_pmegrid(gmx_pme_t pme, real *fftgrid, real *pmegrid)
912 ivec local_fft_ndata,local_fft_offset,local_fft_size;
913 ivec local_pme_size;
914 int i,ix,iy,iz;
915 int pmeidx,fftidx;
917 /* Dimensions should be identical for A/B grid, so we just use A here */
918 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
919 local_fft_ndata,
920 local_fft_offset,
921 local_fft_size);
923 local_pme_size[0] = pme->pmegrid_nx;
924 local_pme_size[1] = pme->pmegrid_ny;
925 local_pme_size[2] = pme->pmegrid_nz;
927 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
928 the offset is identical, and the PME grid always has more data (due to overlap)
930 for(ix=0;ix<local_fft_ndata[XX];ix++)
932 for(iy=0;iy<local_fft_ndata[YY];iy++)
934 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
936 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
937 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
938 pmegrid[pmeidx] = fftgrid[fftidx];
942 return 0;
946 static void
947 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
949 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
951 nx = pme->nkx;
952 ny = pme->nky;
953 nz = pme->nkz;
955 pnx = pme->pmegrid_nx;
956 pny = pme->pmegrid_ny;
957 pnz = pme->pmegrid_nz;
959 overlap = pme->pme_order - 1;
961 /* Add periodic overlap in z */
962 for(ix=0; ix<pnx; ix++)
964 for(iy=0; iy<pny; iy++)
966 for(iz=0; iz<overlap; iz++)
968 pmegrid[(ix*pny+iy)*pnz+iz] +=
969 pmegrid[(ix*pny+iy)*pnz+nz+iz];
974 if (pme->nnodes_minor == 1)
976 for(ix=0; ix<pnx; ix++)
978 for(iy=0; iy<overlap; iy++)
980 for(iz=0; iz<nz; iz++)
982 pmegrid[(ix*pny+iy)*pnz+iz] +=
983 pmegrid[(ix*pny+ny+iy)*pnz+iz];
989 if (pme->nnodes_major == 1)
991 ny_x = (pme->nnodes_minor == 1 ? ny : pny);
993 for(ix=0; ix<overlap; ix++)
995 for(iy=0; iy<ny_x; iy++)
997 for(iz=0; iz<nz; iz++)
999 pmegrid[(ix*pny+iy)*pnz+iz] +=
1000 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1008 static void
1009 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1011 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
1013 nx = pme->nkx;
1014 ny = pme->nky;
1015 nz = pme->nkz;
1017 pnx = pme->pmegrid_nx;
1018 pny = pme->pmegrid_ny;
1019 pnz = pme->pmegrid_nz;
1021 overlap = pme->pme_order - 1;
1023 if (pme->nnodes_major == 1)
1025 ny_x = (pme->nnodes_minor == 1 ? ny : pny);
1027 for(ix=0; ix<overlap; ix++)
1029 for(iy=0; iy<ny_x; iy++)
1031 for(iz=0; iz<nz; iz++)
1033 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1034 pmegrid[(ix*pny+iy)*pnz+iz];
1040 if (pme->nnodes_minor == 1)
1042 for(ix=0; ix<pnx; ix++)
1044 for(iy=0; iy<overlap; iy++)
1046 for(iz=0; iz<nz; iz++)
1048 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1049 pmegrid[(ix*pny+iy)*pnz+iz];
1055 /* Copy periodic overlap in z */
1056 for(ix=0; ix<pnx; ix++)
1058 for(iy=0; iy<pny; iy++)
1060 for(iz=0; iz<overlap; iz++)
1062 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1063 pmegrid[(ix*pny+iy)*pnz+iz];
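/* Illustrative sketch (not part of the original source): the wrap/unwrap pair
 * above reduced to one dimension. A charge spread with pme_order points near
 * the upper edge writes into the (pme_order-1) extra overlap cells beyond nz;
 * wrapping folds those contributions back onto the first cells, and unwrapping
 * copies them out again before force interpolation. The function name and the
 * flat "line" array are hypothetical.
 */
static void example_wrap_unwrap_1d(real *line, int n, int overlap)
{
    int i;

    for (i = 0; i < overlap; i++)
    {
        line[i] += line[n + i];    /* wrap: fold the overlap cells back     */
    }
    for (i = 0; i < overlap; i++)
    {
        line[n + i] = line[i];     /* unwrap: duplicate them again          */
    }
}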
1070 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1071 #define DO_BSPLINE(order) \
1072 for(ithx=0; (ithx<order); ithx++) \
1074 index_x = (i0+ithx)*pny*pnz; \
1075 valx = qn*thx[ithx]; \
1077 for(ithy=0; (ithy<order); ithy++) \
1079 valxy = valx*thy[ithy]; \
1080 index_xy = index_x+(j0+ithy)*pnz; \
1082 for(ithz=0; (ithz<order); ithz++) \
1084 index_xyz = index_xy+(k0+ithz); \
1085 grid[index_xyz] += valxy*thz[ithz]; \
1091 static void spread_q_bsplines(gmx_pme_t pme, pme_atomcomm_t *atc,
1092 real *grid)
1095 /* spread charges from home atoms to local grid */
1096 pme_overlap_t *ol;
1097 int b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
1098 int * idxptr;
1099 int order,norder,index_x,index_xy,index_xyz;
1100 real valx,valxy,qn;
1101 real *thx,*thy,*thz;
1102 int localsize, bndsize;
1104 int pnx,pny,pnz,ndatatot;
1106 pnx = pme->pmegrid_nx;
1107 pny = pme->pmegrid_ny;
1108 pnz = pme->pmegrid_nz;
1109 ndatatot = pnx*pny*pnz;
1111 for(i=0;i<ndatatot;i++)
1113 grid[i] = 0;
1116 order = pme->pme_order;
1118 for(nn=0; (nn<atc->n);nn++)
1120 n = nn;
1121 qn = atc->q[n];
1123 if (qn != 0)
1125 idxptr = atc->idx[n];
1126 norder = n*order;
1128 i0 = idxptr[XX];
1129 j0 = idxptr[YY];
1130 k0 = idxptr[ZZ];
1131 thx = atc->theta[XX] + norder;
1132 thy = atc->theta[YY] + norder;
1133 thz = atc->theta[ZZ] + norder;
1135 switch (order) {
1136 case 4: DO_BSPLINE(4); break;
1137 case 5: DO_BSPLINE(5); break;
1138 default: DO_BSPLINE(order); break;
1145 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
1146 /* Calculate exponentials through SSE in float precision */
1147 #define CALC_EXPONENTIALS(start,end,r_aligned) \
1149 __m128 tmp_sse; \
1150 for(kx=0; kx<end; kx+=4) \
1152 tmp_sse = _mm_load_ps(r_aligned+kx); \
1153 tmp_sse = gmx_mm_exp_ps_lbc(tmp_sse); \
1154 _mm_store_ps(r_aligned+kx,tmp_sse); \
1157 #else
1158 #define CALC_EXPONENTIALS(start,end,r) \
1159 for(kx=start; kx<end; kx++) \
1161 r[kx] = exp(r[kx]); \
1163 #endif
1166 static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
1167 real ewaldcoeff,real vol,
1168 bool bEnerVir,real *mesh_energy,matrix vir)
1170 /* do recip sum over local cells in grid */
1171 /* y major, z middle, x minor or continuous */
1172 t_complex *p0;
1173 int kx,ky,kz,maxkx,maxky,maxkz;
1174 int nx,ny,nz,iy,iz,kxstart,kxend;
1175 real mx,my,mz;
1176 real factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1177 real ets2,struct2,vfactor,ets2vf;
1178 real eterm,d1,d2,energy=0;
1179 real by,bz;
1180 real virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
1181 real rxx,ryx,ryy,rzx,rzy,rzz;
1182 real *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*m2inv;
1183 real mhxk,mhyk,mhzk,m2k;
1184 real corner_fac;
1185 ivec complex_order;
1186 ivec local_ndata,local_offset,local_size;
1188 nx = pme->nkx;
1189 ny = pme->nky;
1190 nz = pme->nkz;
1192 /* Dimensions should be identical for A/B grid, so we just use A here */
1193 gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
1194 complex_order,
1195 local_ndata,
1196 local_offset,
1197 local_size);
1199 rxx = pme->recipbox[XX][XX];
1200 ryx = pme->recipbox[YY][XX];
1201 ryy = pme->recipbox[YY][YY];
1202 rzx = pme->recipbox[ZZ][XX];
1203 rzy = pme->recipbox[ZZ][YY];
1204 rzz = pme->recipbox[ZZ][ZZ];
1206 maxkx = (nx+1)/2;
1207 maxky = (ny+1)/2;
1208 maxkz = nz/2+1;
1210 mhx = pme->work_mhx;
1211 mhy = pme->work_mhy;
1212 mhz = pme->work_mhz;
1213 m2 = pme->work_m2;
1214 denom = pme->work_denom;
1215 tmp1 = pme->work_tmp1;
1216 m2inv = pme->work_m2inv;
1218 for(iy=0;iy<local_ndata[YY];iy++)
1220 ky = iy + local_offset[YY];
1222 if (ky < maxky)
1224 my = ky;
1226 else
1228 my = (ky - ny);
1231 by = M_PI*vol*pme->bsp_mod[YY][ky];
1233 for(iz=0;iz<local_ndata[ZZ];iz++)
1235 kz = iz + local_offset[ZZ];
1237 mz = kz;
1239 bz = pme->bsp_mod[ZZ][kz];
1241 /* 0.5 correction for corner points */
1242 corner_fac = 1;
1243 if (kz == 0)
1244 corner_fac = 0.5;
1245 if (kz == (nz+1)/2)
1246 corner_fac = 0.5;
1248 p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
1250 /* We should skip the k-space point (0,0,0) */
1251 if (local_offset[XX] > 0 ||
1252 local_offset[YY] > 0 || ky > 0 ||
1253 kz > 0)
1255 kxstart = local_offset[XX];
1257 else
1259 kxstart = local_offset[XX] + 1;
1260 p0++;
1262 kxend = local_offset[XX] + local_ndata[XX];
1264 if (bEnerVir)
1266 /* More expensive inner loop, especially because of the storage
1267 * of the mh elements in arrays.
1268 * Because x is the minor grid index, all mh elements
1269 * depend on kx for triclinic unit cells.
1272 /* Two explicit loops to avoid a conditional inside the loop */
1273 for(kx=kxstart; kx<maxkx; kx++)
1275 mx = kx;
1277 mhxk = mx * rxx;
1278 mhyk = mx * ryx + my * ryy;
1279 mhzk = mx * rzx + my * rzy + mz * rzz;
1280 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1281 mhx[kx] = mhxk;
1282 mhy[kx] = mhyk;
1283 mhz[kx] = mhzk;
1284 m2[kx] = m2k;
1285 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1286 tmp1[kx] = -factor*m2k;
1289 for(kx=maxkx; kx<kxend; kx++)
1291 mx = (kx - nx);
1293 mhxk = mx * rxx;
1294 mhyk = mx * ryx + my * ryy;
1295 mhzk = mx * rzx + my * rzy + mz * rzz;
1296 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1297 mhx[kx] = mhxk;
1298 mhy[kx] = mhyk;
1299 mhz[kx] = mhzk;
1300 m2[kx] = m2k;
1301 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1302 tmp1[kx] = -factor*m2k;
1305 for(kx=kxstart; kx<kxend; kx++)
1307 m2inv[kx] = 1.0/m2[kx];
1309 for(kx=kxstart; kx<kxend; kx++)
1311 denom[kx] = 1.0/denom[kx];
1314 CALC_EXPONENTIALS(kxstart,kxend,tmp1);
1316 for(kx=kxstart; kx<kxend; kx++,p0++)
1318 d1 = p0->re;
1319 d2 = p0->im;
1321 eterm = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
1323 p0->re = d1*eterm;
1324 p0->im = d2*eterm;
1326 struct2 = 2.0*(d1*d1+d2*d2);
1328 tmp1[kx] = eterm*struct2;
1331 for(kx=kxstart; kx<kxend; kx++)
1333 ets2 = corner_fac*tmp1[kx];
1334 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
1335 energy += ets2;
1337 ets2vf = ets2*vfactor;
1338 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
1339 virxy += ets2vf*mhx[kx]*mhy[kx];
1340 virxz += ets2vf*mhx[kx]*mhz[kx];
1341 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
1342 viryz += ets2vf*mhy[kx]*mhz[kx];
1343 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
1346 else
1348 /* We don't need to calculate the energy and the virial.
1349 * In this case the triclinic overhead is small.
1352 /* Two explicit loops to avoid a conditional inside the loop */
1354 for(kx=kxstart; kx<maxkx; kx++)
1356 mx = kx;
1358 mhxk = mx * rxx;
1359 mhyk = mx * ryx + my * ryy;
1360 mhzk = mx * rzx + my * rzy + mz * rzz;
1361 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1362 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1363 tmp1[kx] = -factor*m2k;
1366 for(kx=maxkx; kx<kxend; kx++)
1368 mx = (kx - nx);
1370 mhxk = mx * rxx;
1371 mhyk = mx * ryx + my * ryy;
1372 mhzk = mx * rzx + my * rzy + mz * rzz;
1373 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1374 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1375 tmp1[kx] = -factor*m2k;
1378 for(kx=kxstart; kx<kxend; kx++)
1380 denom[kx] = 1.0/denom[kx];
1383 CALC_EXPONENTIALS(kxstart,kxend,tmp1);
1385 for(kx=kxstart; kx<kxend; kx++,p0++)
1387 d1 = p0->re;
1388 d2 = p0->im;
1390 eterm = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
1392 p0->re = d1*eterm;
1393 p0->im = d2*eterm;
1399 if (bEnerVir)
1401 /* Update virial with local values.
1402 * The virial is symmetric by definition.
1403 * this virial seems ok for isotropic scaling, but I'm
1404 * experiencing problems on semiisotropic membranes.
1405 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
1407 vir[XX][XX] = 0.25*virxx;
1408 vir[YY][YY] = 0.25*viryy;
1409 vir[ZZ][ZZ] = 0.25*virzz;
1410 vir[XX][YY] = vir[YY][XX] = 0.25*virxy;
1411 vir[XX][ZZ] = vir[ZZ][XX] = 0.25*virxz;
1412 vir[YY][ZZ] = vir[ZZ][YY] = 0.25*viryz;
1414 /* This energy should be corrected for a charged system */
1415 *mesh_energy = 0.5*energy;
1418 /* Return the loop count */
1419 return local_ndata[YY]*local_ndata[ZZ]*local_ndata[XX];
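/* Illustrative sketch (not part of the original source): the per-k-vector
 * factor applied in solve_pme_yzx above. For a reciprocal vector of squared
 * length m2 the grid value is multiplied by
 *   coulomb * exp(-pi^2*m2/beta^2) / (pi * V * m2 * Bx*By*Bz)
 * where Bx,By,Bz are the B-spline moduli and coulomb stands for
 * ONE_4PI_EPS0/epsilon_r. All quantities are passed in explicitly so the
 * sketch stays self-contained; the function name is hypothetical.
 */
static real example_solve_factor(real coulomb, real ewaldcoeff, real vol,
                                 real m2, real bsp_mod_prod)
{
    real factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);

    /* eterm in solve_pme_yzx: exp(-factor*m2)/denom with
     * denom = m2 * (M_PI*vol*bsp_mod_y) * bsp_mod_z * bsp_mod_x */
    return coulomb*exp(-factor*m2)/(M_PI*vol*m2*bsp_mod_prod);
}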
1423 #define DO_FSPLINE(order) \
1424 for(ithx=0; (ithx<order); ithx++) \
1426 index_x = (i0+ithx)*pny*pnz; \
1427 tx = thx[ithx]; \
1428 dx = dthx[ithx]; \
1430 for(ithy=0; (ithy<order); ithy++) \
1432 index_xy = index_x+(j0+ithy)*pnz; \
1433 ty = thy[ithy]; \
1434 dy = dthy[ithy]; \
1435 fxy1 = fz1 = 0; \
1437 for(ithz=0; (ithz<order); ithz++) \
1439 gval = grid[index_xy+(k0+ithz)]; \
1440 fxy1 += thz[ithz]*gval; \
1441 fz1 += dthz[ithz]*gval; \
1443 fx += dx*ty*fxy1; \
1444 fy += tx*dy*fxy1; \
1445 fz += tx*ty*fz1; \
1450 void gather_f_bsplines(gmx_pme_t pme,real *grid,
1451 bool bClearF,pme_atomcomm_t *atc,real scale)
1453 /* sum forces for local particles */
1454 int nn,n,ithx,ithy,ithz,i0,j0,k0;
1455 int index_x,index_xy;
1456 int nx,ny,nz,pnx,pny,pnz;
1457 int * idxptr;
1458 real tx,ty,dx,dy,qn;
1459 real fx,fy,fz,gval;
1460 real fxy1,fz1;
1461 real *thx,*thy,*thz,*dthx,*dthy,*dthz;
1462 int norder;
1463 real rxx,ryx,ryy,rzx,rzy,rzz;
1464 int order;
1466 order = pme->pme_order;
1467 thx = atc->theta[XX];
1468 thy = atc->theta[YY];
1469 thz = atc->theta[ZZ];
1470 dthx = atc->dtheta[XX];
1471 dthy = atc->dtheta[YY];
1472 dthz = atc->dtheta[ZZ];
1473 nx = pme->nkx;
1474 ny = pme->nky;
1475 nz = pme->nkz;
1476 pnx = pme->pmegrid_nx;
1477 pny = pme->pmegrid_ny;
1478 pnz = pme->pmegrid_nz;
1480 rxx = pme->recipbox[XX][XX];
1481 ryx = pme->recipbox[YY][XX];
1482 ryy = pme->recipbox[YY][YY];
1483 rzx = pme->recipbox[ZZ][XX];
1484 rzy = pme->recipbox[ZZ][YY];
1485 rzz = pme->recipbox[ZZ][ZZ];
1487 for(nn=0; (nn<atc->n); nn++)
1489 n = nn;
1490 qn = scale*atc->q[n];
1492 if (bClearF)
1494 atc->f[n][XX] = 0;
1495 atc->f[n][YY] = 0;
1496 atc->f[n][ZZ] = 0;
1498 if (qn != 0)
1500 fx = 0;
1501 fy = 0;
1502 fz = 0;
1503 idxptr = atc->idx[n];
1504 norder = n*order;
1506 i0 = idxptr[XX];
1507 j0 = idxptr[YY];
1508 k0 = idxptr[ZZ];
1510 /* Pointer arithmetic alert, next six statements */
1511 thx = atc->theta[XX] + norder;
1512 thy = atc->theta[YY] + norder;
1513 thz = atc->theta[ZZ] + norder;
1514 dthx = atc->dtheta[XX] + norder;
1515 dthy = atc->dtheta[YY] + norder;
1516 dthz = atc->dtheta[ZZ] + norder;
1518 switch (order) {
1519 case 4: DO_FSPLINE(4); break;
1520 case 5: DO_FSPLINE(5); break;
1521 default: DO_FSPLINE(order); break;
1524 atc->f[n][XX] += -qn*( fx*nx*rxx );
1525 atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
1526 atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
1529 /* Since the energy and not forces are interpolated
1530 * the net force might not be exactly zero.
1531 * This can be solved by also interpolating F, but
1532 * that comes at a cost.
1533 * A better hack is to remove the net force every
1534 * step, but that must be done at a higher level
1535 * since this routine doesn't see all atoms if running
1536 * in parallel. Don't know how important it is? EL 990726
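/* Illustrative sketch (not part of the original source): the chain rule used
 * at the end of gather_f_bsplines above. fx,fy,fz are the derivatives of the
 * interpolated potential with respect to the (scaled) fractional grid
 * coordinates, so the Cartesian force on charge q follows from
 * d(fraction_d)/d(r_alpha) = recip[alpha][d] times the grid dimension n_d.
 * Terms with the upper-triangular recip elements vanish because GROMACS boxes
 * are lower triangular. The function name is hypothetical.
 */
static void example_force_from_spline_derivs(real q, real fx, real fy, real fz,
                                             int nx, int ny, int nz,
                                             matrix recip, rvec f)
{
    f[XX] += -q*( fx*nx*recip[XX][XX] );
    f[YY] += -q*( fx*nx*recip[YY][XX] + fy*ny*recip[YY][YY] );
    f[ZZ] += -q*( fx*nx*recip[ZZ][XX] + fy*ny*recip[ZZ][YY] + fz*nz*recip[ZZ][ZZ] );
}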
1540 static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
1541 pme_atomcomm_t *atc)
1543 int n,ithx,ithy,ithz,i0,j0,k0;
1544 int index_x,index_xy;
1545 int * idxptr;
1546 real energy,pot,tx,ty,qn,gval;
1547 real *thx,*thy,*thz;
1548 int norder;
1549 int order;
1552 order = pme->pme_order;
1553 thx = atc->theta[XX];
1554 thy = atc->theta[YY];
1555 thz = atc->theta[ZZ];
1557 energy = 0;
1558 for(n=0; (n<atc->n); n++) {
1559 qn = atc->q[n];
1561 if (qn != 0) {
1562 idxptr = atc->idx[n];
1563 norder = n*order;
1565 i0 = idxptr[XX];
1566 j0 = idxptr[YY];
1567 k0 = idxptr[ZZ];
1569 /* Pointer arithmetic alert, next three statements */
1570 thx = atc->theta[XX] + norder;
1571 thy = atc->theta[YY] + norder;
1572 thz = atc->theta[ZZ] + norder;
1574 pot = 0;
1575 for(ithx=0; (ithx<order); ithx++)
1577 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
1578 tx = thx[ithx];
1580 for(ithy=0; (ithy<order); ithy++)
1582 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
1583 ty = thy[ithy];
1585 for(ithz=0; (ithz<order); ithz++)
1587 gval = grid[index_xy+(k0+ithz)];
1588 pot += tx*ty*thz[ithz]*gval;
1594 energy += pot*qn;
1598 return energy;
1601 void make_bsplines(splinevec theta,splinevec dtheta,int order,
1602 rvec fractx[],int nr,real charge[],
1603 bool bFreeEnergy)
1605 /* construct splines for local atoms */
1606 int i,j,k,l;
1607 real dr,div;
1608 real *data,*ddata,*xptr;
1610 for(i=0; (i<nr); i++) {
1611 /* With free energy we do not use the charge check.
1612 * In most cases this will be more efficient than calling make_bsplines
1613 * twice, since usually more than half the particles have charges.
1615 if (bFreeEnergy || charge[i] != 0.0) {
1616 xptr = fractx[i];
1617 for(j=0; (j<DIM); j++) {
1618 dr = xptr[j];
1620 /* dr is relative offset from lower cell limit */
1621 data=&(theta[j][i*order]);
1622 data[order-1]=0;
1623 data[1]=dr;
1624 data[0]=1-dr;
1626 for(k=3; (k<order); k++) {
1627 div=1.0/(k-1.0);
1628 data[k-1]=div*dr*data[k-2];
1629 for(l=1; (l<(k-1)); l++) {
1630 data[k-l-1]=div*((dr+l)*data[k-l-2]+(k-l-dr)*
1631 data[k-l-1]);
1633 data[0]=div*(1-dr)*data[0];
1635 /* differentiate */
1636 ddata = &(dtheta[j][i*order]);
1637 ddata[0] = -data[0];
1638 for(k=1; (k<order); k++) {
1639 ddata[k]=data[k-1]-data[k];
1642 div=1.0/(order-1);
1643 data[order-1]=div*dr*data[order-2];
1644 for(l=1; (l<(order-1)); l++) {
1645 data[order-l-1]=div*((dr+l)*data[order-l-2]+
1646 (order-l-dr)*data[order-l-1]);
1648 data[0]=div*(1-dr)*data[0];
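/* Illustrative sketch (not part of the original source): the recursion used in
 * make_bsplines, collected into one routine. For a fractional offset dr it
 * produces the "order" cardinal B-spline weights (order >= 4 in actual use,
 * w must hold at least order elements). Running k up to and including "order"
 * is equivalent to the loop above plus the final step after differentiation.
 * Because the weights form a partition of unity they always sum to 1, which
 * is a convenient sanity check.
 */
static void example_bspline_weights(real dr, int order, real *w)
{
    int  k, l;
    real div;

    for (k = 0; k < order; k++)
    {
        w[k] = 0;
    }
    w[1] = dr;
    w[0] = 1 - dr;

    for (k = 3; k <= order; k++)
    {
        div    = 1.0/(k - 1.0);
        w[k-1] = div*dr*w[k-2];
        for (l = 1; l < k-1; l++)
        {
            w[k-l-1] = div*((dr + l)*w[k-l-2] + (k - l - dr)*w[k-l-1]);
        }
        w[0] = div*(1 - dr)*w[0];
    }
}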
1655 void make_dft_mod(real *mod,real *data,int ndata)
1657 int i,j;
1658 real sc,ss,arg;
1660 for(i=0;i<ndata;i++) {
1661 sc=ss=0;
1662 for(j=0;j<ndata;j++) {
1663 arg=(2.0*M_PI*i*j)/ndata;
1664 sc+=data[j]*cos(arg);
1665 ss+=data[j]*sin(arg);
1667 mod[i]=sc*sc+ss*ss;
1669 for(i=0;i<ndata;i++)
1670 if(mod[i]<1e-7)
1671 mod[i]=(mod[i-1]+mod[i+1])*0.5;
1676 void make_bspline_moduli(splinevec bsp_mod,int nx,int ny,int nz,int order)
1678 int nmax=max(nx,max(ny,nz));
1679 real *data,*ddata,*bsp_data;
1680 int i,k,l;
1681 real div;
1683 snew(data,order);
1684 snew(ddata,order);
1685 snew(bsp_data,nmax);
1687 data[order-1]=0;
1688 data[1]=0;
1689 data[0]=1;
1691 for(k=3;k<order;k++) {
1692 div=1.0/(k-1.0);
1693 data[k-1]=0;
1694 for(l=1;l<(k-1);l++)
1695 data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
1696 data[0]=div*data[0];
1698 /* differentiate */
1699 ddata[0]=-data[0];
1700 for(k=1;k<order;k++)
1701 ddata[k]=data[k-1]-data[k];
1702 div=1.0/(order-1);
1703 data[order-1]=0;
1704 for(l=1;l<(order-1);l++)
1705 data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
1706 data[0]=div*data[0];
1708 for(i=0;i<nmax;i++)
1709 bsp_data[i]=0;
1710 for(i=1;i<=order;i++)
1711 bsp_data[i]=data[i-1];
1713 make_dft_mod(bsp_mod[XX],bsp_data,nx);
1714 make_dft_mod(bsp_mod[YY],bsp_data,ny);
1715 make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
1717 sfree(data);
1718 sfree(ddata);
1719 sfree(bsp_data);
1722 static void setup_coordinate_communication(pme_atomcomm_t *atc)
1724 int nslab,n,i;
1725 int fw,bw;
1727 nslab = atc->nslab;
1729 n = 0;
1730 for(i=1; i<=nslab/2; i++) {
1731 fw = (atc->nodeid + i) % nslab;
1732 bw = (atc->nodeid - i + nslab) % nslab;
1733 if (n < nslab - 1) {
1734 atc->node_dest[n] = fw;
1735 atc->node_src[n] = bw;
1736 n++;
1738 if (n < nslab - 1) {
1739 atc->node_dest[n] = bw;
1740 atc->node_src[n] = fw;
1741 n++;
1746 int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
1748 if(NULL != log)
1750 fprintf(log,"Destroying PME data structures.\n");
1753 sfree((*pmedata)->nnx);
1754 sfree((*pmedata)->nny);
1755 sfree((*pmedata)->nnz);
1757 sfree((*pmedata)->pmegridA);
1758 sfree((*pmedata)->fftgridA);
1759 sfree((*pmedata)->cfftgridA);
1760 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
1762 if((*pmedata)->pmegridB)
1764 sfree((*pmedata)->pmegridB);
1765 sfree((*pmedata)->fftgridB);
1766 sfree((*pmedata)->cfftgridB);
1767 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
1769 sfree((*pmedata)->work_mhz);
1770 sfree((*pmedata)->work_m2);
1771 sfree((*pmedata)->work_denom);
1772 sfree((*pmedata)->work_tmp1_alloc);
1773 sfree((*pmedata)->work_m2inv);
1775 sfree(*pmedata);
1776 *pmedata = NULL;
1778 return 0;
1781 static int mult_up(int n,int f)
1783 return ((n + f - 1)/f)*f;
1787 static double pme_load_imbalance(gmx_pme_t pme)
1789 int nma,nmi;
1790 double n1,n2,n3;
1792 nma = pme->nnodes_major;
1793 nmi = pme->nnodes_minor;
1795 n1 = mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
1796 n2 = mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
1797 n3 = mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;
1799 /* pme_solve is roughly double the cost of an fft */
1801 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
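/* Illustrative sketch (not part of the original source): the padding that
 * drives pme_load_imbalance above. Each FFT/solve stage works on the grid
 * rounded up to multiples of the node counts, e.g. mult_up(25,4) = 28, so a
 * hypothetical 25x25x25 grid on 4x3 PME nodes is treated as 28x27x25 in the
 * first stage; the ratio of padded to real work is the reported imbalance.
 */
static double example_padded_volume(int nkx, int nky, int nkz,
                                    int nnodes_major, int nnodes_minor)
{
    /* grid volume after rounding the two decomposed dimensions up to
     * multiples of the node counts, using mult_up() defined above */
    return (double)mult_up(nkx, nnodes_major)*mult_up(nky, nnodes_minor)*nkz;
}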
1804 static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
1805 int dimind,bool bSpread)
1807 int nk,k,s;
1809 atc->dimind = dimind;
1810 atc->nslab = 1;
1811 atc->nodeid = 0;
1812 atc->pd_nalloc = 0;
1813 #ifdef GMX_MPI
1814 if (pme->nnodes > 1)
1816 atc->mpi_comm = pme->mpi_comm_d[dimind];
1817 MPI_Comm_size(atc->mpi_comm,&atc->nslab);
1818 MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
1820 if (debug)
1822 fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);
1824 #endif
1826 atc->bSpread = bSpread;
1827 atc->pme_order = pme->pme_order;
1829 if (atc->nslab > 1)
1831 /* These three allocations are not required for particle decomp. */
1832 snew(atc->node_dest,atc->nslab);
1833 snew(atc->node_src,atc->nslab);
1834 setup_coordinate_communication(atc);
1836 snew(atc->count,atc->nslab);
1837 snew(atc->rcount,atc->nslab);
1838 snew(atc->buf_index,atc->nslab);
1842 static void
1843 init_overlap_comm(pme_overlap_t * ol,
1844 int norder,
1845 #ifdef GMX_MPI
1846 MPI_Comm comm,
1847 #endif
1848 int nnodes,
1849 int nodeid,
1850 int ndata)
1852 int lbnd,rbnd,maxlr,b,i;
1853 int exten;
1854 int nn,nk;
1855 pme_grid_comm_t *pgc;
1856 bool bCont;
1857 int fft_start,fft_end,send_index1,recv_index1;
1859 #ifdef GMX_MPI
1860 ol->mpi_comm = comm;
1861 #endif
1863 ol->nnodes = nnodes;
1864 ol->nodeid = nodeid;
1866 ol->ndata = ndata;
1868 /* Linear translation of the PME grid won't affect reciprocal space
1869 * calculations, so to optimize we only interpolate "upwards",
1870 * which also means we only have to consider overlap in one direction.
1871 * I.e., particles on this node might also be spread to grid indices
1872 * that belong to higher nodes (modulo nnodes)
1875 snew(ol->s2g0,ol->nnodes+1);
1876 snew(ol->s2g1,ol->nnodes);
1877 if (debug) { fprintf(debug,"PME slab boundaries:"); }
1878 for(i=0; i<nnodes; i++)
1880 /* s2g0 the local interpolation grid start.
1881 * s2g1 the local interpolation grid end.
1882 * Because grid overlap communication only goes forward,
1883 * the grid slabs for the FFTs should be rounded down.
1885 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
1886 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
1888 if (debug)
1890 fprintf(debug," %3d %3d",ol->s2g0[i],ol->s2g1[i]);
1893 ol->s2g0[nnodes] = ndata;
1894 if (debug) { fprintf(debug,"\n"); }
1896 /* Determine with how many nodes we need to communicate the grid overlap */
1897 b = 0;
1900 b++;
1901 bCont = FALSE;
1902 for(i=0; i<nnodes; i++)
1904 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
1905 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
1907 bCont = TRUE;
1911 while (bCont && b < nnodes);
1912 ol->noverlap_nodes = b - 1;
1914 snew(ol->send_id,ol->noverlap_nodes);
1915 snew(ol->recv_id,ol->noverlap_nodes);
1916 for(b=0; b<ol->noverlap_nodes; b++)
1918 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
1919 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
1921 snew(ol->comm_data, ol->noverlap_nodes);
1923 for(b=0; b<ol->noverlap_nodes; b++)
1925 pgc = &ol->comm_data[b];
1926 /* Send */
1927 fft_start = ol->s2g0[ol->send_id[b]];
1928 fft_end = ol->s2g0[ol->send_id[b]+1];
1929 if (ol->send_id[b] < nodeid)
1931 fft_start += ol->ndata;
1932 fft_end += ol->ndata;
1934 send_index1 = ol->s2g1[nodeid];
1935 send_index1 = min(send_index1,fft_end);
1936 pgc->send_index0 = fft_start;
1937 pgc->send_nindex = max(0,send_index1 - pgc->send_index0);
1939 /* We always start receiving to the first index of our slab */
1940 fft_start = ol->s2g0[ol->nodeid];
1941 fft_end = ol->s2g0[ol->nodeid+1];
1942 recv_index1 = ol->s2g1[ol->recv_id[b]];
1943 if (ol->recv_id[b] > nodeid)
1945 recv_index1 -= ol->ndata;
1947 recv_index1 = min(recv_index1,fft_end);
1948 pgc->recv_index0 = fft_start;
1949 pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
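/* Illustrative sketch (not part of the original source): the slab boundaries
 * set up in init_overlap_comm above. s2g0[i] is the first grid line owned by
 * slab i and s2g1[i] the (exclusive) end of its interpolation grid, which
 * includes the norder-1 extra lines that must be communicated "upwards".
 * For example, ndata=30, nnodes=4, norder=4 gives s2g0 = {0,7,15,22} and
 * s2g1 = {11,18,26,33}, so a slab overlaps the next one by up to 4 lines.
 * s2g0 must hold nnodes+1 entries, s2g1 nnodes entries.
 */
static void example_slab_bounds(int ndata, int nnodes, int norder,
                                int *s2g0, int *s2g1)
{
    int i;

    for (i = 0; i < nnodes; i++)
    {
        s2g0[i] = ( i   *ndata             )/nnodes;
        s2g1[i] = ((i+1)*ndata + nnodes - 1)/nnodes + norder - 1;
    }
    s2g0[nnodes] = ndata;
}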
1953 static void
1954 make_gridindex5_to_localindex(int n,int local_start,int local_end,
1955 int **global_to_local,
1956 real **fraction_shift)
1958 int local_size,i;
1959 int * gtl;
1960 real * fsh;
1962 local_size = local_end - local_start;
1964 snew(gtl,5*n);
1965 snew(fsh,5*n);
1966 for(i=0; (i<5*n); i++) {
1967 /* Determine the global to local grid index */
1968 gtl[i] = (i - local_start + n) % n;
1969 /* For coordinates that fall within the local grid the fraction
1970 * is correct, we don't need to shift it.
1972 fsh[i] = 0;
1973 if (local_size < n)
1975 /* Due to rounding issues i could be 1 beyond the lower or
1976 * upper boundary of the local grid. Correct the index for this.
1977 * If we shift the index, we need to shift the fraction by
1978 * the same amount in the other direction to not affect
1979 * the weights.
1980 * Note that due to this shifting the weights at the end of
1981 * the spline might change, but that will only involve values
1982 * between zero and values close to the precision of a real,
1983 * which is anyhow the accuracy of the whole mesh calculation.
1985 /* With local_size=0 we should not change i=local_start */
1986 if (i % n != local_start)
1988 if (gtl[i] == n-1)
1990 gtl[i] = 0;
1991 fsh[i] = -1;
1993 else if (gtl[i] == local_size)
1995 gtl[i] = local_size - 1;
1996 fsh[i] = 1;
2002 *global_to_local = gtl;
2003 *fraction_shift = fsh;
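/* Illustrative sketch (not part of the original source): why the lookup tables
 * above have 5*n entries. calc_interpolation_idx adds 2.0 to the fractional
 * coordinate before scaling by n, so the raw integer index typically lands in
 * [2n,3n) and can drift slightly outside; a table covering [0,5n) safely holds
 * every such index. In the single-node case (local grid == full grid) the
 * table reduces to a plain modulo and the fraction shift stays zero.
 */
static int example_wraparound_index(int raw_index, int n)
{
    /* equivalent of the nnx/nny/nnz tables for local_start=0, local_size=n */
    return raw_index % n;
}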
2006 static void
2007 gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
2009 int nk_new;
2011 if (*nk % nnodes != 0)
2013 nk_new = nnodes*(*nk/nnodes + 1);
2015 if (2*nk_new >= 3*(*nk))
2017 gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
2018 dim,*nk,dim,nnodes,dim);
2021 if (fplog != NULL)
2023 fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
2024 dim,*nk,dim,nnodes,dim,nk_new,dim);
2027 *nk = nk_new;
2031 int gmx_pme_init(gmx_pme_t * pmedata,
2032 t_commrec * cr,
2033 int nnodes_major,
2034 int nnodes_minor,
2035 t_inputrec * ir,
2036 int homenr,
2037 bool bFreeEnergy,
2038 bool bReproducible)
2040 gmx_pme_t pme=NULL;
2042 pme_atomcomm_t *atc;
2043 int bufsizex,bufsizey,bufsize;
2044 ivec ndata;
2046 if (debug)
2047 fprintf(debug,"Creating PME data structures.\n");
2048 snew(pme,1);
2050 pme->redist_init = FALSE;
2051 pme->sum_qgrid_tmp = NULL;
2052 pme->sum_qgrid_dd_tmp = NULL;
2053 pme->buf_nalloc = 0;
2054 pme->redist_buf_nalloc = 0;
2056 pme->nnodes = 1;
2057 pme->bPPnode = TRUE;
2059 pme->nnodes_major = nnodes_major;
2060 pme->nnodes_minor = nnodes_minor;
2062 #ifdef GMX_MPI
2063 if (PAR(cr))
2065 pme->mpi_comm = cr->mpi_comm_mygroup;
2067 MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
2068 MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
2070 #endif
2072 if (pme->nnodes == 1)
2074 pme->ndecompdim = 0;
2075 pme->nodeid_major = 0;
2076 pme->nodeid_minor = 0;
2078 else
2080 if (nnodes_minor == 1)
2082 #ifdef GMX_MPI
2083 pme->mpi_comm_d[0] = pme->mpi_comm;
2084 pme->mpi_comm_d[1] = NULL;
2085 #endif
2086 pme->ndecompdim = 1;
2087 pme->nodeid_major = pme->nodeid;
2088 pme->nodeid_minor = 0;
2091 else if (nnodes_major == 1)
2093 #ifdef GMX_MPI
2094 pme->mpi_comm_d[0] = NULL;
2095 pme->mpi_comm_d[1] = pme->mpi_comm;
2096 #endif
2097 pme->ndecompdim = 1;
2098 pme->nodeid_major = 0;
2099 pme->nodeid_minor = pme->nodeid;
2101 else
2103 if (pme->nnodes % nnodes_major != 0)
2105 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
2107 pme->ndecompdim = 2;
2109 #ifdef GMX_MPI
2110 MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
2111 pme->nodeid,&pme->mpi_comm_d[0]); /* My communicator along major dimension */
2112 MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
2113 pme->nodeid,&pme->mpi_comm_d[1]); /* My communicator along minor dimension */
2115 MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
2116 MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
2117 MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
2118 MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
2119 #endif
2121 pme->bPPnode = (cr->duty & DUTY_PP);
2124 if (ir->ePBC == epbcSCREW)
2126 gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
2129 pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
2130 pme->nkx = ir->nkx;
2131 pme->nky = ir->nky;
2132 pme->nkz = ir->nkz;
2133 pme->pme_order = ir->pme_order;
2134 pme->epsilon_r = ir->epsilon_r;
2136 /* Currently pme.c supports only the fft5d FFT code.
2137 * Therefore the grid always needs to be divisible by nnodes.
2138 * When the old 1D code is also supported again, change this check.
2140 * This check should be done before calling gmx_pme_init
2141 * and fplog should be passed instead of stderr.
2143 if (pme->ndecompdim >= 2)
2145 if (pme->ndecompdim >= 1)
2148 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
2149 'x',nnodes_major,&pme->nkx);
2150 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
2151 'y',nnodes_minor,&pme->nky);
2155 if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
2156 pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
2157 pme->nkz <= pme->pme_order)
2159 gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_order for x and/or y",pme->pme_order);
2162 if (pme->nnodes > 1) {
2163 double imbal;
2165 #ifdef GMX_MPI
2166 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
2167 MPI_Type_commit(&(pme->rvec_mpi));
2168 #endif
2170 /* Note that the charge spreading and force gathering, which usually
2171 * takes about the same amount of time as FFT+solve_pme,
2172 * is always fully load balanced
2173 * (unless the charge distribution is inhomogeneous).
2176 imbal = pme_load_imbalance(pme);
2177 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
2179 fprintf(stderr,
2180 "\n"
2181 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
2182 " For optimal PME load balancing\n"
2183 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
2184 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
2185 "\n",
2186 (int)((imbal-1)*100 + 0.5),
2187 pme->nkx,pme->nky,pme->nnodes_major,
2188 pme->nky,pme->nkz,pme->nnodes_minor);
2192 init_overlap_comm(&pme->overlap[0],pme->pme_order,
2193 #ifdef GMX_MPI
2194 pme->mpi_comm_d[0],
2195 #endif
2196 pme->nnodes_major,pme->nodeid_major,pme->nkx);
2198 init_overlap_comm(&pme->overlap[1],pme->pme_order,
2199 #ifdef GMX_MPI
2200 pme->mpi_comm_d[1],
2201 #endif
2202 pme->nnodes_minor,pme->nodeid_minor,pme->nky);
2204 snew(pme->bsp_mod[XX],pme->nkx);
2205 snew(pme->bsp_mod[YY],pme->nky);
2206 snew(pme->bsp_mod[ZZ],pme->nkz);
2208 /* Allocate data for the interpolation grid, including overlap */
2209 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
2210 pme->overlap[0].s2g0[pme->nodeid_major];
2211 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
2212 pme->overlap[1].s2g0[pme->nodeid_minor];
2213 pme->pmegrid_nz = pme->nkz + pme->pme_order - 1;
2215 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
2216 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
2217 pme->pmegrid_start_iz = 0;
2219 make_gridindex5_to_localindex(pme->nkx,
2220 pme->pmegrid_start_ix,
2221 pme->overlap[0].s2g0[pme->nodeid_major+1],
2222 &pme->nnx,&pme->fshx);
2223 make_gridindex5_to_localindex(pme->nky,
2224 pme->pmegrid_start_iy,
2225 pme->overlap[1].s2g0[pme->nodeid_minor+1],
2226 &pme->nny,&pme->fshy);
2227 make_gridindex5_to_localindex(pme->nkz,
2228 pme->pmegrid_start_iz,
2229 pme->nkz,
2230 &pme->nnz,&pme->fshz);
2232 snew(pme->pmegridA,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
2234 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
2235 bufsizex = pme->pme_order*pme->pmegrid_ny*pme->pmegrid_nz;
2236 bufsizey = pme->pmegrid_nx*pme->pme_order*pme->nkz;
2237 bufsize = (bufsizex>bufsizey) ? bufsizex : bufsizey;
2239 snew(pme->pmegrid_sendbuf,bufsize);
2240 snew(pme->pmegrid_recvbuf,bufsize);
2242 ndata[0] = pme->nkx;
2243 ndata[1] = pme->nky;
2244 ndata[2] = pme->nkz;
2246 /* This routine will allocate the grid data to fit the FFTs */
2247 gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
2248 &pme->fftgridA,&pme->cfftgridA,
2249 pme->mpi_comm_d,
2250 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
2251 bReproducible);
2253 if (bFreeEnergy)
2255 snew(pme->pmegridB,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
2256 gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
2257 &pme->fftgridB,&pme->cfftgridB,
2258 pme->mpi_comm_d,
2259 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
2260 bReproducible);
2261 } else
2263 pme->pmegridB = NULL;
2264 pme->fftgridB = NULL;
2265 pme->cfftgridB = NULL;
2268 make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
2270 /* Use atc[0] for spreading */
2271 init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
2272 if (pme->ndecompdim >= 2)
2274 init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
2277 if (pme->nnodes == 1) {
2278 pme->atc[0].n = homenr;
2279 pme_realloc_atomcomm_things(&pme->atc[0]);
2282 /* Use fft5d, order after FFT is y major, z, x minor */
2283 pme->work_nalloc = pme->nkx;
2284 snew(pme->work_mhx,pme->work_nalloc);
2285 snew(pme->work_mhy,pme->work_nalloc);
2286 snew(pme->work_mhz,pme->work_nalloc);
2287 snew(pme->work_m2,pme->work_nalloc);
2288 snew(pme->work_denom,pme->work_nalloc);
2289 /* Allocate an aligned pointer for SSE operations, including 3 extra
2290 * elements at the end since SSE operates on 4 elements at a time.
2292 snew(pme->work_tmp1_alloc,pme->work_nalloc+8);
2293 pme->work_tmp1 = (real *) (((size_t) pme->work_tmp1_alloc + 16) & (~((size_t) 15)));
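    /* Example of the alignment arithmetic: if work_tmp1_alloc starts at
     * address 0x...1238, adding 16 gives 0x...1248 and clearing the low four
     * bits yields 0x...1240, the next 16-byte (SSE register width) boundary.
     * The 8 extra reals allocated above cover both this shift (at most 16
     * bytes) and the up-to-3-element overrun of the 4-wide SSE loops, in
     * single as well as double precision.
     */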
2294 snew(pme->work_m2inv,pme->work_nalloc);
2296 *pmedata = pme;
2298 return 0;
2301 static void spread_on_grid(gmx_pme_t pme,
2302 pme_atomcomm_t *atc,real *grid,
2303 bool bCalcSplines,bool bSpread)
2305 if (bCalcSplines)
2308 /* Compute fftgrid index for all atoms,
2309 * with the help of some extra variables.
2311 calc_interpolation_idx(pme,atc);
2313 /* make local bsplines */
2314 make_bsplines(atc->theta,atc->dtheta,pme->pme_order,
2315 atc->fractx,atc->n,atc->q,pme->bFEP);
2318 if (bSpread)
2320 /* put local atoms on grid. */
2321 spread_q_bsplines(pme,atc,grid);
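/* Minimal sketch of the cardinal B-spline recursion that make_bsplines()
 * evaluates for every atom and dimension (weights only; the production code
 * also fills dtheta with the derivatives and has an SSE2 variant).
 * bspline_weights() is a hypothetical stand-alone helper, for illustration
 * only.
 */
static void bspline_weights(double dr, int order, double *w)
{
    /* dr is the fractional grid coordinate in [0,1); on return w[0..order-1]
     * hold the spreading weights, which sum to 1.
     */
    int    j, k;
    double div;

    w[order-1] = 0.0;
    w[1]       = dr;
    w[0]       = 1.0 - dr;
    for (k = 3; k <= order; k++)
    {
        div    = 1.0/(k - 1);
        w[k-1] = div*dr*w[k-2];
        for (j = 1; j < k-1; j++)
        {
            w[k-1-j] = div*((dr + j)*w[k-2-j] + (k - j - dr)*w[k-1-j]);
        }
        w[0] = div*(1.0 - dr)*w[0];
    }
    /* e.g. order=4, dr=0.5 gives 1/48, 23/48, 23/48, 1/48 */
}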
2325 void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
2327 pme_atomcomm_t *atc;
2328 real *grid;
2330 if (pme->nnodes > 1)
2332 gmx_incons("gmx_pme_calc_energy called in parallel");
2334 if (pme->bFEP)
2336 gmx_incons("gmx_pme_calc_energy with free energy");
2339 atc = &pme->atc_energy;
2340 atc->nslab = 1;
2341 atc->bSpread = TRUE;
2342 atc->pme_order = pme->pme_order;
2343 atc->n = n;
2344 pme_realloc_atomcomm_things(atc);
2345 atc->x = x;
2346 atc->q = q;
2348 /* We only use the A-charges grid */
2349 grid = pme->pmegridA;
2351 spread_on_grid(pme,atc,grid,TRUE,FALSE);
2353 *V = gather_energy_bsplines(pme,grid,atc);
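/* Usage sketch (single node only, illustrative variable names): with a pme
 * handle for which gmx_pme_do() has just filled the A grid,
 *     real V;
 *     gmx_pme_calc_energy(pme, n_test, x_test, q_test, &V);
 * computes B-spline coefficients for the n_test extra charges (note that
 * spread_on_grid() is called with bSpread=FALSE, so the grid itself is left
 * untouched) and returns in V their energy gathered from that grid.
 */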
2357 static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
2358 t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
2360 /* Reset all the counters related to performance over the run */
2361 wallcycle_stop(wcycle,ewcRUN);
2362 wallcycle_reset_all(wcycle);
2363 init_nrnb(nrnb);
2364 ir->init_step += step_rel;
2365 ir->nsteps -= step_rel;
2366 wallcycle_start(wcycle,ewcRUN);
2370 int gmx_pmeonly(gmx_pme_t pme,
2371 t_commrec *cr, t_nrnb *nrnb,
2372 gmx_wallcycle_t wcycle,
2373 real ewaldcoeff, bool bGatherOnly,
2374 t_inputrec *ir)
2376 gmx_pme_pp_t pme_pp;
2377 int natoms;
2378 matrix box;
2379 rvec *x_pp=NULL,*f_pp=NULL;
2380 real *chargeA=NULL,*chargeB=NULL;
2381 real lambda=0;
2382 int maxshift0=0,maxshift1=0;
2383 real energy,dvdlambda;
2384 matrix vir;
2385 float cycles;
2386 int count;
2387 bool bEnerVir;
2388 gmx_large_int_t step,step_rel;
2391 pme_pp = gmx_pme_pp_init(cr);
2393 init_nrnb(nrnb);
2395 count = 0;
2396 do /****** this is a quasi-loop over time steps! */
2398 /* Domain decomposition */
2399 natoms = gmx_pme_recv_q_x(pme_pp,
2400 &chargeA,&chargeB,box,&x_pp,&f_pp,
2401 &maxshift0,&maxshift1,
2402 &pme->bFEP,&lambda,
2403 &bEnerVir,
2404 &step);
2406 if (natoms == -1) {
2407 /* We should stop: break out of the loop */
2408 break;
2411 step_rel = step - ir->init_step;
2413 if (count == 0)
2414 wallcycle_start(wcycle,ewcRUN);
2416 wallcycle_start(wcycle,ewcPMEMESH);
2418 dvdlambda = 0;
2419 clear_mat(vir);
2420 gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
2421 cr,maxshift0,maxshift1,nrnb,wcycle,vir,ewaldcoeff,
2422 &energy,lambda,&dvdlambda,
2423 GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
2425 cycles = wallcycle_stop(wcycle,ewcPMEMESH);
2427 gmx_pme_send_force_vir_ener(pme_pp,
2428 f_pp,vir,energy,dvdlambda,
2429 cycles);
2431 count++;
2433 if (step_rel == wcycle_get_reset_counters(wcycle))
2435 /* Reset all the counters related to performance over the run */
2436 reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
2437 wcycle_set_reset_counters(wcycle, 0);
2440 } /***** end of quasi-loop, we stop with the break above */
2441 while (TRUE);
2443 return 0;
2446 int gmx_pme_do(gmx_pme_t pme,
2447 int start, int homenr,
2448 rvec x[], rvec f[],
2449 real *chargeA, real *chargeB,
2450 matrix box, t_commrec *cr,
2451 int maxshift0, int maxshift1,
2452 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
2453 matrix vir, real ewaldcoeff,
2454 real *energy, real lambda,
2455 real *dvdlambda, int flags)
2457 int q,d,i,j,ntot,npme;
2458 int nx,ny,nz;
2459 int n_d,local_ny;
2460 int loop_count;
2461 pme_atomcomm_t *atc=NULL;
2462 real * grid=NULL;
2463 real *ptr;
2464 rvec *x_d,*f_d;
2465 real *charge=NULL,*q_d,vol;
2466 real energy_AB[2];
2467 matrix vir_AB[2];
2468 bool bClearF;
2469 gmx_parallel_3dfft_t pfft_setup;
2470 real * fftgrid;
2471 t_complex * cfftgrid;
2473 if (pme->nnodes > 1) {
2474 atc = &pme->atc[0];
2475 atc->npd = homenr;
2476 if (atc->npd > atc->pd_nalloc) {
2477 atc->pd_nalloc = over_alloc_dd(atc->npd);
2478 srenew(atc->pd,atc->pd_nalloc);
2480 atc->maxshift = (atc->dimind==0 ? maxshift0 : maxshift1);
2483 for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
2484 if (q == 0) {
2485 grid = pme->pmegridA;
2486 fftgrid = pme->fftgridA;
2487 cfftgrid = pme->cfftgridA;
2488 pfft_setup = pme->pfft_setupA;
2489 charge = chargeA+start;
2490 } else {
2491 grid = pme->pmegridB;
2492 fftgrid = pme->fftgridB;
2493 cfftgrid = pme->cfftgridB;
2494 pfft_setup = pme->pfft_setupB;
2495 charge = chargeB+start;
2497 /* Unpack structure */
2498 if (debug) {
2499 fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
2500 cr->nnodes,cr->nodeid);
2501 fprintf(debug,"Grid = %p\n",(void*)grid);
2502 if (grid == NULL)
2503 gmx_fatal(FARGS,"No grid!");
2505 where();
2507 m_inv_ur0(box,pme->recipbox);
2509 if (pme->nnodes == 1) {
2510 atc = &pme->atc[0];
2511 if (DOMAINDECOMP(cr)) {
2512 atc->n = homenr;
2513 pme_realloc_atomcomm_things(atc);
2515 atc->x = x;
2516 atc->q = charge;
2517 atc->f = f;
2518 } else {
2519 wallcycle_start(wcycle,ewcPME_REDISTXF);
2520 for(d=pme->ndecompdim-1; d>=0; d--)
2522 if (d == pme->ndecompdim-1)
2524 n_d = homenr;
2525 x_d = x + start;
2526 q_d = charge;
2528 else
2530 n_d = pme->atc[d+1].n;
2531 x_d = atc->x;
2532 q_d = atc->q;
2534 atc = &pme->atc[d];
2535 atc->npd = n_d;
2536 if (atc->npd > atc->pd_nalloc) {
2537 atc->pd_nalloc = over_alloc_dd(atc->npd);
2538 srenew(atc->pd,atc->pd_nalloc);
2540 atc->maxshift = (atc->dimind==0 ? maxshift0 : maxshift1);
2541 pme_calc_pidx(n_d,pme->recipbox,x_d,atc);
2542 where();
2544 GMX_BARRIER(cr->mpi_comm_mygroup);
2545 /* Redistribute x (only once) and qA or qB */
2546 if (DOMAINDECOMP(cr)) {
2547 dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
2548 } else {
2549 pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
2552 where();
2554 wallcycle_stop(wcycle,ewcPME_REDISTXF);
2557 if (debug)
2558 fprintf(debug,"Node= %6d, pme local particles=%6d\n",
2559 cr->nodeid,atc->n);
2561 if (flags & GMX_PME_SPREAD_Q)
2563 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
2565 /* Spread the charges on a grid */
2566 GMX_MPE_LOG(ev_spread_on_grid_start);
2568 /* Spread the charges on a grid */
2569 spread_on_grid(pme,&pme->atc[0],grid,q==0,TRUE);
2570 GMX_MPE_LOG(ev_spread_on_grid_finish);
2572 if (q == 0)
2574 inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
2576 inc_nrnb(nrnb,eNR_SPREADQBSP,
2577 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
2579 wrap_periodic_pmegrid(pme,grid);
2581 /* sum contributions to local grid from other nodes */
2582 #ifdef GMX_MPI
2583 if (pme->nnodes > 1) {
2584 GMX_BARRIER(cr->mpi_comm_mygroup);
2585 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
2586 where();
2588 #endif
2589 where();
2591 copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
2593 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
2596 if (flags & GMX_PME_SOLVE)
2598 /* do 3d-fft */
2599 GMX_BARRIER(cr->mpi_comm_mygroup);
2600 GMX_MPE_LOG(ev_gmxfft3d_start);
2601 wallcycle_start(wcycle,ewcPME_FFT);
2602 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,fftgrid,cfftgrid);
2603 wallcycle_stop(wcycle,ewcPME_FFT);
2604 GMX_MPE_LOG(ev_gmxfft3d_finish);
2605 where();
2607 /* solve in k-space for our local cells */
2608 vol = det(box);
2609 GMX_BARRIER(cr->mpi_comm_mygroup);
2610 GMX_MPE_LOG(ev_solve_pme_start);
2611 wallcycle_start(wcycle,ewcPME_SOLVE);
2612 loop_count =
2613 solve_pme_yzx(pme,cfftgrid,ewaldcoeff,vol,
2614 flags & GMX_PME_CALC_ENER_VIR,
2615 &energy_AB[q],vir_AB[q]);
2616 wallcycle_stop(wcycle,ewcPME_SOLVE);
2617 where();
2618 GMX_MPE_LOG(ev_solve_pme_finish);
2619 inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
2622 if (flags & GMX_PME_CALC_F)
2625 /* do 3d-invfft */
2626 GMX_BARRIER(cr->mpi_comm_mygroup);
2627 GMX_MPE_LOG(ev_gmxfft3d_start);
2628 where();
2629 wallcycle_start(wcycle,ewcPME_FFT);
2630 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,cfftgrid,fftgrid);
2631 wallcycle_stop(wcycle,ewcPME_FFT);
2633 where();
2634 GMX_MPE_LOG(ev_gmxfft3d_finish);
2636 if (pme->nodeid == 0)
2638 ntot = pme->nkx*pme->nky*pme->nkz;
2639 npme = ntot*log((real)ntot)/log(2.0);
2640 inc_nrnb(nrnb,eNR_FFT,2*npme);
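    /* Example: a 64x64x64 grid has ntot = 262144 points and log2(ntot) = 18,
     * so npme = 4718592 and 2*npme = 9437184 flop-count units are registered,
     * presumably covering the forward and the backward transform.
     */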
2643 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
2645 copy_fftgrid_to_pmegrid(pme,fftgrid,grid);
2647 /* distribute local grid to all nodes */
2648 #ifdef GMX_MPI
2649 if (pme->nnodes > 1) {
2650 GMX_BARRIER(cr->mpi_comm_mygroup);
2651 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
2653 #endif
2654 where();
2656 unwrap_periodic_pmegrid(pme,grid);
2658 /* interpolate forces for our local atoms */
2659 GMX_BARRIER(cr->mpi_comm_mygroup);
2660 GMX_MPE_LOG(ev_gather_f_bsplines_start);
2662 where();
2664 /* If we are running without parallelization,
2665 * atc->f is the actual force array, not a buffer,
2666 * therefore we should not clear it.
2668 bClearF = (q == 0 && PAR(cr));
2669 gather_f_bsplines(pme,grid,bClearF,&pme->atc[0],
2670 pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
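    /* With free energy, e.g. lambda = 0.25, the forces gathered from the
     * A grid enter with weight 0.75 and those from the B grid with weight
     * 0.25, matching the (1-lambda)/lambda mixing of the energies and the
     * virial further down.
     */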
2671 where();
2673 GMX_MPE_LOG(ev_gather_f_bsplines_finish);
2675 inc_nrnb(nrnb,eNR_GATHERFBSP,
2676 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
2677 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
2679 } /* of q-loop */
2681 if ((flags & GMX_PME_CALC_F) && pme->nnodes > 1) {
2682 wallcycle_start(wcycle,ewcPME_REDISTXF);
2683 for(d=0; d<pme->ndecompdim; d++)
2685 atc = &pme->atc[d];
2686 if (d == pme->ndecompdim - 1)
2688 n_d = homenr;
2689 f_d = f + start;
2691 else
2693 n_d = pme->atc[d+1].n;
2694 f_d = pme->atc[d+1].f;
2696 GMX_BARRIER(cr->mpi_comm_mygroup);
2697 if (DOMAINDECOMP(cr)) {
2698 dd_pmeredist_f(pme,atc,n_d,f_d,
2699 d==pme->ndecompdim-1 && pme->bPPnode);
2700 } else {
2701 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
2705 wallcycle_stop(wcycle,ewcPME_REDISTXF);
2707 where();
2709 if (!pme->bFEP) {
2710 *energy = energy_AB[0];
2711 m_add(vir,vir_AB[0],vir);
2712 } else {
2713 *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
2714 *dvdlambda += energy_AB[1] - energy_AB[0];
2715 for(i=0; i<DIM; i++)
2716 for(j=0; j<DIM; j++)
2717 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] + lambda*vir_AB[1][i][j];
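    /* Worked example: lambda = 0.25, energy_AB[0] = -100, energy_AB[1] = -90
     * (arbitrary units) gives *energy = 0.75*(-100) + 0.25*(-90) = -97.5 and
     * a dV/dlambda contribution of -90 - (-100) = 10.
     */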
2720 if (debug)
2721 fprintf(debug,"PME mesh energy: %g\n",*energy);
2723 return 0;