src/mdlib/fft5d.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *                This source code is part of
   4  *
   5  *                 G   R   O   M   A   C   S
   6  *
   7  *          GROningen MAchine for Chemical Simulations
   8  *
   9  *                        VERSION 4.5
  10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  12  * Copyright (c) 2001-2008, The GROMACS development team,
  13  * check out http://www.gromacs.org for more information.
  14
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * If you want to redistribute modifications, please consider that
  21  * scientific software is very special. Version control is crucial -
  22  * bugs must be traceable. We will be happy to consider code for
  23  * inclusion in the official distribution, but derived work must not
  24  * be called official GROMACS. Details are found in the README & COPYING
  25  * files - if they are missing, get the official version at www.gromacs.org.
  26  *
  27  * To help us fund GROMACS development, we humbly ask that you cite
  28  * the papers on the package - you can find them in the top README file.
  29  *
  30  * For more info, check our website at http://www.gromacs.org
  31  *
  32  * And Hey:
  33  * Groningen Machine for Chemical Simulation
  34  */
  35 #ifdef HAVE_CONFIG_H
  36 #include <config.h>
  37 #endif
  38
  39
  40 #include <stdio.h>
  41 #include <stdlib.h>
  42 #include <string.h>
  43
  44 #ifdef NOGMX
  45 #define GMX_PARALLEL_ENV_INITIALIZED 1
  46 #else
  47 #include "main.h"
  48 #define GMX_PARALLEL_ENV_INITIALIZED gmx_parallel_env_initialized()
  49 #endif
  50
  51 #ifdef GMX_LIB_MPI
  52 #include <mpi.h>
  53 #endif
  54 #ifdef GMX_THREADS
  55 #include "tmpi.h"
  56 #endif
  57
  58 #ifdef FFT5D_THREADS
  59 #include <omp.h>
  60 /* requires fftw compiled with openmp */
  61 #define FFT5D_FFTW_THREADS
  62 #endif
  63
  64 #include "fft5d.h"
  65 #include <float.h>
  66 #include <math.h>
  67 #include <assert.h>
  68
  69 #ifndef __FLT_EPSILON__
  70 #define __FLT_EPSILON__ FLT_EPSILON
  71 #define __DBL_EPSILON__ DBL_EPSILON
  72 #endif
  73
  74 #ifdef NOGMX
  75 FILE* debug=0;
  76 #endif
  77
  78 #include "gmx_fatal.h"
  79
  80
  81 #ifdef GMX_FFT_FFTW3
  82 #ifdef GMX_THREADS
  83 /* none of the fftw3 calls, except execute(), are thread-safe, so
  84    we need to serialize them with this mutex. */
  85 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
  86
  87 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex);
  88 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex);
  89 #else /* GMX_THREADS */
  90 #define FFTW_LOCK
  91 #define FFTW_UNLOCK
  92 #endif /* GMX_THREADS */
  93 #endif /* GMX_FFT_FFTW3 */
  94
  95 static double fft5d_fmax(double a, double b){
  96         return (a>b)?a:b;
  97 }
  98
  99 /* largest factor smaller than sqrt */
 100 static int lfactor(int z) {
 101         int i;
 102         for (i=sqrt(z);;i--)
 103                 if (z%i==0) return i;
 104 }
 105
 106 /* largest factor */
 107 static int l2factor(int z) {
 108         int i;
 109         if (z==1) return 1;
 110         for (i=z/2;;i--)
 111                 if (z%i==0) return i;
 112 }
 113
 114 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
 115 static int lpfactor(int z) {
 116         int f = l2factor(z);
 117         if (f==1) return z;
 118         return fft5d_fmax(lpfactor(f),lpfactor(z/f));
 119 }
 120
 121 #ifndef GMX_MPI
 122 #ifdef HAVE_GETTIMEOFDAY
 123 #include <sys/time.h>
 124 double MPI_Wtime() {
 125     struct timeval tv;
 126     gettimeofday(&tv,0);
 127     return tv.tv_sec+tv.tv_usec*1e-6;
 128 }
 129 #else
 130 double MPI_Wtime() {
 131     return 0.0;
 132 }
 133 #endif
 134 #endif
 135
 136 static int vmax(int* a, int s) {
 137     int i,max=0;
 138     for (i=0;i<s;i++)
 139     {
 140         if (a[i]>max) max=a[i];
 141     }
 142     return max;
 143 }
 144
 145 /*
 146 copied here from fftgrid, because:
 147 1. function there not publically available
 148 2. not sure whether we keep fftgrid
 149 3. less dependencies for fft5d
 150
 151 Only used for non-fftw case
 152 */
 153 static void *
 154 gmx_calloc_aligned(size_t size)
 155 {
 156     void *p0,*p;
 157
 158     /*We initialize by zero for Valgrind
 159       For non-divisible case we communicate more than the data.
 160       If we don't initialize the data we communicate uninitialized data*/
 161     p0 = calloc(size+32,1);
 162
 163     if(p0 == NULL)
 164     {
 165         gmx_fatal(FARGS,"Failed to allocated %u bytes of aligned memory.",size+32);
 166     }
 167
 168     p = (void *) (((size_t) p0 + 32) & (~((size_t) 31)));
 169
 170     /* Yeah, yeah, we cannot free this pointer, but who cares... */
 171     return p;
 172 }
 173
 174
 175 /* NxMxK the size of the data
 176  * comm communicator to use for fft5d
 177  * P0 number of processor in 1st axes (can be null for automatic)
 178  * lin is allocated by fft5d because size of array is only known after planning phase */
 179 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout)
 180 {
 181
 182     int P[2],bMaster,prank[2],i;
 183     int rNG,rMG,rKG;
 184     int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
 185     int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
 186     int C[3],rC[3],nP[2];
 187     int lsize;
 188     t_complex *lin=0,*lout=0;
 189     fft5d_plan plan;
 190     int s;
 191
 192     /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
 193 #ifdef GMX_MPI
 194     if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != 0)
 195     {
 196         MPI_Comm_size(comm[0],&P[0]);
 197         MPI_Comm_rank(comm[0],&prank[0]);
 198     }
 199     else
 200 #endif
 201     {
 202         P[0] = 1;
 203         prank[0] = 0;
 204     }
 205 #ifdef GMX_MPI
 206     if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != 0)
 207     {
 208         MPI_Comm_size(comm[1],&P[1]);
 209         MPI_Comm_rank(comm[1],&prank[1]);
 210     }
 211     else
 212 #endif
 213     {
 214         P[1] = 1;
 215         prank[1] = 0;
 216     }
 217
 218     bMaster=(prank[0]==0&&prank[1]==0);
 219
 220
 221     if (debug)
 222     {
 223         fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
 224                 P[0],P[1],prank[0],prank[1]);
 225     }
 226
 227     if (bMaster) {
 228         if (debug)
 229             fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
 230                 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
 231         /* The check below is not correct, one prime factor 11 or 13 is ok.
 232         if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
 233             printf("WARNING: FFT very slow with prime factors larger 7\n");
 234             printf("Change FFT size or in case you cannot change it look at\n");
 235             printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
 236         }
 237         */
 238     }
 239
 240     if (NG==0 || MG==0 || KG==0) {
 241         if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
 242         return 0;
 243     }
 244
 245     rNG=NG;rMG=MG;rKG=KG;
 246
 247     if (flags&FFT5D_REALCOMPLEX) {
 248         if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
 249         else {
 250             if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
 251             else KG=KG/2+1;
 252         }
 253     }
 254
 255
 256     /*for transpose we need to know the size for each processor not only our own size*/
 257
 258     N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
 259     M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
 260     K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
 261     oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
 262     oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
 263     oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
 264
 265     for (i=0;i<P[0];i++)
 266     {
 267         #define EVENDIST
 268         #ifndef EVENDIST
 269         oN0[i]=i*ceil((double)NG/P[0]);
 270         oM0[i]=i*ceil((double)MG/P[0]);
 271         oK0[i]=i*ceil((double)KG/P[0]);
 272         #else
 273         oN0[i]=(NG*i)/P[0];
 274         oM0[i]=(MG*i)/P[0];
 275         oK0[i]=(KG*i)/P[0];
 276         #endif
 277     }
 278     for (i=0;i<P[1];i++)
 279     {
 280         #ifndef EVENDIST
 281         oN1[i]=i*ceil((double)NG/P[1]);
 282         oM1[i]=i*ceil((double)MG/P[1]);
 283         oK1[i]=i*ceil((double)KG/P[1]);
 284         #else
 285         oN1[i]=(NG*i)/P[1];
 286         oM1[i]=(MG*i)/P[1];
 287         oK1[i]=(KG*i)/P[1];
 288         #endif
 289     }
 290     for (i=0;i<P[0]-1;i++)
 291     {
 292         N0[i]=oN0[i+1]-oN0[i];
 293         M0[i]=oM0[i+1]-oM0[i];
 294         K0[i]=oK0[i+1]-oK0[i];
 295     }
 296     N0[P[0]-1]=NG-oN0[P[0]-1];
 297     M0[P[0]-1]=MG-oM0[P[0]-1];
 298     K0[P[0]-1]=KG-oK0[P[0]-1];
 299     for (i=0;i<P[1]-1;i++)
 300     {
 301         N1[i]=oN1[i+1]-oN1[i];
 302         M1[i]=oM1[i+1]-oM1[i];
 303         K1[i]=oK1[i+1]-oK1[i];
 304     }
 305     N1[P[1]-1]=NG-oN1[P[1]-1];
 306     M1[P[1]-1]=MG-oM1[P[1]-1];
 307     K1[P[1]-1]=KG-oK1[P[1]-1];
 308
 309     /* for step 1-3 the local N,M,K sizes of the transposed system
 310        C: contiguous dimension, and nP: number of processor in subcommunicator
 311        for that step */
 312
 313
 314     pM[0] = M0[prank[0]];
 315     oM[0] = oM0[prank[0]];
 316     pK[0] = K1[prank[1]];
 317     oK[0] = oK1[prank[1]];
 318     C[0] = NG;
 319     rC[0] = rNG;
 320     if (!(flags&FFT5D_ORDER_YZ)) {
 321         N[0] = vmax(N1,P[1]);
 322         M[0] = M0[prank[0]];
 323         K[0] = vmax(K1,P[1]);
 324         pN[0] = N1[prank[1]];
 325         iNout[0] = N1;
 326         oNout[0] = oN1;
 327         nP[0] = P[1];
 328         C[1] = KG;
 329         rC[1] =rKG;
 330         N[1] = vmax(K0,P[0]);
 331         pN[1] = K0[prank[0]];
 332         iNin[1] = K1;
 333         oNin[1] = oK1;
 334         iNout[1] = K0;
 335         oNout[1] = oK0;
 336         M[1] = vmax(M0,P[0]);
 337         pM[1] = M0[prank[0]];
 338         oM[1] = oM0[prank[0]];
 339         K[1] = N1[prank[1]];
 340         pK[1] = N1[prank[1]];
 341         oK[1] = oN1[prank[1]];
 342         nP[1] = P[0];
 343         C[2] = MG;
 344         rC[2] = rMG;
 345         iNin[2] = M0;
 346         oNin[2] = oM0;
 347         M[2] = vmax(K0,P[0]);
 348         pM[2] = K0[prank[0]];
 349         oM[2] = oK0[prank[0]];
 350         K[2] = vmax(N1,P[1]);
 351         pK[2] = N1[prank[1]];
 352         oK[2] = oN1[prank[1]];
 353         free(N0); free(oN0); /*these are not used for this order*/
 354         free(M1); free(oM1); /*the rest is freed in destroy*/
 355     } else {
 356         N[0] = vmax(N0,P[0]);
 357         M[0] = vmax(M0,P[0]);
 358         K[0] = K1[prank[1]];
 359         pN[0] = N0[prank[0]];
 360         iNout[0] = N0;
 361         oNout[0] = oN0;
 362         nP[0] = P[0];
 363         C[1] = MG;
 364         rC[1] =rMG;
 365         N[1] = vmax(M1,P[1]);
 366         pN[1] = M1[prank[1]];
 367         iNin[1] = M0;
 368         oNin[1] = oM0;
 369         iNout[1] = M1;
 370         oNout[1] = oM1;
 371         M[1] = N0[prank[0]];
 372         pM[1] = N0[prank[0]];
 373         oM[1] = oN0[prank[0]];
 374         K[1] = vmax(K1,P[1]);
 375         pK[1] = K1[prank[1]];
 376         oK[1] = oK1[prank[1]];
 377         nP[1] = P[1];
 378         C[2] = KG;
 379         rC[2] = rKG;
 380         iNin[2] = K1;
 381         oNin[2] = oK1;
 382         M[2] = vmax(N0,P[0]);
 383         pM[2] = N0[prank[0]];
 384         oM[2] = oN0[prank[0]];
 385         K[2] = vmax(M1,P[1]);
 386         pK[2] = M1[prank[1]];
 387         oK[2] = oM1[prank[1]];
 388         free(N1); free(oN1); /*these are not used for this order*/
 389         free(K0); free(oK0); /*the rest is freed in destroy*/
 390     }
 391
 392     /*
 393       Difference between x-y-z regarding 2d decomposition is whether they are
 394       distributed along axis 1, 2 or both
 395     */
 396
 397     /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
 398     lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
 399     /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
 400     if (!(flags&FFT5D_NOMALLOC)) {
 401         lin = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
 402         lout = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
 403     } else {
 404         lin = *rlin;
 405         lout = *rlout;
 406     }
 407
 408     plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
 409
 410
 411 #ifdef FFT5D_THREADS
 412 #ifdef FFT5D_FFTW_THREADS
 413     FFTW(init_threads)();
 414     int nthreads;
 415     #pragma omp parallel
 416     {
 417         #pragma omp master
 418         {
 419             nthreads = omp_get_num_threads();
 420         }
 421     }
 422     if (prank[0] == 0 && prank[1] == 0)
 423     {
 424         printf("Running fftw on %d threads\n",nthreads);
 425     }
 426     FFTW(plan_with_nthreads)(nthreads);
 427 #endif
 428 #endif
 429
 430 #ifdef GMX_FFT_FFTW3  /*if not FFTW - then we don't do a 3d plan but insead only 1D plans */
 431     if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1))) {  /*don't do 3d plan in parallel or if in_place requested */
 432             int fftwflags=FFTW_DESTROY_INPUT;
 433             fftw_iodim dims[3];
 434             int inNG=NG,outMG=MG,outKG=KG;
 435
 436             FFTW_LOCK;
 437             if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
 438             if (flags&FFT5D_REALCOMPLEX) {
 439                 if (!(flags&FFT5D_BACKWARD)) {  /*input pointer is not complex*/
 440                     inNG*=2;
 441                 } else {                        /*output pointer is not complex*/
 442                     if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
 443                     else outKG*=2;
 444                 }
 445             }
 446
 447             if (!(flags&FFT5D_BACKWARD)) {
 448                 dims[0].n  = KG;
 449                 dims[1].n  = MG;
 450                 dims[2].n  = rNG;
 451
 452                 dims[0].is = inNG*MG;     /*N M K*/
 453                 dims[1].is = inNG;
 454                 dims[2].is = 1;
 455                 if (!(flags&FFT5D_ORDER_YZ)) {
 456                     dims[0].os = MG;       /*M K N*/
 457                     dims[1].os = 1;
 458                     dims[2].os = MG*KG;
 459                 } else  {
 460                     dims[0].os = 1;       /*K N M*/
 461                     dims[1].os = KG*NG;
 462                     dims[2].os = KG;
 463                 }
 464             } else {
 465                 if (!(flags&FFT5D_ORDER_YZ)) {
 466                     dims[0].n  = NG;
 467                     dims[1].n  = KG;
 468                     dims[2].n  = rMG;
 469
 470                     dims[0].is = 1;
 471                     dims[1].is = NG*MG;
 472                     dims[2].is = NG;
 473
 474                     dims[0].os = outMG*KG;
 475                     dims[1].os = outMG;
 476                     dims[2].os = 1;
 477                 } else {
 478                     dims[0].n  = MG;
 479                     dims[1].n  = NG;
 480                     dims[2].n  = rKG;
 481
 482                     dims[0].is = NG;
 483                     dims[1].is = 1;
 484                     dims[2].is = NG*MG;
 485
 486                     dims[0].os = outKG*NG;
 487                     dims[1].os = outKG;
 488                     dims[2].os = 1;
 489                 }
 490             }
 491             if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
 492                 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
 493                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 494                                      (real*)lin, (FFTW(complex) *)lout,
 495                                      /*flags*/ fftwflags);
 496             } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
 497                 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
 498                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 499                                      (FFTW(complex) *)lin, (real*)lout,
 500                                      /*flags*/ fftwflags);
 501             } else {
 502                 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
 503                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 504                                      (FFTW(complex) *)lin, (FFTW(complex) *)lout,
 505                                      /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
 506             }
 507             FFTW_UNLOCK;
 508     }
 509     if (!plan->p3d) {  /* for decomposition and if 3d plan did not work */
 510 #endif /* GMX_FFT_FFTW3 */
 511         for (s=0;s<3;s++) {
 512             if (debug)
 513             {
 514                 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
 515                         s,rC[s],M[s],pK[s],C[s],lsize);
 516             }
 517             if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
 518                 gmx_fft_init_many_1d_real( &plan->p1d[s], rC[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 519             } else {
 520                 gmx_fft_init_many_1d     ( &plan->p1d[s],  C[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 521             }
 522         }
 523 #ifdef GMX_FFT_FFTW3
 524     }
 525 #endif
 526     if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
 527         plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
 528     } else {
 529         plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
 530     }
 531 #ifdef FFT5D_MPI_TRANSPOSE
 532     FFTW_LOCK
 533     for (s=0;s<2;s++) {
 534         if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
 535             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
 536         else
 537             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
 538     }
 539     FFTW_UNLOCK
 540 #endif
 541
 542
 543     plan->lin=lin;
 544     plan->lout=lout;
 545
 546     plan->NG=NG;plan->MG=MG;plan->KG=KG;
 547
 548     for (s=0;s<3;s++) {
 549         plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
 550         plan->oM[s]=oM[s];plan->oK[s]=oK[s];
 551         plan->C[s]=C[s];plan->rC[s]=rC[s];
 552         plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
 553     }
 554     for (s=0;s<2;s++) {
 555         plan->P[s]=nP[s];plan->coor[s]=prank[s];
 556     }
 557
 558 /*    plan->fftorder=fftorder;
 559     plan->direction=direction;
 560     plan->realcomplex=realcomplex;
 561 */
 562     plan->flags=flags;
 563     *rlin=lin;
 564     *rlout=lout;
 565     return plan;
 566 }
 567
 568
 569 enum order {
 570     XYZ,
 571     XZY,
 572     YXZ,
 573     YZX,
 574     ZXY,
 575     ZYX
 576 };
 577
 578
 579
 580 /*here x,y,z and N,M,K is in rotated coordinate system!!
 581   x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
 582   maxN,maxM,maxK is max size of local data
 583   pN, pM, pK is local size specific to current processor (only different to max if not divisible)
 584   NG, MG, KG is size of global data*/
 585 static void splitaxes(t_complex* lout,const t_complex* lin,
 586                       int maxN,int maxM,int maxK, int pN, int pM, int pK,
 587                       int P,int NG,int *N, int* oN)
 588 {
 589     int x,y,z,i;
 590     int in_i,out_i,in_z,out_z,in_y,out_y;
 591
 592 #ifdef FFT5D_THREADS
 593     int zi;
 594
 595     /* In the thread parallel case we want to loop over z and i
 596      * in a single for loop to allow for better load balancing.
 597      */
 598 #pragma omp parallel for private(z,in_z,out_z,i,in_i,out_i,y,in_y,out_y,x) schedule(static)
 599     for (zi=0; zi<pK*P; zi++)
 600     {
 601         z = zi/P;
 602         i = zi - z*P;
 603 #else
 604     for (z=0; z<pK; z++) /*3. z l*/
 605     {
 606 #endif
 607         in_z  = z*maxN*maxM;
 608         out_z = z*NG*pM;
 609
 610 #ifndef FFT5D_THREADS
 611         for (i=0; i<P; i++) /*index cube along long axis*/
 612 #endif
 613         {
 614             in_i  = in_z  + i*maxN*maxM*maxK;
 615             out_i = out_z + oN[i];
 616             for (y=0;y<pM;y++) { /*2. y k*/
 617                 in_y  = in_i  + y*maxN;
 618                 out_y = out_i + y*NG;
 619                 for (x=0;x<N[i];x++) { /*1. x j*/
 620                     lout[in_y+x] = lin[out_y+x];
 621                     /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
 622                     /*before split data contiguos - thus if different processor get different amount oN is different*/
 623                 }
 624             }
 625         }
 626     }
 627 }
 628
 629 /*make axis contiguous again (after AllToAll) and also do local transpose*/
 630 /*transpose mayor and major dimension
 631   variables see above
 632   the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
 633   N,M,K local dimensions
 634   KG global size*/
 635 static void joinAxesTrans13(t_complex* lin,const t_complex* lout,
 636                             int maxN,int maxM,int maxK,int pN, int pM, int pK,
 637                             int P,int KG, int* K, int* oK)
 638 {
 639     int i,x,y,z;
 640     int in_i,out_i,in_x,out_x,in_z,out_z;
 641
 642 #ifdef FFT5D_THREADS
 643     int xi;
 644
 645     /* In the thread parallel case we want to loop over x and i
 646      * in a single for loop to allow for better load balancing.
 647      */
 648 #pragma omp parallel for private(x,in_x,out_x,i,in_i,out_i,z,in_z,out_z,y) schedule(static)
 649     for (xi=0; xi<pN*P; xi++)
 650     {
 651         x = xi/P;
 652         i = xi - x*P;
 653 #else
 654     for (x=0;x<pN;x++) /*1.j*/
 655     {
 656 #endif
 657         in_x  = x*KG*pM;
 658         out_x = x;
 659
 660 #ifndef FFT5D_THREADS
 661         for (i=0;i<P;i++) /*index cube along long axis*/
 662 #endif
 663         {
 664             in_i  = in_x  + oK[i];
 665             out_i = out_x + i*maxM*maxN*maxK;
 666             for (z=0;z<K[i];z++) /*3.l*/
 667             {
 668                 in_z  = in_i  + z;
 669                 out_z = out_i + z*maxM*maxN;
 670                 for (y=0;y<pM;y++) { /*2.k*/
 671                     lin[in_z+y*KG] = lout[out_z+y*maxN];
 672                 }
 673             }
 674         }
 675     }
 676 }
 677
 678 /*make axis contiguous again (after AllToAll) and also do local transpose
 679   tranpose mayor and middle dimension
 680   variables see above
 681   the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
 682   N,M,K local size
 683   MG, global size*/
 684 static void joinAxesTrans12(t_complex* lin,const t_complex* lout,int maxN,int maxM,int maxK,int pN, int pM, int pK,
 685                 int P,int MG, int* M, int* oM) {
 686     int i,z,y,x;
 687     int in_i,out_i,in_z,out_z,in_x,out_x;
 688
 689 #ifdef FFT5D_THREADS
 690     int zi;
 691
 692     /* In the thread parallel case we want to loop over z and i
 693      * in a single for loop to allow for better load balancing.
 694      */
 695 #pragma omp parallel for private(i,in_i,out_i,z,in_z,out_z,in_x,out_x,x,y) schedule(static)
 696     for (zi=0; zi<pK*P; zi++)
 697     {
 698         z = zi/P;
 699         i = zi - z*P;
 700 #else
 701     for (z=0; z<pK; z++)
 702     {
 703 #endif
 704         in_z  = z*MG*pN;
 705         out_z = z*maxM*maxN;
 706
 707 #ifndef FFT5D_THREADS
 708         for (i=0; i<P; i++) /*index cube along long axis*/
 709 #endif
 710         {
 711             in_i  = in_z  + oM[i];
 712             out_i = out_z + i*maxM*maxN*maxK;
 713             for (x=0;x<pN;x++) {
 714                 in_x  = in_i  + x*MG;
 715                 out_x = out_i + x;
 716                 for (y=0;y<M[i];y++) {
 717                     lin[in_x+y] = lout[out_x+y*maxN];
 718                 }
 719             }
 720         }
 721     }
 722 }
 723
 724
 725 static void rotate(int x[]) {
 726     int t=x[0];
 727 /*    x[0]=x[2];
 728     x[2]=x[1];
 729     x[1]=t;*/
 730     x[0]=x[1];
 731     x[1]=x[2];
 732     x[2]=t;
 733 }
 734
 735 /*compute the offset to compare or print transposed local data in original input coordinates
 736   xs matrix dimension size, xl dimension length, xc decomposition offset
 737   s: step in computation = number of transposes*/
 738 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
 739 /*    int direction = plan->direction;
 740     int fftorder = plan->fftorder;*/
 741
 742     int o;
 743     int pos[3],i;
 744     int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
 745         *C=plan->C, *rC=plan->rC;
 746
 747     NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
 748
 749     if (!(plan->flags&FFT5D_ORDER_YZ)) {
 750         switch (s) {
 751         case 0: o=XYZ; break;
 752         case 1: o=ZYX; break;
 753         case 2: o=YZX; break;
 754         default: assert(0);
 755         }
 756     } else {
 757         switch (s) {
 758         case 0: o=XYZ; break;
 759         case 1: o=YXZ; break;
 760         case 2: o=ZXY; break;
 761         default: assert(0);
 762         }
 763     }
 764
 765     switch (o) {
 766         case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
 767         case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
 768         case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
 769         case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
 770         case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
 771         case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
 772     }
 773     /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
 774
 775     /*xs, xl give dimension size and data length in local transposed coordinate system
 776       for 0(/1/2): x(/y/z) in original coordinate system*/
 777     for (i=0;i<3;i++) {
 778         switch (pos[i]) {
 779         case 1: xs[i]=1;         xc[i]=0;     xl[i]=C[s];break;
 780         case 2: xs[i]=C[s];      xc[i]=oM[s]; xl[i]=pM[s];break;
 781         case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
 782         }
 783     }
 784     /*input order is different for test program to match FFTW order
 785       (important for complex to real)*/
 786     if (plan->flags&FFT5D_BACKWARD) {
 787         rotate(xs);
 788         rotate(xl);
 789         rotate(xc);
 790         rotate(NG);
 791         if (plan->flags&FFT5D_ORDER_YZ) {
 792             rotate(xs);
 793             rotate(xl);
 794             rotate(xc);
 795             rotate(NG);
 796         }
 797     }
 798     if (plan->flags&FFT5D_REALCOMPLEX && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || (plan->flags&FFT5D_BACKWARD && s==2))) {
 799         xl[0] = rC[s];
 800     }
 801 }
 802
 803 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
 804     int x,y,z,l;
 805     int *coor = plan->coor;
 806     int xs[3],xl[3],xc[3],NG[3];
 807     int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
 808     compute_offsets(plan,xs,xl,xc,NG,s);
 809     fprintf(debug,txt,coor[0],coor[1],s);
 810     /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
 811     for(z=0;z<xl[2];z++) {
 812         for(y=0;y<xl[1];y++) {
 813             fprintf(debug,"%d %d: ",coor[0],coor[1]);
 814             for (x=0;x<xl[0];x++) {
 815                 for (l=0;l<ll;l++) {
 816                     fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
 817                 }
 818                 fprintf(debug,",");
 819             }
 820             fprintf(debug,"\n");
 821         }
 822     }
 823 }
 824
 825 void fft5d_execute(fft5d_plan plan,fft5d_time times) {
 826     t_complex *lin = plan->lin;
 827     t_complex *lout = plan->lout;
 828
 829     gmx_fft_t *p1d=plan->p1d;
 830 #ifdef FFT5D_MPI_TRANSPOSE
 831     FFTW(plan) *mpip=plan->mpip;
 832 #endif
 833 #ifdef GMX_MPI
 834     MPI_Comm *cart=plan->cart;
 835 #endif
 836
 837     double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
 838     int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
 839         *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
 840     int s=0;
 841
 842
 843 #ifdef GMX_FFT_FFTW3
 844     if (plan->p3d) {
 845         if (times!=0)
 846             time=MPI_Wtime();
 847         FFTW(execute)(plan->p3d);
 848         if (times!=0)
 849             times->fft+=MPI_Wtime()-time;
 850         return;
 851     }
 852 #endif
 853
 854     /*lin: x,y,z*/
 855     if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: copy in lin\n", s, plan);
 856     for (s=0;s<2;s++) {
 857         if (times!=0)
 858             time=MPI_Wtime();
 859
 860         if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0) {
 861             gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
 862         } else {
 863             gmx_fft_many_1d(     p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin,lout);
 864         }
 865         if (times!=0)
 866             time_fft+=MPI_Wtime()-time;
 867
 868         if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
 869
 870 #ifdef GMX_MPI
 871         if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] !=0 && P[s]>1 )
 872         {
 873             if (times!=0)
 874                 time=MPI_Wtime();
 875             /*prepare for AllToAll
 876               1. (most outer) axes (x) is split into P[s] parts of size N[s]
 877               for sending*/
 878             splitaxes(lin,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s]);
 879
 880             if (times!=0)
 881             {
 882                 time_local+=MPI_Wtime()-time;
 883
 884                 /*send, recv*/
 885                 time=MPI_Wtime();
 886             }
 887
 888 #ifdef FFT5D_MPI_TRANSPOSE
 889             FFTW(execute)(mpip[s]);
 890 #else
 891             if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
 892                 MPI_Alltoall(lin,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 893             else
 894                 MPI_Alltoall(lin,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 895 #endif /*FFT5D_MPI_TRANSPOSE*/
 896             if (times!=0)
 897                 time_mpi[s]=MPI_Wtime()-time;
 898         }
 899 #endif /*GMX_MPI*/
 900
 901
 902         if (times!=0)
 903             time=MPI_Wtime();
 904         /*bring back in matrix form
 905           thus make  new 1. axes contiguos
 906           also local transpose 1 and 2/3 */
 907         if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
 908             joinAxesTrans13(lin,lout,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
 909         else
 910             joinAxesTrans12(lin,lout,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
 911         if (times!=0)
 912             time_local+=MPI_Wtime()-time;
 913
 914         if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
 915
 916         /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
 917     }
 918
 919     if (times!=0)
 920         time=MPI_Wtime();
 921     if (plan->flags&FFT5D_INPLACE) lout=lin;
 922     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
 923         gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
 924     } else {
 925         gmx_fft_many_1d(     p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin,lout);
 926     }
 927
 928     if (times!=0)
 929         time_fft+=MPI_Wtime()-time;
 930     if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
 931     /*if (debug) print_localdata(lout, "%d %d: FFT in y\n", N1, M, K0, YZX, coor);*/
 932
 933     if (times!=0) {
 934         times->fft+=time_fft;
 935         times->local+=time_local;
 936         times->mpi2+=time_mpi[1];
 937         times->mpi1+=time_mpi[0];
 938     }
 939 }
 940
 941 void fft5d_destroy(fft5d_plan plan) {
 942     int s;
 943     for (s=0;s<3;s++) {
 944         gmx_many_fft_destroy(plan->p1d[s]);
 945         if (plan->iNin[s]) {
 946             free(plan->iNin[s]);
 947             plan->iNin[s]=0;
 948         }
 949         if (plan->oNin[s]) {
 950             free(plan->oNin[s]);
 951             plan->oNin[s]=0;
 952         }
 953         if (plan->iNout[s]) {
 954             free(plan->iNout[s]);
 955             plan->iNout[s]=0;
 956         }
 957         if (plan->oNout[s]) {
 958             free(plan->oNout[s]);
 959             plan->oNout[s]=0;
 960         }
 961     }
 962 #ifdef GMX_FFT_FFTW3
 963     FFTW_LOCK;
 964 #ifdef FFT5D_MPI_TRANSPOS
 965     for (s=0;s<2;s++)
 966         FFTW(destroy_plan)(plan->mpip[s]);
 967 #endif /* FFT5D_MPI_TRANSPOS */
 968 #endif /* GMX_FFT_FFTW3 */
 969
 970     /*We can't free lin/lout here - is allocated by gmx_calloc_aligned which can't be freed*/
 971
 972
 973 #ifdef FFT5D_THREADS
 974 #ifdef FFT5D_FFTW_THREADS
 975     FFTW(cleanup_threads)();
 976 #endif
 977 #endif
 978
 979     free(plan);
 980 }
 981
 982 /*Is this better than direct access of plan? enough data?
 983   here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
 984 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
 985     *N1=plan->N[0];
 986     *M0=plan->M[0];
 987     *K1=plan->K[0];
 988     *K0=plan->N[1];
 989
 990     *coor=plan->coor;
 991 }
 992
 993
 994 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
 995   of processor dimensions*/
 996 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout) {
 997     MPI_Comm cart[2]={0};
 998 #ifdef GMX_MPI
 999     int size=1,prank=0;
1000     int P[2];
1001     int coor[2];
1002     int wrap[]={0,0};
1003     MPI_Comm gcart;
1004     int rdim1[] = {0,1}, rdim2[] = {1,0};
1005
1006     MPI_Comm_size(comm,&size);
1007     MPI_Comm_rank(comm,&prank);
1008
1009     if (P0==0) P0 = lfactor(size);
1010     if (size%P0!=0) {
1011         if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1012         P0 = lfactor(size);
1013     }
1014
1015     P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1016
1017     /*Difference between x-y-z regarding 2d decomposition is whether they are
1018       distributed along axis 1, 2 or both*/
1019
1020     MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1021     MPI_Cart_get(gcart,2,P,wrap,coor);
1022     MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1023     MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1024 #endif
1025     return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout);
1026 }
1027
1028
1029
1030 /*prints in original coordinate system of data (as the input to FFT)*/
1031 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1032     int xs[3],xl[3],xc[3],NG[3];
1033     int x,y,z,l;
1034     int *coor = plan->coor;
1035     int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1036     if (plan->flags&FFT5D_REALCOMPLEX && plan->flags&FFT5D_BACKWARD)
1037     {
1038         ll=1;
1039     }
1040
1041     compute_offsets(plan,xs,xl,xc,NG,2);
1042     if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1043     for (z=0;z<xl[2];z++) {
1044         for(y=0;y<xl[1];y++) {
1045             if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1046             for (x=0;x<xl[0];x++) {
1047                 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1048                     real a,b;
1049                     a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1050                     if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1051                     if (!bothLocal)
1052                         b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1053                     else
1054                         b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1055                     if (plan->flags&FFT5D_DEBUG) {
1056                         printf("%f %f, ",a,b);
1057                     } else {
1058                         if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1059                             printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1060                         }
1061 /*                        assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1062                     }
1063                 }
1064                 if (plan->flags&FFT5D_DEBUG) printf(",");
1065             }
1066             if (plan->flags&FFT5D_DEBUG) printf("\n");
1067         }
1068     }
1069
1070 }
1071