src/mdlib/fft5d.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  * $Id: gmx_matrix.c,v 1.4 2008/12/02 18:27:57 spoel Exp $
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  *                        VERSION 4.5
  11  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  12  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  13  * Copyright (c) 2001-2008, The GROMACS development team,
  14  * check out http://www.gromacs.org for more information.
  15
  16  * This program is free software; you can redistribute it and/or
  17  * modify it under the terms of the GNU General Public License
  18  * as published by the Free Software Foundation; either version 2
  19  * of the License, or (at your option) any later version.
  20  *
  21  * If you want to redistribute modifications, please consider that
  22  * scientific software is very special. Version control is crucial -
  23  * bugs must be traceable. We will be happy to consider code for
  24  * inclusion in the official distribution, but derived work must not
  25  * be called official GROMACS. Details are found in the README & COPYING
  26  * files - if they are missing, get the official version at www.gromacs.org.
  27  *
  28  * To help us fund GROMACS development, we humbly ask that you cite
  29  * the papers on the package - you can find them in the top README file.
  30  *
  31  * For more info, check our website at http://www.gromacs.org
  32  *
  33  * And Hey:
  34  * Groningen Machine for Chemical Simulation
  35  */
  36 #ifdef HAVE_CONFIG_H
  37 #include <config.h>
  38 #endif
  39
  40
  41 #include <stdio.h>
  42 #include <stdlib.h>
  43 #include <string.h>
  44
  45 #ifdef NOGMX
  46 #define GMX_PARALLEL_ENV_INITIALIZED 1
  47 #else
  48 #include "main.h"
  49 #define GMX_PARALLEL_ENV_INITIALIZED gmx_parallel_env_initialized()
  50 #endif
  51
  52 #ifdef GMX_LIB_MPI
  53 #include <mpi.h>
  54 #endif
  55 #ifdef GMX_THREADS
  56 #include "tmpi.h"
  57 #endif
  58
  59 #ifdef FFT5D_THREADS
  60 #include <omp.h>
  61 /* requires fftw compiled with openmp */
  62 #define FFT5D_FFTW_THREADS
  63 #endif
  64
  65 #include "fft5d.h"
  66 #include <float.h>
  67 #include <math.h>
  68 #include <assert.h>
  69
  70 #ifndef __FLT_EPSILON__
  71 #define __FLT_EPSILON__ FLT_EPSILON
  72 #define __DBL_EPSILON__ DBL_EPSILON
  73 #endif
  74
  75 #ifdef NOGMX
  76 FILE* debug=0;
  77 #endif
  78
  79 #include "gmx_fatal.h"
  80
  81
  82 #ifdef GMX_FFT_FFTW3
  83 #ifdef GMX_THREADS
  84 /* none of the fftw3 calls, except execute(), are thread-safe, so
  85    we need to serialize them with this mutex. */
  86 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
  87
  88 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex);
  89 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex);
  90 #else /* GMX_THREADS */
  91 #define FFTW_LOCK
  92 #define FFTW_UNLOCK
  93 #endif /* GMX_THREADS */
  94 #endif /* GMX_FFT_FFTW3 */
  95
  96 static double fft5d_fmax(double a, double b){
  97         return (a>b)?a:b;
  98 }
  99
 100 /* largest factor smaller than sqrt */
 101 static int lfactor(int z) {
 102         int i;
 103         for (i=sqrt(z);;i--)
 104                 if (z%i==0) return i;
 105 }
 106
 107 /* largest factor */
 108 static int l2factor(int z) {
 109         int i;
 110         if (z==1) return 1;
 111         for (i=z/2;;i--)
 112                 if (z%i==0) return i;
 113 }
 114
 115 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
 116 static int lpfactor(int z) {
 117         int f = l2factor(z);
 118         if (f==1) return z;
 119         return fft5d_fmax(lpfactor(f),lpfactor(z/f));
 120 }
 121
 122 #ifndef GMX_MPI
 123 #ifdef HAVE_GETTIMEOFDAY
 124 #include <sys/time.h>
 125 double MPI_Wtime() {
 126     struct timeval tv;
 127     gettimeofday(&tv,0);
 128     return tv.tv_sec+tv.tv_usec*1e-6;
 129 }
 130 #else
 131 double MPI_Wtime() {
 132     return 0.0;
 133 }
 134 #endif
 135 #endif
 136
 137 static int vmax(int* a, int s) {
 138     int i,max=0;
 139     for (i=0;i<s;i++)
 140     {
 141         if (a[i]>max) max=a[i];
 142     }
 143     return max;
 144 }
 145
 146 /*
 147 copied here from fftgrid, because:
 148 1. function there not publically available
 149 2. not sure whether we keep fftgrid
 150 3. less dependencies for fft5d
 151
 152 Only used for non-fftw case
 153 */
 154 static void *
 155 gmx_calloc_aligned(size_t size)
 156 {
 157     void *p0,*p;
 158
 159     /*We initialize by zero for Valgrind
 160       For non-divisible case we communicate more than the data.
 161       If we don't initialize the data we communicate uninitialized data*/
 162     p0 = calloc(size+32,1);
 163
 164     if(p0 == NULL)
 165     {
 166         gmx_fatal(FARGS,"Failed to allocated %u bytes of aligned memory.",size+32);
 167     }
 168
 169     p = (void *) (((size_t) p0 + 32) & (~((size_t) 31)));
 170
 171     /* Yeah, yeah, we cannot free this pointer, but who cares... */
 172     return p;
 173 }
 174
 175
 176 /* NxMxK the size of the data
 177  * comm communicator to use for fft5d
 178  * P0 number of processor in 1st axes (can be null for automatic)
 179  * lin is allocated by fft5d because size of array is only known after planning phase */
 180 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout)
 181 {
 182
 183     int P[2],bMaster,prank[2],i;
 184     int rNG,rMG,rKG;
 185     int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
 186     int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
 187     int C[3],rC[3],nP[2];
 188     int lsize;
 189     t_complex *lin=0,*lout=0;
 190     fft5d_plan plan;
 191     int s;
 192
 193     /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
 194 #ifdef GMX_MPI
 195     if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != 0)
 196     {
 197         MPI_Comm_size(comm[0],&P[0]);
 198         MPI_Comm_rank(comm[0],&prank[0]);
 199     }
 200     else
 201 #endif
 202     {
 203         P[0] = 1;
 204         prank[0] = 0;
 205     }
 206 #ifdef GMX_MPI
 207     if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != 0)
 208     {
 209         MPI_Comm_size(comm[1],&P[1]);
 210         MPI_Comm_rank(comm[1],&prank[1]);
 211     }
 212     else
 213 #endif
 214     {
 215         P[1] = 1;
 216         prank[1] = 0;
 217     }
 218
 219     bMaster=(prank[0]==0&&prank[1]==0);
 220
 221
 222     if (debug)
 223     {
 224         fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
 225                 P[0],P[1],prank[0],prank[1]);
 226     }
 227
 228     if (bMaster) {
 229         if (debug)
 230             fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
 231                 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
 232         /* The check below is not correct, one prime factor 11 or 13 is ok.
 233         if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
 234             printf("WARNING: FFT very slow with prime factors larger 7\n");
 235             printf("Change FFT size or in case you cannot change it look at\n");
 236             printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
 237         }
 238         */
 239     }
 240
 241     if (NG==0 || MG==0 || KG==0) {
 242         if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
 243         return 0;
 244     }
 245
 246     rNG=NG;rMG=MG;rKG=KG;
 247
 248     if (flags&FFT5D_REALCOMPLEX) {
 249         if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
 250         else {
 251             if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
 252             else KG=KG/2+1;
 253         }
 254     }
 255
 256
 257     /*for transpose we need to know the size for each processor not only our own size*/
 258
 259     N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
 260     M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
 261     K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
 262     oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
 263     oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
 264     oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
 265
 266     for (i=0;i<P[0];i++)
 267     {
 268         #define EVENDIST
 269         #ifndef EVENDIST
 270         oN0[i]=i*ceil((double)NG/P[0]);
 271         oM0[i]=i*ceil((double)MG/P[0]);
 272         oK0[i]=i*ceil((double)KG/P[0]);
 273         #else
 274         oN0[i]=(NG*i)/P[0];
 275         oM0[i]=(MG*i)/P[0];
 276         oK0[i]=(KG*i)/P[0];
 277         #endif
 278     }
 279     for (i=0;i<P[1];i++)
 280     {
 281         #ifndef EVENDIST
 282         oN1[i]=i*ceil((double)NG/P[1]);
 283         oM1[i]=i*ceil((double)MG/P[1]);
 284         oK1[i]=i*ceil((double)KG/P[1]);
 285         #else
 286         oN1[i]=(NG*i)/P[1];
 287         oM1[i]=(MG*i)/P[1];
 288         oK1[i]=(KG*i)/P[1];
 289         #endif
 290     }
 291     for (i=0;i<P[0]-1;i++)
 292     {
 293         N0[i]=oN0[i+1]-oN0[i];
 294         M0[i]=oM0[i+1]-oM0[i];
 295         K0[i]=oK0[i+1]-oK0[i];
 296     }
 297     N0[P[0]-1]=NG-oN0[P[0]-1];
 298     M0[P[0]-1]=MG-oM0[P[0]-1];
 299     K0[P[0]-1]=KG-oK0[P[0]-1];
 300     for (i=0;i<P[1]-1;i++)
 301     {
 302         N1[i]=oN1[i+1]-oN1[i];
 303         M1[i]=oM1[i+1]-oM1[i];
 304         K1[i]=oK1[i+1]-oK1[i];
 305     }
 306     N1[P[1]-1]=NG-oN1[P[1]-1];
 307     M1[P[1]-1]=MG-oM1[P[1]-1];
 308     K1[P[1]-1]=KG-oK1[P[1]-1];
 309
 310     /* for step 1-3 the local N,M,K sizes of the transposed system
 311        C: contiguous dimension, and nP: number of processor in subcommunicator
 312        for that step */
 313
 314
 315     pM[0] = M0[prank[0]];
 316     oM[0] = oM0[prank[0]];
 317     pK[0] = K1[prank[1]];
 318     oK[0] = oK1[prank[1]];
 319     C[0] = NG;
 320     rC[0] = rNG;
 321     if (!(flags&FFT5D_ORDER_YZ)) {
 322         N[0] = vmax(N1,P[1]);
 323         M[0] = M0[prank[0]];
 324         K[0] = vmax(K1,P[1]);
 325         pN[0] = N1[prank[1]];
 326         iNout[0] = N1;
 327         oNout[0] = oN1;
 328         nP[0] = P[1];
 329         C[1] = KG;
 330         rC[1] =rKG;
 331         N[1] = vmax(K0,P[0]);
 332         pN[1] = K0[prank[0]];
 333         iNin[1] = K1;
 334         oNin[1] = oK1;
 335         iNout[1] = K0;
 336         oNout[1] = oK0;
 337         M[1] = vmax(M0,P[0]);
 338         pM[1] = M0[prank[0]];
 339         oM[1] = oM0[prank[0]];
 340         K[1] = N1[prank[1]];
 341         pK[1] = N1[prank[1]];
 342         oK[1] = oN1[prank[1]];
 343         nP[1] = P[0];
 344         C[2] = MG;
 345         rC[2] = rMG;
 346         iNin[2] = M0;
 347         oNin[2] = oM0;
 348         M[2] = vmax(K0,P[0]);
 349         pM[2] = K0[prank[0]];
 350         oM[2] = oK0[prank[0]];
 351         K[2] = vmax(N1,P[1]);
 352         pK[2] = N1[prank[1]];
 353         oK[2] = oN1[prank[1]];
 354         free(N0); free(oN0); /*these are not used for this order*/
 355         free(M1); free(oM1); /*the rest is freed in destroy*/
 356     } else {
 357         N[0] = vmax(N0,P[0]);
 358         M[0] = vmax(M0,P[0]);
 359         K[0] = K1[prank[1]];
 360         pN[0] = N0[prank[0]];
 361         iNout[0] = N0;
 362         oNout[0] = oN0;
 363         nP[0] = P[0];
 364         C[1] = MG;
 365         rC[1] =rMG;
 366         N[1] = vmax(M1,P[1]);
 367         pN[1] = M1[prank[1]];
 368         iNin[1] = M0;
 369         oNin[1] = oM0;
 370         iNout[1] = M1;
 371         oNout[1] = oM1;
 372         M[1] = N0[prank[0]];
 373         pM[1] = N0[prank[0]];
 374         oM[1] = oN0[prank[0]];
 375         K[1] = vmax(K1,P[1]);
 376         pK[1] = K1[prank[1]];
 377         oK[1] = oK1[prank[1]];
 378         nP[1] = P[1];
 379         C[2] = KG;
 380         rC[2] = rKG;
 381         iNin[2] = K1;
 382         oNin[2] = oK1;
 383         M[2] = vmax(N0,P[0]);
 384         pM[2] = N0[prank[0]];
 385         oM[2] = oN0[prank[0]];
 386         K[2] = vmax(M1,P[1]);
 387         pK[2] = M1[prank[1]];
 388         oK[2] = oM1[prank[1]];
 389         free(N1); free(oN1); /*these are not used for this order*/
 390         free(K0); free(oK0); /*the rest is freed in destroy*/
 391     }
 392
 393     /*
 394       Difference between x-y-z regarding 2d decomposition is whether they are
 395       distributed along axis 1, 2 or both
 396     */
 397
 398     /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
 399     lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
 400     /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
 401     if (!(flags&FFT5D_NOMALLOC)) {
 402         lin = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
 403         lout = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
 404     } else {
 405         lin = *rlin;
 406         lout = *rlout;
 407     }
 408
 409     plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
 410
 411
 412 #ifdef FFT5D_THREADS
 413 #ifdef FFT5D_FFTW_THREADS
 414     FFTW(init_threads)();
 415     int nthreads;
 416     #pragma omp parallel
 417     {
 418         #pragma omp master
 419         {
 420             nthreads = omp_get_num_threads();
 421         }
 422     }
 423     if (prank[0] == 0 && prank[1] == 0)
 424     {
 425         printf("Running fftw on %d threads\n",nthreads);
 426     }
 427     FFTW(plan_with_nthreads)(nthreads);
 428 #endif
 429 #endif
 430
 431 #ifdef GMX_FFT_FFTW3  /*if not FFTW - then we don't do a 3d plan but insead only 1D plans */
 432     if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1))) {  /*don't do 3d plan in parallel or if in_place requested */
 433             int fftwflags=FFTW_DESTROY_INPUT;
 434             fftw_iodim dims[3];
 435             int inNG=NG,outMG=MG,outKG=KG;
 436
 437             FFTW_LOCK;
 438             if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
 439             if (flags&FFT5D_REALCOMPLEX) {
 440                 if (!(flags&FFT5D_BACKWARD)) {  /*input pointer is not complex*/
 441                     inNG*=2;
 442                 } else {                        /*output pointer is not complex*/
 443                     if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
 444                     else outKG*=2;
 445                 }
 446             }
 447
 448             if (!(flags&FFT5D_BACKWARD)) {
 449                 dims[0].n  = KG;
 450                 dims[1].n  = MG;
 451                 dims[2].n  = rNG;
 452
 453                 dims[0].is = inNG*MG;     /*N M K*/
 454                 dims[1].is = inNG;
 455                 dims[2].is = 1;
 456                 if (!(flags&FFT5D_ORDER_YZ)) {
 457                     dims[0].os = MG;       /*M K N*/
 458                     dims[1].os = 1;
 459                     dims[2].os = MG*KG;
 460                 } else  {
 461                     dims[0].os = 1;       /*K N M*/
 462                     dims[1].os = KG*NG;
 463                     dims[2].os = KG;
 464                 }
 465             } else {
 466                 if (!(flags&FFT5D_ORDER_YZ)) {
 467                     dims[0].n  = NG;
 468                     dims[1].n  = KG;
 469                     dims[2].n  = rMG;
 470
 471                     dims[0].is = 1;
 472                     dims[1].is = NG*MG;
 473                     dims[2].is = NG;
 474
 475                     dims[0].os = outMG*KG;
 476                     dims[1].os = outMG;
 477                     dims[2].os = 1;
 478                 } else {
 479                     dims[0].n  = MG;
 480                     dims[1].n  = NG;
 481                     dims[2].n  = rKG;
 482
 483                     dims[0].is = NG;
 484                     dims[1].is = 1;
 485                     dims[2].is = NG*MG;
 486
 487                     dims[0].os = outKG*NG;
 488                     dims[1].os = outKG;
 489                     dims[2].os = 1;
 490                 }
 491             }
 492             if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
 493                 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
 494                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 495                                      (real*)lin, (FFTW(complex) *)lout,
 496                                      /*flags*/ fftwflags);
 497             } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
 498                 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
 499                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 500                                      (FFTW(complex) *)lin, (real*)lout,
 501                                      /*flags*/ fftwflags);
 502             } else {
 503                 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
 504                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 505                                      (FFTW(complex) *)lin, (FFTW(complex) *)lout,
 506                                      /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
 507             }
 508             FFTW_UNLOCK;
 509     }
 510     if (!plan->p3d) {  /* for decomposition and if 3d plan did not work */
 511 #endif /* GMX_FFT_FFTW3 */
 512         for (s=0;s<3;s++) {
 513             if (debug)
 514             {
 515                 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
 516                         s,rC[s],M[s],pK[s],C[s],lsize);
 517             }
 518             if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
 519                 gmx_fft_init_many_1d_real( &plan->p1d[s], rC[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 520             } else {
 521                 gmx_fft_init_many_1d     ( &plan->p1d[s],  C[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 522             }
 523         }
 524 #ifdef GMX_FFT_FFTW3
 525     }
 526 #endif
 527     if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
 528         plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
 529     } else {
 530         plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
 531     }
 532 #ifdef FFT5D_MPI_TRANSPOSE
 533     FFTW_LOCK
 534     for (s=0;s<2;s++) {
 535         if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
 536             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
 537         else
 538             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
 539     }
 540     FFTW_UNLOCK
 541 #endif
 542
 543
 544     plan->lin=lin;
 545     plan->lout=lout;
 546
 547     plan->NG=NG;plan->MG=MG;plan->KG=KG;
 548
 549     for (s=0;s<3;s++) {
 550         plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
 551         plan->oM[s]=oM[s];plan->oK[s]=oK[s];
 552         plan->C[s]=C[s];plan->rC[s]=rC[s];
 553         plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
 554     }
 555     for (s=0;s<2;s++) {
 556         plan->P[s]=nP[s];plan->coor[s]=prank[s];
 557     }
 558
 559 /*    plan->fftorder=fftorder;
 560     plan->direction=direction;
 561     plan->realcomplex=realcomplex;
 562 */
 563     plan->flags=flags;
 564     *rlin=lin;
 565     *rlout=lout;
 566     return plan;
 567 }
 568
 569
 570 enum order {
 571     XYZ,
 572     XZY,
 573     YXZ,
 574     YZX,
 575     ZXY,
 576     ZYX
 577 };
 578
 579
 580
 581 /*here x,y,z and N,M,K is in rotated coordinate system!!
 582   x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
 583   maxN,maxM,maxK is max size of local data
 584   pN, pM, pK is local size specific to current processor (only different to max if not divisible)
 585   NG, MG, KG is size of global data*/
 586 static void splitaxes(t_complex* lout,const t_complex* lin,
 587                       int maxN,int maxM,int maxK, int pN, int pM, int pK,
 588                       int P,int NG,int *N, int* oN) {
 589     int x,y,z,i;
 590     int in_i,out_i,in_z,out_z,in_y,out_y;
 591
 592     for (i=0;i<P;i++) { /*index cube along long axis*/
 593         in_i  = i*maxN*maxM*maxK;
 594         out_i = oN[i];
 595 #ifdef FFT5D_THREADS
 596 #pragma omp parallel for private(in_z,out_z,y,in_y,out_y,x)
 597 #endif
 598         for (z=0;z<pK;z++) { /*3. z l*/
 599             in_z  = in_i  + z*maxN*maxM;
 600             out_z = out_i + z*NG*pM;
 601             for (y=0;y<pM;y++) { /*2. y k*/
 602                 in_y  = in_z  + y*maxN;
 603                 out_y = out_z + y*NG;
 604                 for (x=0;x<N[i];x++) { /*1. x j*/
 605                     lout[in_y+x] = lin[out_y+x];
 606                     /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
 607                     /*before split data contiguos - thus if different processor get different amount oN is different*/
 608                 }
 609             }
 610         }
 611     }
 612 }
 613
 614 /*make axis contiguous again (after AllToAll) and also do local transpose*/
 615 /*transpose mayor and major dimension
 616   variables see above
 617   the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
 618   N,M,K local dimensions
 619   KG global size*/
 620 static void joinAxesTrans13(t_complex* lin,const t_complex* lout,
 621                             int maxN,int maxM,int maxK,int pN, int pM, int pK,
 622                             int P,int KG, int* K, int* oK)
 623 {
 624     int i,x,y,z;
 625     int in_i,out_i,in_x,out_x,in_z,out_z;
 626
 627     for (i=0;i<P;i++) { /*index cube along long axis*/
 628         in_i  = oK[i];
 629         out_i = i*maxM*maxN*maxK;
 630 #ifdef FFT5D_THREADS
 631 #pragma omp parallel for private(in_x,out_x,z,in_z,out_z,y)
 632 #endif
 633         for (x=0;x<pN;x++) { /*1.j*/
 634             in_x  = in_i  + x*KG*pM;
 635             out_x = out_i + x;
 636             for (z=0;z<K[i];z++) { /*3.l*/
 637                 in_z  = in_x  + z;
 638                 out_z = out_x + z*maxM*maxN;
 639                 for (y=0;y<pM;y++) { /*2.k*/
 640                     lin[in_z+y*KG] = lout[out_z+y*maxN];
 641                 }
 642             }
 643         }
 644     }
 645 }
 646
 647 /*make axis contiguous again (after AllToAll) and also do local transpose
 648   tranpose mayor and middle dimension
 649   variables see above
 650   the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
 651   N,M,K local size
 652   MG, global size*/
 653 static void joinAxesTrans12(t_complex* lin,const t_complex* lout,int maxN,int maxM,int maxK,int pN, int pM, int pK,
 654                 int P,int MG, int* M, int* oM) {
 655     int i,z,y,x;
 656     int in_i,out_i,in_z,out_z,in_x,out_x;
 657
 658     for (i=0;i<P;i++) { /*index cube along long axis*/
 659         in_i  = oM[i];
 660         out_i = i*maxM*maxN*maxK;
 661 #ifdef FFT5D_THREADS
 662 #pragma omp parallel for private(in_z,out_z,in_x,out_x,x,y)
 663 #endif
 664         for (z=0;z<pK;z++) {
 665             in_z  = in_i  + z*MG*pN;
 666             out_z = out_i + z*maxM*maxN;
 667             for (x=0;x<pN;x++) {
 668                 in_x  = in_z  + x*MG;
 669                 out_x = out_z + x;
 670                 for (y=0;y<M[i];y++) {
 671                     lin[in_x+y] = lout[out_x+y*maxN];
 672                 }
 673             }
 674         }
 675     }
 676 }
 677
 678
 679 static void rotate(int x[]) {
 680     int t=x[0];
 681 /*    x[0]=x[2];
 682     x[2]=x[1];
 683     x[1]=t;*/
 684     x[0]=x[1];
 685     x[1]=x[2];
 686     x[2]=t;
 687 }
 688
 689 /*compute the offset to compare or print transposed local data in original input coordinates
 690   xs matrix dimension size, xl dimension length, xc decomposition offset
 691   s: step in computation = number of transposes*/
 692 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
 693 /*    int direction = plan->direction;
 694     int fftorder = plan->fftorder;*/
 695
 696     int o;
 697     int pos[3],i;
 698     int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
 699         *C=plan->C, *rC=plan->rC;
 700
 701     NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
 702
 703     if (!(plan->flags&FFT5D_ORDER_YZ)) {
 704         switch (s) {
 705         case 0: o=XYZ; break;
 706         case 1: o=ZYX; break;
 707         case 2: o=YZX; break;
 708         default: assert(0);
 709         }
 710     } else {
 711         switch (s) {
 712         case 0: o=XYZ; break;
 713         case 1: o=YXZ; break;
 714         case 2: o=ZXY; break;
 715         default: assert(0);
 716         }
 717     }
 718
 719     switch (o) {
 720         case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
 721         case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
 722         case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
 723         case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
 724         case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
 725         case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
 726     }
 727     /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
 728
 729     /*xs, xl give dimension size and data length in local transposed coordinate system
 730       for 0(/1/2): x(/y/z) in original coordinate system*/
 731     for (i=0;i<3;i++) {
 732         switch (pos[i]) {
 733         case 1: xs[i]=1;         xc[i]=0;     xl[i]=C[s];break;
 734         case 2: xs[i]=C[s];      xc[i]=oM[s]; xl[i]=pM[s];break;
 735         case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
 736         }
 737     }
 738     /*input order is different for test program to match FFTW order
 739       (important for complex to real)*/
 740     if (plan->flags&FFT5D_BACKWARD) {
 741         rotate(xs);
 742         rotate(xl);
 743         rotate(xc);
 744         rotate(NG);
 745         if (plan->flags&FFT5D_ORDER_YZ) {
 746             rotate(xs);
 747             rotate(xl);
 748             rotate(xc);
 749             rotate(NG);
 750         }
 751     }
 752     if (plan->flags&FFT5D_REALCOMPLEX && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || (plan->flags&FFT5D_BACKWARD && s==2))) {
 753         xl[0] = rC[s];
 754     }
 755 }
 756
 757 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
 758     int x,y,z,l;
 759     int *coor = plan->coor;
 760     int xs[3],xl[3],xc[3],NG[3];
 761     int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
 762     compute_offsets(plan,xs,xl,xc,NG,s);
 763     fprintf(debug,txt,coor[0],coor[1],s);
 764     /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
 765     for(z=0;z<xl[2];z++) {
 766         for(y=0;y<xl[1];y++) {
 767             fprintf(debug,"%d %d: ",coor[0],coor[1]);
 768             for (x=0;x<xl[0];x++) {
 769                 for (l=0;l<ll;l++) {
 770                     fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
 771                 }
 772                 fprintf(debug,",");
 773             }
 774             fprintf(debug,"\n");
 775         }
 776     }
 777 }
 778
 779 void fft5d_execute(fft5d_plan plan,fft5d_time times) {
 780     t_complex *lin = plan->lin;
 781     t_complex *lout = plan->lout;
 782
 783     gmx_fft_t *p1d=plan->p1d;
 784 #ifdef FFT5D_MPI_TRANSPOSE
 785     FFTW(plan) *mpip=plan->mpip;
 786 #endif
 787 #ifdef GMX_MPI
 788     MPI_Comm *cart=plan->cart;
 789 #endif
 790
 791     double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
 792     int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
 793         *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
 794     int s=0;
 795
 796
 797 #ifdef GMX_FFT_FFTW3
 798     if (plan->p3d) {
 799         if (times!=0)
 800             time=MPI_Wtime();
 801         FFTW(execute)(plan->p3d);
 802         if (times!=0)
 803             times->fft+=MPI_Wtime()-time;
 804         return;
 805     }
 806 #endif
 807
 808     /*lin: x,y,z*/
 809     if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: copy in lin\n", s, plan);
 810     for (s=0;s<2;s++) {
 811         if (times!=0)
 812             time=MPI_Wtime();
 813
 814         if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0) {
 815             gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
 816         } else {
 817             gmx_fft_many_1d(     p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin,lout);
 818         }
 819         if (times!=0)
 820             time_fft+=MPI_Wtime()-time;
 821
 822         if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
 823
 824 #ifdef GMX_MPI
 825         if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] !=0 && P[s]>1 )
 826         {
 827             if (times!=0)
 828                 time=MPI_Wtime();
 829             /*prepare for AllToAll
 830               1. (most outer) axes (x) is split into P[s] parts of size N[s]
 831               for sending*/
 832             splitaxes(lin,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s]);
 833
 834             if (times!=0)
 835             {
 836                 time_local+=MPI_Wtime()-time;
 837
 838                 /*send, recv*/
 839                 time=MPI_Wtime();
 840             }
 841
 842 #ifdef FFT5D_MPI_TRANSPOSE
 843             FFTW(execute)(mpip[s]);
 844 #else
 845             if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
 846                 MPI_Alltoall(lin,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 847             else
 848                 MPI_Alltoall(lin,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 849 #endif /*FFT5D_MPI_TRANSPOSE*/
 850             if (times!=0)
 851                 time_mpi[s]=MPI_Wtime()-time;
 852         }
 853 #endif /*GMX_MPI*/
 854
 855
 856         if (times!=0)
 857             time=MPI_Wtime();
 858         /*bring back in matrix form
 859           thus make  new 1. axes contiguos
 860           also local transpose 1 and 2/3 */
 861         if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
 862             joinAxesTrans13(lin,lout,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
 863         else
 864             joinAxesTrans12(lin,lout,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
 865         if (times!=0)
 866             time_local+=MPI_Wtime()-time;
 867
 868         if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
 869
 870         /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
 871     }
 872
 873     if (times!=0)
 874         time=MPI_Wtime();
 875     if (plan->flags&FFT5D_INPLACE) lout=lin;
 876     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
 877         gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
 878     } else {
 879         gmx_fft_many_1d(     p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin,lout);
 880     }
 881
 882     if (times!=0)
 883         time_fft+=MPI_Wtime()-time;
 884     if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
 885     /*if (debug) print_localdata(lout, "%d %d: FFT in y\n", N1, M, K0, YZX, coor);*/
 886
 887     if (times!=0) {
 888         times->fft+=time_fft;
 889         times->local+=time_local;
 890         times->mpi2+=time_mpi[1];
 891         times->mpi1+=time_mpi[0];
 892     }
 893 }
 894
 895 void fft5d_destroy(fft5d_plan plan) {
 896     int s;
 897     for (s=0;s<3;s++) {
 898         gmx_many_fft_destroy(plan->p1d[s]);
 899         if (plan->iNin[s]) {
 900             free(plan->iNin[s]);
 901             plan->iNin[s]=0;
 902         }
 903         if (plan->oNin[s]) {
 904             free(plan->oNin[s]);
 905             plan->oNin[s]=0;
 906         }
 907         if (plan->iNout[s]) {
 908             free(plan->iNout[s]);
 909             plan->iNout[s]=0;
 910         }
 911         if (plan->oNout[s]) {
 912             free(plan->oNout[s]);
 913             plan->oNout[s]=0;
 914         }
 915     }
 916 #ifdef GMX_FFT_FFTW3
 917     FFTW_LOCK;
 918 #ifdef FFT5D_MPI_TRANSPOS
 919     for (s=0;s<2;s++)
 920         FFTW(destroy_plan)(plan->mpip[s]);
 921 #endif /* FFT5D_MPI_TRANSPOS */
 922 #endif /* GMX_FFT_FFTW3 */
 923
 924     /*We can't free lin/lout here - is allocated by gmx_calloc_aligned which can't be freed*/
 925
 926
 927 #ifdef FFT5D_THREADS
 928 #ifdef FFT5D_FFTW_THREADS
 929     FFTW(cleanup_threads)();
 930 #endif
 931 #endif
 932
 933     free(plan);
 934 }
 935
 936 /*Is this better than direct access of plan? enough data?
 937   here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
 938 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
 939     *N1=plan->N[0];
 940     *M0=plan->M[0];
 941     *K1=plan->K[0];
 942     *K0=plan->N[1];
 943
 944     *coor=plan->coor;
 945 }
 946
 947
 948 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
 949   of processor dimensions*/
 950 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout) {
 951     MPI_Comm cart[2]={0};
 952 #ifdef GMX_MPI
 953     int size=1,prank=0;
 954     int P[2];
 955     int coor[2];
 956     int wrap[]={0,0};
 957     MPI_Comm gcart;
 958     int rdim1[] = {0,1}, rdim2[] = {1,0};
 959
 960     MPI_Comm_size(comm,&size);
 961     MPI_Comm_rank(comm,&prank);
 962
 963     if (P0==0) P0 = lfactor(size);
 964     if (size%P0!=0) {
 965         if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
 966         P0 = lfactor(size);
 967     }
 968
 969     P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
 970
 971     /*Difference between x-y-z regarding 2d decomposition is whether they are
 972       distributed along axis 1, 2 or both*/
 973
 974     MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
 975     MPI_Cart_get(gcart,2,P,wrap,coor);
 976     MPI_Cart_sub(gcart, rdim1 , &cart[0]);
 977     MPI_Cart_sub(gcart, rdim2 , &cart[1]);
 978 #endif
 979     return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout);
 980 }
 981
 982
 983
 984 /*prints in original coordinate system of data (as the input to FFT)*/
 985 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
 986     int xs[3],xl[3],xc[3],NG[3];
 987     int x,y,z,l;
 988     int *coor = plan->coor;
 989     int ll=2; /*compare ll values per element (has to be 2 for complex)*/
 990     if (plan->flags&FFT5D_REALCOMPLEX && plan->flags&FFT5D_BACKWARD)
 991     {
 992         ll=1;
 993     }
 994
 995     compute_offsets(plan,xs,xl,xc,NG,2);
 996     if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
 997     for (z=0;z<xl[2];z++) {
 998         for(y=0;y<xl[1];y++) {
 999             if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1000             for (x=0;x<xl[0];x++) {
1001                 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1002                     real a,b;
1003                     a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1004                     if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1005                     if (!bothLocal)
1006                         b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1007                     else
1008                         b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1009                     if (plan->flags&FFT5D_DEBUG) {
1010                         printf("%f %f, ",a,b);
1011                     } else {
1012                         if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1013                             printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1014                         }
1015 /*                        assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1016                     }
1017                 }
1018                 if (plan->flags&FFT5D_DEBUG) printf(",");
1019             }
1020             if (plan->flags&FFT5D_DEBUG) printf("\n");
1021         }
1022     }
1023
1024 }
1025