src/mdlib/fft5d.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  */
   3
   4 #ifdef HAVE_CONFIG_H
   5 #include <config.h>
   6 #endif
   7
   8 #include <stdio.h>
   9 #include <stdlib.h>
  10 #include <string.h>
  11
  12 #ifdef NOGMX
  13 #define GMX_PARALLEL_ENV_INITIALIZED 1
  14 #else
  15 #include "main.h"
  16 #define GMX_PARALLEL_ENV_INITIALIZED gmx_parallel_env_initialized()
  17 #endif
  18
  19 #ifdef GMX_LIB_MPI
  20 #include <mpi.h>
  21 #endif
  22 #ifdef GMX_THREADS
  23 #include "tmpi.h"
  24 #endif
  25
  26 #ifdef FFT5D_THREADS
  27 #include <omp.h>
  28 #endif
  29
  30 #include "fft5d.h"
  31 #include <float.h>
  32 #include <math.h>
  33 #include <assert.h>
  34
  35 #ifndef __FLT_EPSILON__
  36 #define __FLT_EPSILON__ FLT_EPSILON
  37 #define __DBL_EPSILON__ DBL_EPSILON
  38 #endif
  39
  40 #ifdef NOGMX
  41 FILE* debug=0;
  42 #endif
  43
  44 #include "gmx_fatal.h"
  45
  46
  47 #ifdef GMX_FFT_FFTW3
  48 #ifdef GMX_THREADS
  49 /* none of the fftw3 calls, except execute(), are thread-safe, so
  50    we need to serialize them with this mutex. */
  51 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
  52
  53 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex);
  54 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex);
  55 #else /* GMX_THREADS */
  56 #define FFTW_LOCK
  57 #define FFTW_UNLOCK
  58 #endif /* GMX_THREADS */
  59 #endif /* GMX_FFT_FFTW3 */
  60
  61 static double fft5d_fmax(double a, double b){
  62         return (a>b)?a:b;
  63 }
  64
  65 /* largest factor smaller than sqrt */
  66 static int lfactor(int z) {
  67         int i;
  68         for (i=sqrt(z);;i--)
  69                 if (z%i==0) return i;
  70 }
  71
  72 /* largest factor */
  73 static int l2factor(int z) {
  74         int i;
  75         if (z==1) return 1;
  76         for (i=z/2;;i--)
  77                 if (z%i==0) return i;
  78 }
  79
  80 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
  81 static int lpfactor(int z) {
  82         int f = l2factor(z);
  83         if (f==1) return z;
  84         return fft5d_fmax(lpfactor(f),lpfactor(z/f));
  85 }
  86
  87 #ifndef GMX_MPI
  88 #ifdef HAVE_GETTIMEOFDAY
  89 #include <sys/time.h>
  90 double MPI_Wtime() {
  91     struct timeval tv;
  92     gettimeofday(&tv,0);
  93     return tv.tv_sec+tv.tv_usec*1e-6;
  94 }
  95 #else
  96 double MPI_Wtime() {
  97     return 0.0;
  98 }
  99 #endif
 100 #endif
 101
 102 static int vmax(int* a, int s) {
 103     int i,max=0;
 104     for (i=0;i<s;i++)
 105     {
 106         if (a[i]>max) max=a[i];
 107     }
 108     return max;
 109 }
 110
 111 /*
 112 copied here from fftgrid, because:
 113 1. function there not publically available
 114 2. not sure whether we keep fftgrid
 115 3. less dependencies for fft5d
 116
 117 Only used for non-fftw case
 118 */
 119 static void *
 120 gmx_calloc_aligned(size_t size)
 121 {
 122     void *p0,*p;
 123
 124     /*We initialize by zero for Valgrind
 125       For non-divisible case we communicate more than the data.
 126       If we don't initialize the data we communicate uninitialized data*/
 127     p0 = calloc(size+32,1);
 128
 129     if(p0 == NULL)
 130     {
 131         gmx_fatal(FARGS,"Failed to allocated %u bytes of aligned memory.",size+32);
 132     }
 133
 134     p = (void *) (((size_t) p0 + 32) & (~((size_t) 31)));
 135
 136     /* Yeah, yeah, we cannot free this pointer, but who cares... */
 137     return p;
 138 }
 139
 140
 141 /* NxMxK the size of the data
 142  * comm communicator to use for fft5d
 143  * P0 number of processor in 1st axes (can be null for automatic)
 144  * lin is allocated by fft5d because size of array is only known after planning phase */
 145 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout)
 146 {
 147
 148     int P[2],bMaster,prank[2],i;
 149     int rNG,rMG,rKG;
 150     int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
 151     int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
 152     int C[3],rC[3],nP[2];
 153     int lsize;
 154     t_complex *lin=0,*lout=0;
 155     fft5d_plan plan;
 156     int s;
 157
 158     /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
 159 #ifdef GMX_MPI
 160     if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != 0)
 161     {
 162         MPI_Comm_size(comm[0],&P[0]);
 163         MPI_Comm_rank(comm[0],&prank[0]);
 164     }
 165     else
 166 #endif
 167     {
 168         P[0] = 1;
 169         prank[0] = 0;
 170     }
 171 #ifdef GMX_MPI
 172     if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != 0)
 173     {
 174         MPI_Comm_size(comm[1],&P[1]);
 175         MPI_Comm_rank(comm[1],&prank[1]);
 176     }
 177     else
 178 #endif
 179     {
 180         P[1] = 1;
 181         prank[1] = 0;
 182     }
 183
 184     bMaster=(prank[0]==0&&prank[1]==0);
 185
 186
 187     if (debug)
 188     {
 189         fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
 190                 P[0],P[1],prank[0],prank[1]);
 191     }
 192
 193     if (bMaster) {
 194         if (debug)
 195             fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
 196                 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
 197         /* The check below is not correct, one prime factor 11 or 13 is ok.
 198         if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
 199             printf("WARNING: FFT very slow with prime factors larger 7\n");
 200             printf("Change FFT size or in case you cannot change it look at\n");
 201             printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
 202         }
 203         */
 204     }
 205
 206     if (NG==0 || MG==0 || KG==0) {
 207         if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
 208         return 0;
 209     }
 210
 211     rNG=NG;rMG=MG;rKG=KG;
 212
 213     if (flags&FFT5D_REALCOMPLEX) {
 214         if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
 215         else {
 216             if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
 217             else KG=KG/2+1;
 218         }
 219     }
 220
 221
 222     /*for transpose we need to know the size for each processor not only our own size*/
 223
 224     N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
 225     M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
 226     K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
 227     oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
 228     oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
 229     oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
 230
 231     for (i=0;i<P[0];i++)
 232     {
 233         #define EVENDIST
 234         #ifndef EVENDIST
 235         oN0[i]=i*ceil((double)NG/P[0]);
 236         oM0[i]=i*ceil((double)MG/P[0]);
 237         oK0[i]=i*ceil((double)KG/P[0]);
 238         #else
 239         oN0[i]=(NG*i)/P[0];
 240         oM0[i]=(MG*i)/P[0];
 241         oK0[i]=(KG*i)/P[0];
 242         #endif
 243     }
 244     for (i=0;i<P[1];i++)
 245     {
 246         #ifndef EVENDIST
 247         oN1[i]=i*ceil((double)NG/P[1]);
 248         oM1[i]=i*ceil((double)MG/P[1]);
 249         oK1[i]=i*ceil((double)KG/P[1]);
 250         #else
 251         oN1[i]=(NG*i)/P[1];
 252         oM1[i]=(MG*i)/P[1];
 253         oK1[i]=(KG*i)/P[1];
 254         #endif
 255     }
 256     for (i=0;i<P[0]-1;i++)
 257     {
 258         N0[i]=oN0[i+1]-oN0[i];
 259         M0[i]=oM0[i+1]-oM0[i];
 260         K0[i]=oK0[i+1]-oK0[i];
 261     }
 262     N0[P[0]-1]=NG-oN0[P[0]-1];
 263     M0[P[0]-1]=MG-oM0[P[0]-1];
 264     K0[P[0]-1]=KG-oK0[P[0]-1];
 265     for (i=0;i<P[1]-1;i++)
 266     {
 267         N1[i]=oN1[i+1]-oN1[i];
 268         M1[i]=oM1[i+1]-oM1[i];
 269         K1[i]=oK1[i+1]-oK1[i];
 270     }
 271     N1[P[1]-1]=NG-oN1[P[1]-1];
 272     M1[P[1]-1]=MG-oM1[P[1]-1];
 273     K1[P[1]-1]=KG-oK1[P[1]-1];
 274
 275     /* for step 1-3 the local N,M,K sizes of the transposed system
 276        C: contiguous dimension, and nP: number of processor in subcommunicator
 277        for that step */
 278
 279
 280     pM[0] = M0[prank[0]];
 281     oM[0] = oM0[prank[0]];
 282     pK[0] = K1[prank[1]];
 283     oK[0] = oK1[prank[1]];
 284     C[0] = NG;
 285     rC[0] = rNG;
 286     if (!(flags&FFT5D_ORDER_YZ)) {
 287         N[0] = vmax(N1,P[1]);
 288         M[0] = M0[prank[0]];
 289         K[0] = vmax(K1,P[1]);
 290         pN[0] = N1[prank[1]];
 291         iNout[0] = N1;
 292         oNout[0] = oN1;
 293         nP[0] = P[1];
 294         C[1] = KG;
 295         rC[1] =rKG;
 296         N[1] = vmax(K0,P[0]);
 297         pN[1] = K0[prank[0]];
 298         iNin[1] = K1;
 299         oNin[1] = oK1;
 300         iNout[1] = K0;
 301         oNout[1] = oK0;
 302         M[1] = vmax(M0,P[0]);
 303         pM[1] = M0[prank[0]];
 304         oM[1] = oM0[prank[0]];
 305         K[1] = N1[prank[1]];
 306         pK[1] = N1[prank[1]];
 307         oK[1] = oN1[prank[1]];
 308         nP[1] = P[0];
 309         C[2] = MG;
 310         rC[2] = rMG;
 311         iNin[2] = M0;
 312         oNin[2] = oM0;
 313         M[2] = vmax(K0,P[0]);
 314         pM[2] = K0[prank[0]];
 315         oM[2] = oK0[prank[0]];
 316         K[2] = vmax(N1,P[1]);
 317         pK[2] = N1[prank[1]];
 318         oK[2] = oN1[prank[1]];
 319         free(N0); free(oN0); /*these are not used for this order*/
 320         free(M1); free(oM1); /*the rest is freed in destroy*/
 321     } else {
 322         N[0] = vmax(N0,P[0]);
 323         M[0] = vmax(M0,P[0]);
 324         K[0] = K1[prank[1]];
 325         pN[0] = N0[prank[0]];
 326         iNout[0] = N0;
 327         oNout[0] = oN0;
 328         nP[0] = P[0];
 329         C[1] = MG;
 330         rC[1] =rMG;
 331         N[1] = vmax(M1,P[1]);
 332         pN[1] = M1[prank[1]];
 333         iNin[1] = M0;
 334         oNin[1] = oM0;
 335         iNout[1] = M1;
 336         oNout[1] = oM1;
 337         M[1] = N0[prank[0]];
 338         pM[1] = N0[prank[0]];
 339         oM[1] = oN0[prank[0]];
 340         K[1] = vmax(K1,P[1]);
 341         pK[1] = K1[prank[1]];
 342         oK[1] = oK1[prank[1]];
 343         nP[1] = P[1];
 344         C[2] = KG;
 345         rC[2] = rKG;
 346         iNin[2] = K1;
 347         oNin[2] = oK1;
 348         M[2] = vmax(N0,P[0]);
 349         pM[2] = N0[prank[0]];
 350         oM[2] = oN0[prank[0]];
 351         K[2] = vmax(M1,P[1]);
 352         pK[2] = M1[prank[1]];
 353         oK[2] = oM1[prank[1]];
 354         free(N1); free(oN1); /*these are not used for this order*/
 355         free(K0); free(oK0); /*the rest is freed in destroy*/
 356     }
 357
 358     /*
 359       Difference between x-y-z regarding 2d decomposition is whether they are
 360       distributed along axis 1, 2 or both
 361     */
 362
 363     /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
 364     lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
 365     /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
 366     if (!(flags&FFT5D_NOMALLOC)) {
 367         lin = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
 368         lout = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
 369     } else {
 370         lin = *rlin;
 371         lout = *rlout;
 372     }
 373
 374     plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
 375
 376
 377 #ifdef FFT5D_THREADS   /*requires fftw with openmp and openmp*/
 378     FFTW(init_threads)();
 379     int nthreads;
 380     #pragma omp parallel
 381     {
 382         #pragma omp master
 383         {
 384             nthreads = omp_get_num_threads();
 385         }
 386     }
 387     printf("Running on %d threads\n",nthreads);
 388     FFTW(plan_with_nthreads)(nthreads);
 389
 390 #endif
 391
 392 #ifdef GMX_FFT_FFTW3  /*if not FFTW - then we don't do a 3d plan but insead only 1D plans */
 393     if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1))) {  /*don't do 3d plan in parallel or if in_place requested */
 394             int fftwflags=FFTW_DESTROY_INPUT;
 395             fftw_iodim dims[3];
 396             int inNG=NG,outMG=MG,outKG=KG;
 397
 398             FFTW_LOCK;
 399             if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
 400             if (flags&FFT5D_REALCOMPLEX) {
 401                 if (!(flags&FFT5D_BACKWARD)) {  /*input pointer is not complex*/
 402                     inNG*=2;
 403                 } else {                        /*output pointer is not complex*/
 404                     if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
 405                     else outKG*=2;
 406                 }
 407             }
 408
 409             if (!(flags&FFT5D_BACKWARD)) {
 410                 dims[0].n  = KG;
 411                 dims[1].n  = MG;
 412                 dims[2].n  = rNG;
 413
 414                 dims[0].is = inNG*MG;     /*N M K*/
 415                 dims[1].is = inNG;
 416                 dims[2].is = 1;
 417                 if (!(flags&FFT5D_ORDER_YZ)) {
 418                     dims[0].os = MG;       /*M K N*/
 419                     dims[1].os = 1;
 420                     dims[2].os = MG*KG;
 421                 } else  {
 422                     dims[0].os = 1;       /*K N M*/
 423                     dims[1].os = KG*NG;
 424                     dims[2].os = KG;
 425                 }
 426             } else {
 427                 if (!(flags&FFT5D_ORDER_YZ)) {
 428                     dims[0].n  = NG;
 429                     dims[1].n  = KG;
 430                     dims[2].n  = rMG;
 431
 432                     dims[0].is = 1;
 433                     dims[1].is = NG*MG;
 434                     dims[2].is = NG;
 435
 436                     dims[0].os = outMG*KG;
 437                     dims[1].os = outMG;
 438                     dims[2].os = 1;
 439                 } else {
 440                     dims[0].n  = MG;
 441                     dims[1].n  = NG;
 442                     dims[2].n  = rKG;
 443
 444                     dims[0].is = NG;
 445                     dims[1].is = 1;
 446                     dims[2].is = NG*MG;
 447
 448                     dims[0].os = outKG*NG;
 449                     dims[1].os = outKG;
 450                     dims[2].os = 1;
 451                 }
 452             }
 453             if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
 454                 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
 455                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 456                                      (real*)lin, (FFTW(complex) *)lout,
 457                                      /*flags*/ fftwflags);
 458             } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
 459                 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
 460                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 461                                      (FFTW(complex) *)lin, (real*)lout,
 462                                      /*flags*/ fftwflags);
 463             } else {
 464                 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
 465                                      /*howmany*/ 0, /*howmany_dims*/0 ,
 466                                      (FFTW(complex) *)lin, (FFTW(complex) *)lout,
 467                                      /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
 468             }
 469             FFTW_UNLOCK;
 470     }
 471     if (!plan->p3d) {  /* for decomposition and if 3d plan did not work */
 472 #endif /* GMX_FFT_FFTW3 */
 473         for (s=0;s<3;s++) {
 474             if (debug)
 475             {
 476                 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
 477                         s,rC[s],M[s],pK[s],C[s],lsize);
 478             }
 479             if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
 480                 gmx_fft_init_many_1d_real( &plan->p1d[s], rC[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 481             } else {
 482                 gmx_fft_init_many_1d     ( &plan->p1d[s],  C[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
 483             }
 484         }
 485 #ifdef GMX_FFT_FFTW3
 486     }
 487 #endif
 488     if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
 489         plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
 490     } else {
 491         plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
 492     }
 493 #ifdef FFT5D_MPI_TRANSPOSE
 494     FFTW_LOCK
 495     for (s=0;s<2;s++) {
 496         if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
 497             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
 498         else
 499             plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
 500     }
 501     FFTW_UNLOCK
 502 #endif
 503
 504
 505     plan->lin=lin;
 506     plan->lout=lout;
 507
 508     plan->NG=NG;plan->MG=MG;plan->KG=KG;
 509
 510     for (s=0;s<3;s++) {
 511         plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
 512         plan->oM[s]=oM[s];plan->oK[s]=oK[s];
 513         plan->C[s]=C[s];plan->rC[s]=rC[s];
 514         plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
 515     }
 516     for (s=0;s<2;s++) {
 517         plan->P[s]=nP[s];plan->coor[s]=prank[s];
 518     }
 519
 520 /*    plan->fftorder=fftorder;
 521     plan->direction=direction;
 522     plan->realcomplex=realcomplex;
 523 */
 524     plan->flags=flags;
 525     *rlin=lin;
 526     *rlout=lout;
 527     return plan;
 528 }
 529
 530
 531 enum order {
 532     XYZ,
 533     XZY,
 534     YXZ,
 535     YZX,
 536     ZXY,
 537     ZYX
 538 };
 539
 540
 541
 542 /*here x,y,z and N,M,K is in rotated coordinate system!!
 543   x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
 544   maxN,maxM,maxK is max size of local data
 545   pN, pM, pK is local size specific to current processor (only different to max if not divisible)
 546   NG, MG, KG is size of global data*/
 547 static void splitaxes(t_complex* lout,const t_complex* lin,
 548                       int maxN,int maxM,int maxK, int pN, int pM, int pK,
 549                       int P,int NG,int *N, int* oN) {
 550     int x,y,z,i;
 551     int in_i,out_i,in_z,out_z,in_y,out_y;
 552
 553     for (i=0;i<P;i++) { /*index cube along long axis*/
 554         in_i  = i*maxN*maxM*maxK;
 555         out_i = oN[i];
 556         for (z=0;z<pK;z++) { /*3. z l*/
 557             in_z  = in_i  + z*maxN*maxM;
 558             out_z = out_i + z*NG*pM;
 559             for (y=0;y<pM;y++) { /*2. y k*/
 560                 in_y  = in_z  + y*maxN;
 561                 out_y = out_z + y*NG;
 562                 for (x=0;x<N[i];x++) { /*1. x j*/
 563                     lout[in_y+x] = lin[out_y+x];
 564                     /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
 565                     /*before split data contiguos - thus if different processor get different amount oN is different*/
 566                 }
 567             }
 568         }
 569     }
 570 }
 571
 572 /*make axis contiguous again (after AllToAll) and also do local transpose*/
 573 /*transpose mayor and major dimension
 574   variables see above
 575   the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
 576   N,M,K local dimensions
 577   KG global size*/
 578 static void joinAxesTrans13(t_complex* lin,const t_complex* lout,
 579                             int maxN,int maxM,int maxK,int pN, int pM, int pK,
 580                             int P,int KG, int* K, int* oK)
 581 {
 582     int i,x,y,z;
 583     int in_i,out_i,in_x,out_x,in_z,out_z;
 584
 585     for (i=0;i<P;i++) { /*index cube along long axis*/
 586         in_i  = oK[i];
 587         out_i = i*maxM*maxN*maxK;
 588         for (x=0;x<pN;x++) { /*1.j*/
 589             in_x  = in_i  + x*KG*pM;
 590             out_x = out_i + x;
 591             for (z=0;z<K[i];z++) { /*3.l*/
 592                 in_z  = in_x  + z;
 593                 out_z = out_x + z*maxM*maxN;
 594                 for (y=0;y<pM;y++) { /*2.k*/
 595                     lin[in_z+y*KG] = lout[out_z+y*maxN];
 596                 }
 597             }
 598         }
 599     }
 600 }
 601
 602 /*make axis contiguous again (after AllToAll) and also do local transpose
 603   tranpose mayor and middle dimension
 604   variables see above
 605   the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
 606   N,M,K local size
 607   MG, global size*/
 608 static void joinAxesTrans12(t_complex* lin,const t_complex* lout,int maxN,int maxM,int maxK,int pN, int pM, int pK,
 609                 int P,int MG, int* M, int* oM) {
 610     int i,z,y,x;
 611     int in_i,out_i,in_z,out_z,in_x,out_x;
 612
 613     for (i=0;i<P;i++) { /*index cube along long axis*/
 614         in_i  = oM[i];
 615         out_i = i*maxM*maxN*maxK;
 616         for (z=0;z<pK;z++) {
 617             in_z  = in_i  + z*MG*pN;
 618             out_z = out_i + z*maxM*maxN;
 619             for (x=0;x<pN;x++) {
 620                 in_x  = in_z  + x*MG;
 621                 out_x = out_z + x;
 622                 for (y=0;y<M[i];y++) {
 623                     lin[in_x+y] = lout[out_x+y*maxN];
 624                 }
 625             }
 626         }
 627     }
 628 }
 629
 630
 631 static void rotate(int x[]) {
 632     int t=x[0];
 633 /*    x[0]=x[2];
 634     x[2]=x[1];
 635     x[1]=t;*/
 636     x[0]=x[1];
 637     x[1]=x[2];
 638     x[2]=t;
 639 }
 640
 641 /*compute the offset to compare or print transposed local data in original input coordinates
 642   xs matrix dimension size, xl dimension length, xc decomposition offset
 643   s: step in computation = number of transposes*/
 644 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
 645 /*    int direction = plan->direction;
 646     int fftorder = plan->fftorder;*/
 647
 648     int o;
 649     int pos[3],i;
 650     int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
 651         *C=plan->C, *rC=plan->rC;
 652
 653     NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
 654
 655     if (!(plan->flags&FFT5D_ORDER_YZ)) {
 656         switch (s) {
 657         case 0: o=XYZ; break;
 658         case 1: o=ZYX; break;
 659         case 2: o=YZX; break;
 660         default: assert(0);
 661         }
 662     } else {
 663         switch (s) {
 664         case 0: o=XYZ; break;
 665         case 1: o=YXZ; break;
 666         case 2: o=ZXY; break;
 667         default: assert(0);
 668         }
 669     }
 670
 671     switch (o) {
 672         case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
 673         case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
 674         case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
 675         case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
 676         case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
 677         case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
 678     }
 679     /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
 680
 681     /*xs, xl give dimension size and data length in local transposed coordinate system
 682       for 0(/1/2): x(/y/z) in original coordinate system*/
 683     for (i=0;i<3;i++) {
 684         switch (pos[i]) {
 685         case 1: xs[i]=1;         xc[i]=0;     xl[i]=C[s];break;
 686         case 2: xs[i]=C[s];      xc[i]=oM[s]; xl[i]=pM[s];break;
 687         case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
 688         }
 689     }
 690     /*input order is different for test program to match FFTW order
 691       (important for complex to real)*/
 692     if (plan->flags&FFT5D_BACKWARD) {
 693         rotate(xs);
 694         rotate(xl);
 695         rotate(xc);
 696         rotate(NG);
 697         if (plan->flags&FFT5D_ORDER_YZ) {
 698             rotate(xs);
 699             rotate(xl);
 700             rotate(xc);
 701             rotate(NG);
 702         }
 703     }
 704     if (plan->flags&FFT5D_REALCOMPLEX && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || (plan->flags&FFT5D_BACKWARD && s==2))) {
 705         xl[0] = rC[s];
 706     }
 707 }
 708
 709 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
 710     int x,y,z,l;
 711     int *coor = plan->coor;
 712     int xs[3],xl[3],xc[3],NG[3];
 713     int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
 714     compute_offsets(plan,xs,xl,xc,NG,s);
 715     fprintf(debug,txt,coor[0],coor[1],s);
 716     /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
 717     for(z=0;z<xl[2];z++) {
 718         for(y=0;y<xl[1];y++) {
 719             fprintf(debug,"%d %d: ",coor[0],coor[1]);
 720             for (x=0;x<xl[0];x++) {
 721                 for (l=0;l<ll;l++) {
 722                     fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
 723                 }
 724                 fprintf(debug,",");
 725             }
 726             fprintf(debug,"\n");
 727         }
 728     }
 729 }
 730
 731 void fft5d_execute(fft5d_plan plan,fft5d_time times) {
 732     t_complex *lin = plan->lin;
 733     t_complex *lout = plan->lout;
 734
 735     gmx_fft_t *p1d=plan->p1d;
 736 #ifdef FFT5D_MPI_TRANSPOSE
 737     FFTW(plan) *mpip=plan->mpip;
 738 #endif
 739 #ifdef GMX_MPI
 740     MPI_Comm *cart=plan->cart;
 741 #endif
 742
 743     double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
 744     int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
 745         *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
 746     int s=0;
 747
 748
 749 #ifdef GMX_FFT_FFTW3
 750     if (plan->p3d) {
 751         if (times!=0)
 752             time=MPI_Wtime();
 753         FFTW(execute)(plan->p3d);
 754         if (times!=0)
 755             times->fft+=MPI_Wtime()-time;
 756         return;
 757     }
 758 #endif
 759
 760     /*lin: x,y,z*/
 761     if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: copy in lin\n", s, plan);
 762     for (s=0;s<2;s++) {
 763         if (times!=0)
 764             time=MPI_Wtime();
 765
 766         if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0) {
 767             gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
 768         } else {
 769             gmx_fft_many_1d(     p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin,lout);
 770         }
 771         if (times!=0)
 772             time_fft+=MPI_Wtime()-time;
 773
 774         if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
 775
 776 #ifdef GMX_MPI
 777         if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] !=0 && P[s]>1 )
 778         {
 779             if (times!=0)
 780                 time=MPI_Wtime();
 781             /*prepare for AllToAll
 782               1. (most outer) axes (x) is split into P[s] parts of size N[s]
 783               for sending*/
 784             splitaxes(lin,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s]);
 785
 786             if (times!=0)
 787             {
 788                 time_local+=MPI_Wtime()-time;
 789
 790                 /*send, recv*/
 791                 time=MPI_Wtime();
 792             }
 793
 794 #ifdef FFT5D_MPI_TRANSPOSE
 795             FFTW(execute)(mpip[s]);
 796 #else
 797             if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
 798                 MPI_Alltoall(lin,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 799             else
 800                 MPI_Alltoall(lin,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
 801 #endif /*FFT5D_MPI_TRANSPOSE*/
 802             if (times!=0)
 803                 time_mpi[s]=MPI_Wtime()-time;
 804         }
 805 #endif /*GMX_MPI*/
 806
 807
 808         if (times!=0)
 809             time=MPI_Wtime();
 810         /*bring back in matrix form
 811           thus make  new 1. axes contiguos
 812           also local transpose 1 and 2/3 */
 813         if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
 814             joinAxesTrans13(lin,lout,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
 815         else
 816             joinAxesTrans12(lin,lout,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
 817         if (times!=0)
 818             time_local+=MPI_Wtime()-time;
 819
 820         if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
 821
 822         /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
 823     }
 824
 825     if (times!=0)
 826         time=MPI_Wtime();
 827     if (plan->flags&FFT5D_INPLACE) lout=lin;
 828     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
 829         gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
 830     } else {
 831         gmx_fft_many_1d(     p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin,lout);
 832     }
 833
 834     if (times!=0)
 835         time_fft+=MPI_Wtime()-time;
 836     if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
 837     /*if (debug) print_localdata(lout, "%d %d: FFT in y\n", N1, M, K0, YZX, coor);*/
 838
 839     if (times!=0) {
 840         times->fft+=time_fft;
 841         times->local+=time_local;
 842         times->mpi2+=time_mpi[1];
 843         times->mpi1+=time_mpi[0];
 844     }
 845 }
 846
 847 void fft5d_destroy(fft5d_plan plan) {
 848     int s;
 849     for (s=0;s<3;s++) {
 850         gmx_many_fft_destroy(plan->p1d[s]);
 851         if (plan->iNin[s]) {
 852             free(plan->iNin[s]);
 853             plan->iNin[s]=0;
 854         }
 855         if (plan->oNin[s]) {
 856             free(plan->oNin[s]);
 857             plan->oNin[s]=0;
 858         }
 859         if (plan->iNout[s]) {
 860             free(plan->iNout[s]);
 861             plan->iNout[s]=0;
 862         }
 863         if (plan->oNout[s]) {
 864             free(plan->oNout[s]);
 865             plan->oNout[s]=0;
 866         }
 867     }
 868 #ifdef GMX_FFT_FFTW3
 869     FFTW_LOCK;
 870 #ifdef FFT5D_MPI_TRANSPOS
 871     for (s=0;s<2;s++)
 872         FFTW(destroy_plan)(plan->mpip[s]);
 873 #endif /* FFT5D_MPI_TRANSPOS */
 874 #endif /* GMX_FFT_FFTW3 */
 875
 876     /*We can't free lin/lout here - is allocated by gmx_calloc_aligned which can't be freed*/
 877
 878
 879 #ifdef FFT5D_THREADS
 880     FFTW(cleanup_threads)();
 881 #endif
 882
 883     free(plan);
 884 }
 885
 886 /*Is this better than direct access of plan? enough data?
 887   here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
 888 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
 889     *N1=plan->N[0];
 890     *M0=plan->M[0];
 891     *K1=plan->K[0];
 892     *K0=plan->N[1];
 893
 894     *coor=plan->coor;
 895 }
 896
 897
 898 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
 899   of processor dimensions*/
 900 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout) {
 901     MPI_Comm cart[2]={0};
 902 #ifdef GMX_MPI
 903     int size=1,prank=0;
 904     int P[2];
 905     int coor[2];
 906     int wrap[]={0,0};
 907     MPI_Comm gcart;
 908     int rdim1[] = {0,1}, rdim2[] = {1,0};
 909
 910     MPI_Comm_size(comm,&size);
 911     MPI_Comm_rank(comm,&prank);
 912
 913     if (P0==0) P0 = lfactor(size);
 914     if (size%P0!=0) {
 915         if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
 916         P0 = lfactor(size);
 917     }
 918
 919     P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
 920
 921     /*Difference between x-y-z regarding 2d decomposition is whether they are
 922       distributed along axis 1, 2 or both*/
 923
 924     MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
 925     MPI_Cart_get(gcart,2,P,wrap,coor);
 926     MPI_Cart_sub(gcart, rdim1 , &cart[0]);
 927     MPI_Cart_sub(gcart, rdim2 , &cart[1]);
 928 #endif
 929     return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout);
 930 }
 931
 932
 933
 934 /*prints in original coordinate system of data (as the input to FFT)*/
 935 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
 936     int xs[3],xl[3],xc[3],NG[3];
 937     int x,y,z,l;
 938     int *coor = plan->coor;
 939     int ll=2; /*compare ll values per element (has to be 2 for complex)*/
 940     if (plan->flags&FFT5D_REALCOMPLEX && plan->flags&FFT5D_BACKWARD)
 941     {
 942         ll=1;
 943     }
 944
 945     compute_offsets(plan,xs,xl,xc,NG,2);
 946     if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
 947     for (z=0;z<xl[2];z++) {
 948         for(y=0;y<xl[1];y++) {
 949             if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
 950             for (x=0;x<xl[0];x++) {
 951                 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
 952                     real a,b;
 953                     a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
 954                     if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
 955                     if (!bothLocal)
 956                         b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
 957                     else
 958                         b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
 959                     if (plan->flags&FFT5D_DEBUG) {
 960                         printf("%f %f, ",a,b);
 961                     } else {
 962                         if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
 963                             printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
 964                         }
 965 /*                        assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
 966                     }
 967                 }
 968                 if (plan->flags&FFT5D_DEBUG) printf(",");
 969             }
 970             if (plan->flags&FFT5D_DEBUG) printf("\n");
 971         }
 972     }
 973
 974 }
 975