Replace our fftpack version with Numpy's version
[gromacs.git] / src / mdlib / fft5d.c
blob86091776d80050aedb1e9dcb8d740ddb7d6664c0
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2 *
3 * This source code is part of
4 *
5 * G R O M A C S
6 *
7 * GROningen MAchine for Chemical Simulations
8 *
9 * VERSION 4.5
10 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12 * Copyright (c) 2001-2008, The GROMACS development team,
13 * check out http://www.gromacs.org for more information.
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version 2
18 * of the License, or (at your option) any later version.
20 * If you want to redistribute modifications, please consider that
21 * scientific software is very special. Version control is crucial -
22 * bugs must be traceable. We will be happy to consider code for
23 * inclusion in the official distribution, but derived work must not
24 * be called official GROMACS. Details are found in the README & COPYING
25 * files - if they are missing, get the official version at www.gromacs.org.
27 * To help us fund GROMACS development, we humbly ask that you cite
28 * the papers on the package - you can find them in the top README file.
30 * For more info, check our website at http://www.gromacs.org
32 * And Hey:
33 * Groningen Machine for Chemical Simulation
35 #ifdef HAVE_CONFIG_H
36 #include <config.h>
37 #endif
40 #include <stdio.h>
41 #include <stdlib.h>
42 #include <string.h>
44 #ifdef NOGMX
45 #define GMX_PARALLEL_ENV_INITIALIZED 1
46 #else
47 #include "main.h"
48 #define GMX_PARALLEL_ENV_INITIALIZED gmx_parallel_env_initialized()
49 #endif
51 #ifdef GMX_LIB_MPI
52 #include <mpi.h>
53 #endif
54 #ifdef GMX_THREADS
55 #include "tmpi.h"
56 #endif
58 #ifdef FFT5D_THREADS
59 #include <omp.h>
60 /* requires fftw compiled with openmp */
61 #define FFT5D_FFTW_THREADS
62 #endif
64 #include "fft5d.h"
65 #include <float.h>
66 #include <math.h>
67 #include <assert.h>
69 #ifndef __FLT_EPSILON__
70 #define __FLT_EPSILON__ FLT_EPSILON
71 #define __DBL_EPSILON__ DBL_EPSILON
72 #endif
74 #ifdef NOGMX
75 FILE* debug=0;
76 #endif
78 #include "gmx_fatal.h"
81 #ifdef GMX_FFT_FFTW3
82 #ifdef GMX_THREADS
83 /* none of the fftw3 calls, except execute(), are thread-safe, so
84 we need to serialize them with this mutex. */
85 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
87 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex);
88 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex);
89 #else /* GMX_THREADS */
90 #define FFTW_LOCK
91 #define FFTW_UNLOCK
92 #endif /* GMX_THREADS */
93 #endif /* GMX_FFT_FFTW3 */
95 static double fft5d_fmax(double a, double b){
96 return (a>b)?a:b;
99 /* largest factor smaller than sqrt */
100 static int lfactor(int z) {
101 int i;
102 for (i=sqrt(z);;i--)
103 if (z%i==0) return i;
106 /* largest factor */
107 static int l2factor(int z) {
108 int i;
109 if (z==1) return 1;
110 for (i=z/2;;i--)
111 if (z%i==0) return i;
114 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
115 static int lpfactor(int z) {
116 int f = l2factor(z);
117 if (f==1) return z;
118 return fft5d_fmax(lpfactor(f),lpfactor(z/f));
121 #ifndef GMX_MPI
122 #ifdef HAVE_GETTIMEOFDAY
123 #include <sys/time.h>
124 double MPI_Wtime() {
125 struct timeval tv;
126 gettimeofday(&tv,0);
127 return tv.tv_sec+tv.tv_usec*1e-6;
129 #else
130 double MPI_Wtime() {
131 return 0.0;
133 #endif
134 #endif
136 static int vmax(int* a, int s) {
137 int i,max=0;
138 for (i=0;i<s;i++)
140 if (a[i]>max) max=a[i];
142 return max;
146 copied here from fftgrid, because:
147 1. function there not publically available
148 2. not sure whether we keep fftgrid
149 3. less dependencies for fft5d
151 Only used for non-fftw case
153 static void *
154 gmx_calloc_aligned(size_t size)
156 void *p0,*p;
158 /*We initialize by zero for Valgrind
159 For non-divisible case we communicate more than the data.
160 If we don't initialize the data we communicate uninitialized data*/
161 p0 = calloc(size+32,1);
163 if(p0 == NULL)
165 gmx_fatal(FARGS,"Failed to allocated %u bytes of aligned memory.",size+32);
168 p = (void *) (((size_t) p0 + 32) & (~((size_t) 31)));
170 /* Yeah, yeah, we cannot free this pointer, but who cares... */
171 return p;
175 /* NxMxK the size of the data
176 * comm communicator to use for fft5d
177 * P0 number of processor in 1st axes (can be null for automatic)
178 * lin is allocated by fft5d because size of array is only known after planning phase */
179 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout)
182 int P[2],bMaster,prank[2],i;
183 int rNG,rMG,rKG;
184 int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
185 int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
186 int C[3],rC[3],nP[2];
187 int lsize;
188 t_complex *lin=0,*lout=0;
189 fft5d_plan plan;
190 int s;
192 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
193 #ifdef GMX_MPI
194 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
196 MPI_Comm_size(comm[0],&P[0]);
197 MPI_Comm_rank(comm[0],&prank[0]);
199 else
200 #endif
202 P[0] = 1;
203 prank[0] = 0;
205 #ifdef GMX_MPI
206 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
208 MPI_Comm_size(comm[1],&P[1]);
209 MPI_Comm_rank(comm[1],&prank[1]);
211 else
212 #endif
214 P[1] = 1;
215 prank[1] = 0;
218 bMaster=(prank[0]==0&&prank[1]==0);
221 if (debug)
223 fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
224 P[0],P[1],prank[0],prank[1]);
227 if (bMaster) {
228 if (debug)
229 fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
230 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
231 /* The check below is not correct, one prime factor 11 or 13 is ok.
232 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
233 printf("WARNING: FFT very slow with prime factors larger 7\n");
234 printf("Change FFT size or in case you cannot change it look at\n");
235 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
240 if (NG==0 || MG==0 || KG==0) {
241 if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
242 return 0;
245 rNG=NG;rMG=MG;rKG=KG;
247 if (flags&FFT5D_REALCOMPLEX) {
248 if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
249 else {
250 if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
251 else KG=KG/2+1;
256 /*for transpose we need to know the size for each processor not only our own size*/
258 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
259 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
260 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
261 oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
262 oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
263 oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
265 for (i=0;i<P[0];i++)
267 #define EVENDIST
268 #ifndef EVENDIST
269 oN0[i]=i*ceil((double)NG/P[0]);
270 oM0[i]=i*ceil((double)MG/P[0]);
271 oK0[i]=i*ceil((double)KG/P[0]);
272 #else
273 oN0[i]=(NG*i)/P[0];
274 oM0[i]=(MG*i)/P[0];
275 oK0[i]=(KG*i)/P[0];
276 #endif
278 for (i=0;i<P[1];i++)
280 #ifndef EVENDIST
281 oN1[i]=i*ceil((double)NG/P[1]);
282 oM1[i]=i*ceil((double)MG/P[1]);
283 oK1[i]=i*ceil((double)KG/P[1]);
284 #else
285 oN1[i]=(NG*i)/P[1];
286 oM1[i]=(MG*i)/P[1];
287 oK1[i]=(KG*i)/P[1];
288 #endif
290 for (i=0;i<P[0]-1;i++)
292 N0[i]=oN0[i+1]-oN0[i];
293 M0[i]=oM0[i+1]-oM0[i];
294 K0[i]=oK0[i+1]-oK0[i];
296 N0[P[0]-1]=NG-oN0[P[0]-1];
297 M0[P[0]-1]=MG-oM0[P[0]-1];
298 K0[P[0]-1]=KG-oK0[P[0]-1];
299 for (i=0;i<P[1]-1;i++)
301 N1[i]=oN1[i+1]-oN1[i];
302 M1[i]=oM1[i+1]-oM1[i];
303 K1[i]=oK1[i+1]-oK1[i];
305 N1[P[1]-1]=NG-oN1[P[1]-1];
306 M1[P[1]-1]=MG-oM1[P[1]-1];
307 K1[P[1]-1]=KG-oK1[P[1]-1];
309 /* for step 1-3 the local N,M,K sizes of the transposed system
310 C: contiguous dimension, and nP: number of processor in subcommunicator
311 for that step */
314 pM[0] = M0[prank[0]];
315 oM[0] = oM0[prank[0]];
316 pK[0] = K1[prank[1]];
317 oK[0] = oK1[prank[1]];
318 C[0] = NG;
319 rC[0] = rNG;
320 if (!(flags&FFT5D_ORDER_YZ)) {
321 N[0] = vmax(N1,P[1]);
322 M[0] = M0[prank[0]];
323 K[0] = vmax(K1,P[1]);
324 pN[0] = N1[prank[1]];
325 iNout[0] = N1;
326 oNout[0] = oN1;
327 nP[0] = P[1];
328 C[1] = KG;
329 rC[1] =rKG;
330 N[1] = vmax(K0,P[0]);
331 pN[1] = K0[prank[0]];
332 iNin[1] = K1;
333 oNin[1] = oK1;
334 iNout[1] = K0;
335 oNout[1] = oK0;
336 M[1] = vmax(M0,P[0]);
337 pM[1] = M0[prank[0]];
338 oM[1] = oM0[prank[0]];
339 K[1] = N1[prank[1]];
340 pK[1] = N1[prank[1]];
341 oK[1] = oN1[prank[1]];
342 nP[1] = P[0];
343 C[2] = MG;
344 rC[2] = rMG;
345 iNin[2] = M0;
346 oNin[2] = oM0;
347 M[2] = vmax(K0,P[0]);
348 pM[2] = K0[prank[0]];
349 oM[2] = oK0[prank[0]];
350 K[2] = vmax(N1,P[1]);
351 pK[2] = N1[prank[1]];
352 oK[2] = oN1[prank[1]];
353 free(N0); free(oN0); /*these are not used for this order*/
354 free(M1); free(oM1); /*the rest is freed in destroy*/
355 } else {
356 N[0] = vmax(N0,P[0]);
357 M[0] = vmax(M0,P[0]);
358 K[0] = K1[prank[1]];
359 pN[0] = N0[prank[0]];
360 iNout[0] = N0;
361 oNout[0] = oN0;
362 nP[0] = P[0];
363 C[1] = MG;
364 rC[1] =rMG;
365 N[1] = vmax(M1,P[1]);
366 pN[1] = M1[prank[1]];
367 iNin[1] = M0;
368 oNin[1] = oM0;
369 iNout[1] = M1;
370 oNout[1] = oM1;
371 M[1] = N0[prank[0]];
372 pM[1] = N0[prank[0]];
373 oM[1] = oN0[prank[0]];
374 K[1] = vmax(K1,P[1]);
375 pK[1] = K1[prank[1]];
376 oK[1] = oK1[prank[1]];
377 nP[1] = P[1];
378 C[2] = KG;
379 rC[2] = rKG;
380 iNin[2] = K1;
381 oNin[2] = oK1;
382 M[2] = vmax(N0,P[0]);
383 pM[2] = N0[prank[0]];
384 oM[2] = oN0[prank[0]];
385 K[2] = vmax(M1,P[1]);
386 pK[2] = M1[prank[1]];
387 oK[2] = oM1[prank[1]];
388 free(N1); free(oN1); /*these are not used for this order*/
389 free(K0); free(oK0); /*the rest is freed in destroy*/
393 Difference between x-y-z regarding 2d decomposition is whether they are
394 distributed along axis 1, 2 or both
397 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
398 lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
399 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
400 if (!(flags&FFT5D_NOMALLOC)) {
401 lin = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
402 lout = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
403 } else {
404 lin = *rlin;
405 lout = *rlout;
408 plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
411 #ifdef FFT5D_THREADS
412 #ifdef FFT5D_FFTW_THREADS
413 FFTW(init_threads)();
414 int nthreads;
415 #pragma omp parallel
417 #pragma omp master
419 nthreads = omp_get_num_threads();
422 if (prank[0] == 0 && prank[1] == 0)
424 printf("Running fftw on %d threads\n",nthreads);
426 FFTW(plan_with_nthreads)(nthreads);
427 #endif
428 #endif
430 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but insead only 1D plans */
431 if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1))) { /*don't do 3d plan in parallel or if in_place requested */
432 int fftwflags=FFTW_DESTROY_INPUT;
433 FFTW(iodim) dims[3];
434 int inNG=NG,outMG=MG,outKG=KG;
436 FFTW_LOCK;
437 if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
438 if (flags&FFT5D_REALCOMPLEX) {
439 if (!(flags&FFT5D_BACKWARD)) { /*input pointer is not complex*/
440 inNG*=2;
441 } else { /*output pointer is not complex*/
442 if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
443 else outKG*=2;
447 if (!(flags&FFT5D_BACKWARD)) {
448 dims[0].n = KG;
449 dims[1].n = MG;
450 dims[2].n = rNG;
452 dims[0].is = inNG*MG; /*N M K*/
453 dims[1].is = inNG;
454 dims[2].is = 1;
455 if (!(flags&FFT5D_ORDER_YZ)) {
456 dims[0].os = MG; /*M K N*/
457 dims[1].os = 1;
458 dims[2].os = MG*KG;
459 } else {
460 dims[0].os = 1; /*K N M*/
461 dims[1].os = KG*NG;
462 dims[2].os = KG;
464 } else {
465 if (!(flags&FFT5D_ORDER_YZ)) {
466 dims[0].n = NG;
467 dims[1].n = KG;
468 dims[2].n = rMG;
470 dims[0].is = 1;
471 dims[1].is = NG*MG;
472 dims[2].is = NG;
474 dims[0].os = outMG*KG;
475 dims[1].os = outMG;
476 dims[2].os = 1;
477 } else {
478 dims[0].n = MG;
479 dims[1].n = NG;
480 dims[2].n = rKG;
482 dims[0].is = NG;
483 dims[1].is = 1;
484 dims[2].is = NG*MG;
486 dims[0].os = outKG*NG;
487 dims[1].os = outKG;
488 dims[2].os = 1;
491 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
492 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
493 /*howmany*/ 0, /*howmany_dims*/0 ,
494 (real*)lin, (FFTW(complex) *)lout,
495 /*flags*/ fftwflags);
496 } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
497 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
498 /*howmany*/ 0, /*howmany_dims*/0 ,
499 (FFTW(complex) *)lin, (real*)lout,
500 /*flags*/ fftwflags);
501 } else {
502 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
503 /*howmany*/ 0, /*howmany_dims*/0 ,
504 (FFTW(complex) *)lin, (FFTW(complex) *)lout,
505 /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
507 FFTW_UNLOCK;
509 if (!plan->p3d) { /* for decomposition and if 3d plan did not work */
510 #endif /* GMX_FFT_FFTW3 */
511 for (s=0;s<3;s++) {
512 if (debug)
514 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
515 s,rC[s],M[s],pK[s],C[s],lsize);
517 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
518 gmx_fft_init_many_1d_real( &plan->p1d[s], rC[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
519 } else {
520 gmx_fft_init_many_1d ( &plan->p1d[s], C[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
523 #ifdef GMX_FFT_FFTW3
525 #endif
526 if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
527 plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
528 } else {
529 plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
531 #ifdef FFT5D_MPI_TRANSPOSE
532 FFTW_LOCK
533 for (s=0;s<2;s++) {
534 if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
535 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
536 else
537 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
539 FFTW_UNLOCK
540 #endif
543 plan->lin=lin;
544 plan->lout=lout;
546 plan->NG=NG;plan->MG=MG;plan->KG=KG;
548 for (s=0;s<3;s++) {
549 plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
550 plan->oM[s]=oM[s];plan->oK[s]=oK[s];
551 plan->C[s]=C[s];plan->rC[s]=rC[s];
552 plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
554 for (s=0;s<2;s++) {
555 plan->P[s]=nP[s];plan->coor[s]=prank[s];
558 /* plan->fftorder=fftorder;
559 plan->direction=direction;
560 plan->realcomplex=realcomplex;
562 plan->flags=flags;
563 *rlin=lin;
564 *rlout=lout;
565 return plan;
569 enum order {
570 XYZ,
571 XZY,
572 YXZ,
573 YZX,
574 ZXY,
580 /*here x,y,z and N,M,K is in rotated coordinate system!!
581 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
582 maxN,maxM,maxK is max size of local data
583 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
584 NG, MG, KG is size of global data*/
585 static void splitaxes(t_complex* lout,const t_complex* lin,
586 int maxN,int maxM,int maxK, int pN, int pM, int pK,
587 int P,int NG,int *N, int* oN)
589 int x,y,z,i;
590 int in_i,out_i,in_z,out_z,in_y,out_y;
592 #ifdef FFT5D_THREADS
593 int zi;
595 /* In the thread parallel case we want to loop over z and i
596 * in a single for loop to allow for better load balancing.
598 #pragma omp parallel for private(z,in_z,out_z,i,in_i,out_i,y,in_y,out_y,x) schedule(static)
599 for (zi=0; zi<pK*P; zi++)
601 z = zi/P;
602 i = zi - z*P;
603 #else
604 for (z=0; z<pK; z++) /*3. z l*/
606 #endif
607 in_z = z*maxN*maxM;
608 out_z = z*NG*pM;
610 #ifndef FFT5D_THREADS
611 for (i=0; i<P; i++) /*index cube along long axis*/
612 #endif
614 in_i = in_z + i*maxN*maxM*maxK;
615 out_i = out_z + oN[i];
616 for (y=0;y<pM;y++) { /*2. y k*/
617 in_y = in_i + y*maxN;
618 out_y = out_i + y*NG;
619 for (x=0;x<N[i];x++) { /*1. x j*/
620 lout[in_y+x] = lin[out_y+x];
621 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
622 /*before split data contiguos - thus if different processor get different amount oN is different*/
629 /*make axis contiguous again (after AllToAll) and also do local transpose*/
630 /*transpose mayor and major dimension
631 variables see above
632 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
633 N,M,K local dimensions
634 KG global size*/
635 static void joinAxesTrans13(t_complex* lin,const t_complex* lout,
636 int maxN,int maxM,int maxK,int pN, int pM, int pK,
637 int P,int KG, int* K, int* oK)
639 int i,x,y,z;
640 int in_i,out_i,in_x,out_x,in_z,out_z;
642 #ifdef FFT5D_THREADS
643 int xi;
645 /* In the thread parallel case we want to loop over x and i
646 * in a single for loop to allow for better load balancing.
648 #pragma omp parallel for private(x,in_x,out_x,i,in_i,out_i,z,in_z,out_z,y) schedule(static)
649 for (xi=0; xi<pN*P; xi++)
651 x = xi/P;
652 i = xi - x*P;
653 #else
654 for (x=0;x<pN;x++) /*1.j*/
656 #endif
657 in_x = x*KG*pM;
658 out_x = x;
660 #ifndef FFT5D_THREADS
661 for (i=0;i<P;i++) /*index cube along long axis*/
662 #endif
664 in_i = in_x + oK[i];
665 out_i = out_x + i*maxM*maxN*maxK;
666 for (z=0;z<K[i];z++) /*3.l*/
668 in_z = in_i + z;
669 out_z = out_i + z*maxM*maxN;
670 for (y=0;y<pM;y++) { /*2.k*/
671 lin[in_z+y*KG] = lout[out_z+y*maxN];
678 /*make axis contiguous again (after AllToAll) and also do local transpose
679 tranpose mayor and middle dimension
680 variables see above
681 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
682 N,M,K local size
683 MG, global size*/
684 static void joinAxesTrans12(t_complex* lin,const t_complex* lout,int maxN,int maxM,int maxK,int pN, int pM, int pK,
685 int P,int MG, int* M, int* oM) {
686 int i,z,y,x;
687 int in_i,out_i,in_z,out_z,in_x,out_x;
689 #ifdef FFT5D_THREADS
690 int zi;
692 /* In the thread parallel case we want to loop over z and i
693 * in a single for loop to allow for better load balancing.
695 #pragma omp parallel for private(i,in_i,out_i,z,in_z,out_z,in_x,out_x,x,y) schedule(static)
696 for (zi=0; zi<pK*P; zi++)
698 z = zi/P;
699 i = zi - z*P;
700 #else
701 for (z=0; z<pK; z++)
703 #endif
704 in_z = z*MG*pN;
705 out_z = z*maxM*maxN;
707 #ifndef FFT5D_THREADS
708 for (i=0; i<P; i++) /*index cube along long axis*/
709 #endif
711 in_i = in_z + oM[i];
712 out_i = out_z + i*maxM*maxN*maxK;
713 for (x=0;x<pN;x++) {
714 in_x = in_i + x*MG;
715 out_x = out_i + x;
716 for (y=0;y<M[i];y++) {
717 lin[in_x+y] = lout[out_x+y*maxN];
725 static void rotate(int x[]) {
726 int t=x[0];
727 /* x[0]=x[2];
728 x[2]=x[1];
729 x[1]=t;*/
730 x[0]=x[1];
731 x[1]=x[2];
732 x[2]=t;
735 /*compute the offset to compare or print transposed local data in original input coordinates
736 xs matrix dimension size, xl dimension length, xc decomposition offset
737 s: step in computation = number of transposes*/
738 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
739 /* int direction = plan->direction;
740 int fftorder = plan->fftorder;*/
742 int o;
743 int pos[3],i;
744 int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
745 *C=plan->C, *rC=plan->rC;
747 NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
749 if (!(plan->flags&FFT5D_ORDER_YZ)) {
750 switch (s) {
751 case 0: o=XYZ; break;
752 case 1: o=ZYX; break;
753 case 2: o=YZX; break;
754 default: assert(0);
756 } else {
757 switch (s) {
758 case 0: o=XYZ; break;
759 case 1: o=YXZ; break;
760 case 2: o=ZXY; break;
761 default: assert(0);
765 switch (o) {
766 case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
767 case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
768 case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
769 case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
770 case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
771 case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
773 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
775 /*xs, xl give dimension size and data length in local transposed coordinate system
776 for 0(/1/2): x(/y/z) in original coordinate system*/
777 for (i=0;i<3;i++) {
778 switch (pos[i]) {
779 case 1: xs[i]=1; xc[i]=0; xl[i]=C[s];break;
780 case 2: xs[i]=C[s]; xc[i]=oM[s]; xl[i]=pM[s];break;
781 case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
784 /*input order is different for test program to match FFTW order
785 (important for complex to real)*/
786 if (plan->flags&FFT5D_BACKWARD) {
787 rotate(xs);
788 rotate(xl);
789 rotate(xc);
790 rotate(NG);
791 if (plan->flags&FFT5D_ORDER_YZ) {
792 rotate(xs);
793 rotate(xl);
794 rotate(xc);
795 rotate(NG);
798 if (plan->flags&FFT5D_REALCOMPLEX && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || (plan->flags&FFT5D_BACKWARD && s==2))) {
799 xl[0] = rC[s];
803 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
804 int x,y,z,l;
805 int *coor = plan->coor;
806 int xs[3],xl[3],xc[3],NG[3];
807 int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
808 compute_offsets(plan,xs,xl,xc,NG,s);
809 fprintf(debug,txt,coor[0],coor[1],s);
810 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
811 for(z=0;z<xl[2];z++) {
812 for(y=0;y<xl[1];y++) {
813 fprintf(debug,"%d %d: ",coor[0],coor[1]);
814 for (x=0;x<xl[0];x++) {
815 for (l=0;l<ll;l++) {
816 fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
818 fprintf(debug,",");
820 fprintf(debug,"\n");
825 void fft5d_execute(fft5d_plan plan,fft5d_time times) {
826 t_complex *lin = plan->lin;
827 t_complex *lout = plan->lout;
829 gmx_fft_t *p1d=plan->p1d;
830 #ifdef FFT5D_MPI_TRANSPOSE
831 FFTW(plan) *mpip=plan->mpip;
832 #endif
833 #ifdef GMX_MPI
834 MPI_Comm *cart=plan->cart;
835 #endif
837 double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
838 int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
839 *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
840 int s=0;
843 #ifdef GMX_FFT_FFTW3
844 if (plan->p3d) {
845 if (times!=0)
846 time=MPI_Wtime();
847 FFTW(execute)(plan->p3d);
848 if (times!=0)
849 times->fft+=MPI_Wtime()-time;
850 return;
852 #endif
854 /*lin: x,y,z*/
855 if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: copy in lin\n", s, plan);
856 for (s=0;s<2;s++) {
857 if (times!=0)
858 time=MPI_Wtime();
860 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0) {
861 gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
862 } else {
863 gmx_fft_many_1d( p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin,lout);
865 if (times!=0)
866 time_fft+=MPI_Wtime()-time;
868 if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
870 #ifdef GMX_MPI
871 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s]!=MPI_COMM_NULL && P[s]>1)
873 if (times!=0)
874 time=MPI_Wtime();
875 /*prepare for AllToAll
876 1. (most outer) axes (x) is split into P[s] parts of size N[s]
877 for sending*/
878 splitaxes(lin,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s]);
880 if (times!=0)
882 time_local+=MPI_Wtime()-time;
884 /*send, recv*/
885 time=MPI_Wtime();
888 #ifdef FFT5D_MPI_TRANSPOSE
889 FFTW(execute)(mpip[s]);
890 #else
891 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
892 MPI_Alltoall(lin,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
893 else
894 MPI_Alltoall(lin,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
895 #endif /*FFT5D_MPI_TRANSPOSE*/
896 if (times!=0)
897 time_mpi[s]=MPI_Wtime()-time;
899 #endif /*GMX_MPI*/
902 if (times!=0)
903 time=MPI_Wtime();
904 /*bring back in matrix form
905 thus make new 1. axes contiguos
906 also local transpose 1 and 2/3 */
907 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
908 joinAxesTrans13(lin,lout,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
909 else
910 joinAxesTrans12(lin,lout,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
911 if (times!=0)
912 time_local+=MPI_Wtime()-time;
914 if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
916 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
919 if (times!=0)
920 time=MPI_Wtime();
921 if (plan->flags&FFT5D_INPLACE) lout=lin;
922 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
923 gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
924 } else {
925 gmx_fft_many_1d( p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin,lout);
928 if (times!=0)
929 time_fft+=MPI_Wtime()-time;
930 if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
931 /*if (debug) print_localdata(lout, "%d %d: FFT in y\n", N1, M, K0, YZX, coor);*/
933 if (times!=0) {
934 times->fft+=time_fft;
935 times->local+=time_local;
936 times->mpi2+=time_mpi[1];
937 times->mpi1+=time_mpi[0];
941 void fft5d_destroy(fft5d_plan plan) {
942 int s;
943 for (s=0;s<3;s++) {
944 gmx_many_fft_destroy(plan->p1d[s]);
945 if (plan->iNin[s]) {
946 free(plan->iNin[s]);
947 plan->iNin[s]=0;
949 if (plan->oNin[s]) {
950 free(plan->oNin[s]);
951 plan->oNin[s]=0;
953 if (plan->iNout[s]) {
954 free(plan->iNout[s]);
955 plan->iNout[s]=0;
957 if (plan->oNout[s]) {
958 free(plan->oNout[s]);
959 plan->oNout[s]=0;
962 #ifdef GMX_FFT_FFTW3
963 FFTW_LOCK;
964 #ifdef FFT5D_MPI_TRANSPOS
965 for (s=0;s<2;s++)
966 FFTW(destroy_plan)(plan->mpip[s]);
967 #endif /* FFT5D_MPI_TRANSPOS */
968 #endif /* GMX_FFT_FFTW3 */
970 /*We can't free lin/lout here - is allocated by gmx_calloc_aligned which can't be freed*/
973 #ifdef FFT5D_THREADS
974 #ifdef FFT5D_FFTW_THREADS
975 FFTW(cleanup_threads)();
976 #endif
977 #endif
979 free(plan);
982 /*Is this better than direct access of plan? enough data?
983 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
984 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
985 *N1=plan->N[0];
986 *M0=plan->M[0];
987 *K1=plan->K[0];
988 *K0=plan->N[1];
990 *coor=plan->coor;
994 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
995 of processor dimensions*/
996 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout) {
997 MPI_Comm cart[2]={0};
998 #ifdef GMX_MPI
999 int size=1,prank=0;
1000 int P[2];
1001 int coor[2];
1002 int wrap[]={0,0};
1003 MPI_Comm gcart;
1004 int rdim1[] = {0,1}, rdim2[] = {1,0};
1006 MPI_Comm_size(comm,&size);
1007 MPI_Comm_rank(comm,&prank);
1009 if (P0==0) P0 = lfactor(size);
1010 if (size%P0!=0) {
1011 if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
1012 P0 = lfactor(size);
1015 P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
1017 /*Difference between x-y-z regarding 2d decomposition is whether they are
1018 distributed along axis 1, 2 or both*/
1020 MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
1021 MPI_Cart_get(gcart,2,P,wrap,coor);
1022 MPI_Cart_sub(gcart, rdim1 , &cart[0]);
1023 MPI_Cart_sub(gcart, rdim2 , &cart[1]);
1024 #endif
1025 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout);
1030 /*prints in original coordinate system of data (as the input to FFT)*/
1031 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
1032 int xs[3],xl[3],xc[3],NG[3];
1033 int x,y,z,l;
1034 int *coor = plan->coor;
1035 int ll=2; /*compare ll values per element (has to be 2 for complex)*/
1036 if (plan->flags&FFT5D_REALCOMPLEX && plan->flags&FFT5D_BACKWARD)
1038 ll=1;
1041 compute_offsets(plan,xs,xl,xc,NG,2);
1042 if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
1043 for (z=0;z<xl[2];z++) {
1044 for(y=0;y<xl[1];y++) {
1045 if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1046 for (x=0;x<xl[0];x++) {
1047 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1048 real a,b;
1049 a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1050 if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1051 if (!bothLocal)
1052 b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1053 else
1054 b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1055 if (plan->flags&FFT5D_DEBUG) {
1056 printf("%f %f, ",a,b);
1057 } else {
1058 if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1059 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1061 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1064 if (plan->flags&FFT5D_DEBUG) printf(",");
1066 if (plan->flags&FFT5D_DEBUG) printf("\n");