Fixed include statements such that double precision version of genborn.c
[gromacs/rigid-bodies.git] / src / mdlib / fft5d.c
blob3734625b814aadc203ca9aa24a5a804e53d51350
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2 */
4 #ifdef HAVE_CONFIG_H
5 #include <config.h>
6 #endif
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <string.h>
12 #ifdef NOGMX
13 #define GMX_PARALLEL_ENV_INITIALIZED 1
14 #else
15 #include "main.h"
16 #define GMX_PARALLEL_ENV_INITIALIZED gmx_parallel_env_initialized()
17 #endif
19 #ifdef GMX_LIB_MPI
20 #include <mpi.h>
21 #endif
22 #ifdef GMX_THREADS
23 #include "tmpi.h"
24 #endif
26 #ifdef FFT5D_THREADS
27 #include <omp.h>
28 #endif
30 #include "fft5d.h"
31 #include <float.h>
32 #include <math.h>
33 #include <assert.h>
35 #ifndef __FLT_EPSILON__
36 #define __FLT_EPSILON__ FLT_EPSILON
37 #define __DBL_EPSILON__ DBL_EPSILON
38 #endif
40 #ifdef NOGMX
41 FILE* debug=0;
42 #endif
44 #include "gmx_fatal.h"
47 #ifdef GMX_FFT_FFTW3
48 #ifdef GMX_THREADS
49 /* none of the fftw3 calls, except execute(), are thread-safe, so
50 we need to serialize them with this mutex. */
51 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
53 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex);
54 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex);
55 #else /* GMX_THREADS */
56 #define FFTW_LOCK
57 #define FFTW_UNLOCK
58 #endif /* GMX_THREADS */
59 #endif /* GMX_FFT_FFTW3 */
61 static double fft5d_fmax(double a, double b){
62 return (a>b)?a:b;
65 /* largest factor smaller than sqrt */
66 static int lfactor(int z) {
67 int i;
68 for (i=sqrt(z);;i--)
69 if (z%i==0) return i;
72 /* largest factor */
73 static int l2factor(int z) {
74 int i;
75 if (z==1) return 1;
76 for (i=z/2;;i--)
77 if (z%i==0) return i;
80 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
81 static int lpfactor(int z) {
82 int f = l2factor(z);
83 if (f==1) return z;
84 return fft5d_fmax(lpfactor(f),lpfactor(z/f));
87 #ifndef GMX_MPI
88 #ifdef HAVE_GETTIMEOFDAY
89 #include <sys/time.h>
90 double MPI_Wtime() {
91 struct timeval tv;
92 gettimeofday(&tv,0);
93 return tv.tv_sec+tv.tv_usec*1e-6;
95 #else
96 double MPI_Wtime() {
97 return 0.0;
99 #endif
100 #endif
102 static int vmax(int* a, int s) {
103 int i,max=0;
104 for (i=0;i<s;i++)
106 if (a[i]>max) max=a[i];
108 return max;
112 copied here from fftgrid, because:
113 1. function there not publically available
114 2. not sure whether we keep fftgrid
115 3. less dependencies for fft5d
117 Only used for non-fftw case
119 static void *
120 gmx_calloc_aligned(size_t size)
122 void *p0,*p;
124 /*We initialize by zero for Valgrind
125 For non-divisible case we communicate more than the data.
126 If we don't initialize the data we communicate uninitialized data*/
127 p0 = calloc(size+32,1);
129 if(p0 == NULL)
131 gmx_fatal(FARGS,"Failed to allocated %u bytes of aligned memory.",size+32);
134 p = (void *) (((size_t) p0 + 32) & (~((size_t) 31)));
136 /* Yeah, yeah, we cannot free this pointer, but who cares... */
137 return p;
141 /* NxMxK the size of the data
142 * comm communicator to use for fft5d
143 * P0 number of processor in 1st axes (can be null for automatic)
144 * lin is allocated by fft5d because size of array is only known after planning phase */
145 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout)
148 int P[2],bMaster,prank[2],i;
149 int rNG,rMG,rKG;
150 int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
151 int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
152 int C[3],rC[3],nP[2];
153 int lsize;
154 t_complex *lin=0,*lout=0;
155 fft5d_plan plan;
156 int s;
158 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
159 #ifdef GMX_MPI
160 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != 0)
162 MPI_Comm_size(comm[0],&P[0]);
163 MPI_Comm_rank(comm[0],&prank[0]);
165 else
166 #endif
168 P[0] = 1;
169 prank[0] = 0;
171 #ifdef GMX_MPI
172 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != 0)
174 MPI_Comm_size(comm[1],&P[1]);
175 MPI_Comm_rank(comm[1],&prank[1]);
177 else
178 #endif
180 P[1] = 1;
181 prank[1] = 0;
184 bMaster=(prank[0]==0&&prank[1]==0);
187 if (debug)
189 fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
190 P[0],P[1],prank[0],prank[1]);
193 if (bMaster) {
194 if (debug)
195 fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
196 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
197 /* The check below is not correct, one prime factor 11 or 13 is ok.
198 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
199 printf("WARNING: FFT very slow with prime factors larger 7\n");
200 printf("Change FFT size or in case you cannot change it look at\n");
201 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
206 if (NG==0 || MG==0 || KG==0) {
207 if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
208 return 0;
211 rNG=NG;rMG=MG;rKG=KG;
213 if (flags&FFT5D_REALCOMPLEX) {
214 if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
215 else {
216 if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
217 else KG=KG/2+1;
222 /*for transpose we need to know the size for each processor not only our own size*/
224 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
225 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
226 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
227 oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
228 oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
229 oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
231 for (i=0;i<P[0];i++)
233 #define EVENDIST
234 #ifndef EVENDIST
235 oN0[i]=i*ceil((double)NG/P[0]);
236 oM0[i]=i*ceil((double)MG/P[0]);
237 oK0[i]=i*ceil((double)KG/P[0]);
238 #else
239 oN0[i]=(NG*i)/P[0];
240 oM0[i]=(MG*i)/P[0];
241 oK0[i]=(KG*i)/P[0];
242 #endif
244 for (i=0;i<P[1];i++)
246 #ifndef EVENDIST
247 oN1[i]=i*ceil((double)NG/P[1]);
248 oM1[i]=i*ceil((double)MG/P[1]);
249 oK1[i]=i*ceil((double)KG/P[1]);
250 #else
251 oN1[i]=(NG*i)/P[1];
252 oM1[i]=(MG*i)/P[1];
253 oK1[i]=(KG*i)/P[1];
254 #endif
256 for (i=0;i<P[0]-1;i++)
258 N0[i]=oN0[i+1]-oN0[i];
259 M0[i]=oM0[i+1]-oM0[i];
260 K0[i]=oK0[i+1]-oK0[i];
262 N0[P[0]-1]=NG-oN0[P[0]-1];
263 M0[P[0]-1]=MG-oM0[P[0]-1];
264 K0[P[0]-1]=KG-oK0[P[0]-1];
265 for (i=0;i<P[1]-1;i++)
267 N1[i]=oN1[i+1]-oN1[i];
268 M1[i]=oM1[i+1]-oM1[i];
269 K1[i]=oK1[i+1]-oK1[i];
271 N1[P[1]-1]=NG-oN1[P[1]-1];
272 M1[P[1]-1]=MG-oM1[P[1]-1];
273 K1[P[1]-1]=KG-oK1[P[1]-1];
275 /* for step 1-3 the local N,M,K sizes of the transposed system
276 C: contiguous dimension, and nP: number of processor in subcommunicator
277 for that step */
280 pM[0] = M0[prank[0]];
281 oM[0] = oM0[prank[0]];
282 pK[0] = K1[prank[1]];
283 oK[0] = oK1[prank[1]];
284 C[0] = NG;
285 rC[0] = rNG;
286 if (!(flags&FFT5D_ORDER_YZ)) {
287 N[0] = vmax(N1,P[1]);
288 M[0] = M0[prank[0]];
289 K[0] = vmax(K1,P[1]);
290 pN[0] = N1[prank[1]];
291 iNout[0] = N1;
292 oNout[0] = oN1;
293 nP[0] = P[1];
294 C[1] = KG;
295 rC[1] =rKG;
296 N[1] = vmax(K0,P[0]);
297 pN[1] = K0[prank[0]];
298 iNin[1] = K1;
299 oNin[1] = oK1;
300 iNout[1] = K0;
301 oNout[1] = oK0;
302 M[1] = vmax(M0,P[0]);
303 pM[1] = M0[prank[0]];
304 oM[1] = oM0[prank[0]];
305 K[1] = N1[prank[1]];
306 pK[1] = N1[prank[1]];
307 oK[1] = oN1[prank[1]];
308 nP[1] = P[0];
309 C[2] = MG;
310 rC[2] = rMG;
311 iNin[2] = M0;
312 oNin[2] = oM0;
313 M[2] = vmax(K0,P[0]);
314 pM[2] = K0[prank[0]];
315 oM[2] = oK0[prank[0]];
316 K[2] = vmax(N1,P[1]);
317 pK[2] = N1[prank[1]];
318 oK[2] = oN1[prank[1]];
319 free(N0); free(oN0); /*these are not used for this order*/
320 free(M1); free(oM1); /*the rest is freed in destroy*/
321 } else {
322 N[0] = vmax(N0,P[0]);
323 M[0] = vmax(M0,P[0]);
324 K[0] = K1[prank[1]];
325 pN[0] = N0[prank[0]];
326 iNout[0] = N0;
327 oNout[0] = oN0;
328 nP[0] = P[0];
329 C[1] = MG;
330 rC[1] =rMG;
331 N[1] = vmax(M1,P[1]);
332 pN[1] = M1[prank[1]];
333 iNin[1] = M0;
334 oNin[1] = oM0;
335 iNout[1] = M1;
336 oNout[1] = oM1;
337 M[1] = N0[prank[0]];
338 pM[1] = N0[prank[0]];
339 oM[1] = oN0[prank[0]];
340 K[1] = vmax(K1,P[1]);
341 pK[1] = K1[prank[1]];
342 oK[1] = oK1[prank[1]];
343 nP[1] = P[1];
344 C[2] = KG;
345 rC[2] = rKG;
346 iNin[2] = K1;
347 oNin[2] = oK1;
348 M[2] = vmax(N0,P[0]);
349 pM[2] = N0[prank[0]];
350 oM[2] = oN0[prank[0]];
351 K[2] = vmax(M1,P[1]);
352 pK[2] = M1[prank[1]];
353 oK[2] = oM1[prank[1]];
354 free(N1); free(oN1); /*these are not used for this order*/
355 free(K0); free(oK0); /*the rest is freed in destroy*/
359 Difference between x-y-z regarding 2d decomposition is whether they are
360 distributed along axis 1, 2 or both
363 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
364 lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
365 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
366 if (!(flags&FFT5D_NOMALLOC)) {
367 lin = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
368 lout = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
369 } else {
370 lin = *rlin;
371 lout = *rlout;
374 plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
377 #ifdef FFT5D_THREADS /*requires fftw with openmp and openmp*/
378 FFTW(init_threads)();
379 int nthreads;
380 #pragma omp parallel
382 #pragma omp master
384 nthreads = omp_get_num_threads();
387 printf("Running on %d threads\n",nthreads);
388 FFTW(plan_with_nthreads)(nthreads);
390 #endif
392 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but insead only 1D plans */
393 if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1))) { /*don't do 3d plan in parallel or if in_place requested */
394 int fftwflags=FFTW_DESTROY_INPUT;
395 fftw_iodim dims[3];
396 int inNG=NG,outMG=MG,outKG=KG;
398 FFTW_LOCK;
399 if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
400 if (flags&FFT5D_REALCOMPLEX) {
401 if (!(flags&FFT5D_BACKWARD)) { /*input pointer is not complex*/
402 inNG*=2;
403 } else { /*output pointer is not complex*/
404 if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
405 else outKG*=2;
409 if (!(flags&FFT5D_BACKWARD)) {
410 dims[0].n = KG;
411 dims[1].n = MG;
412 dims[2].n = rNG;
414 dims[0].is = inNG*MG; /*N M K*/
415 dims[1].is = inNG;
416 dims[2].is = 1;
417 if (!(flags&FFT5D_ORDER_YZ)) {
418 dims[0].os = MG; /*M K N*/
419 dims[1].os = 1;
420 dims[2].os = MG*KG;
421 } else {
422 dims[0].os = 1; /*K N M*/
423 dims[1].os = KG*NG;
424 dims[2].os = KG;
426 } else {
427 if (!(flags&FFT5D_ORDER_YZ)) {
428 dims[0].n = NG;
429 dims[1].n = KG;
430 dims[2].n = rMG;
432 dims[0].is = 1;
433 dims[1].is = NG*MG;
434 dims[2].is = NG;
436 dims[0].os = outMG*KG;
437 dims[1].os = outMG;
438 dims[2].os = 1;
439 } else {
440 dims[0].n = MG;
441 dims[1].n = NG;
442 dims[2].n = rKG;
444 dims[0].is = NG;
445 dims[1].is = 1;
446 dims[2].is = NG*MG;
448 dims[0].os = outKG*NG;
449 dims[1].os = outKG;
450 dims[2].os = 1;
453 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
454 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
455 /*howmany*/ 0, /*howmany_dims*/0 ,
456 (real*)lin, (FFTW(complex) *)lout,
457 /*flags*/ fftwflags);
458 } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
459 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
460 /*howmany*/ 0, /*howmany_dims*/0 ,
461 (FFTW(complex) *)lin, (real*)lout,
462 /*flags*/ fftwflags);
463 } else {
464 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
465 /*howmany*/ 0, /*howmany_dims*/0 ,
466 (FFTW(complex) *)lin, (FFTW(complex) *)lout,
467 /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
469 FFTW_UNLOCK;
471 if (!plan->p3d) { /* for decomposition and if 3d plan did not work */
472 #endif /* GMX_FFT_FFTW3 */
473 for (s=0;s<3;s++) {
474 if (debug)
476 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
477 s,rC[s],M[s],pK[s],C[s],lsize);
479 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
480 gmx_fft_init_many_1d_real( &plan->p1d[s], rC[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
481 } else {
482 gmx_fft_init_many_1d ( &plan->p1d[s], C[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
485 #ifdef GMX_FFT_FFTW3
487 #endif
488 if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
489 plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
490 } else {
491 plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
493 #ifdef FFT5D_MPI_TRANSPOSE
494 FFTW_LOCK
495 for (s=0;s<2;s++) {
496 if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
497 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
498 else
499 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
501 FFTW_UNLOCK
502 #endif
505 plan->lin=lin;
506 plan->lout=lout;
508 plan->NG=NG;plan->MG=MG;plan->KG=KG;
510 for (s=0;s<3;s++) {
511 plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
512 plan->oM[s]=oM[s];plan->oK[s]=oK[s];
513 plan->C[s]=C[s];plan->rC[s]=rC[s];
514 plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
516 for (s=0;s<2;s++) {
517 plan->P[s]=nP[s];plan->coor[s]=prank[s];
520 /* plan->fftorder=fftorder;
521 plan->direction=direction;
522 plan->realcomplex=realcomplex;
524 plan->flags=flags;
525 *rlin=lin;
526 *rlout=lout;
527 return plan;
531 enum order {
532 XYZ,
533 XZY,
534 YXZ,
535 YZX,
536 ZXY,
542 /*here x,y,z and N,M,K is in rotated coordinate system!!
543 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
544 maxN,maxM,maxK is max size of local data
545 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
546 NG, MG, KG is size of global data*/
547 static void splitaxes(t_complex* lout,const t_complex* lin,
548 int maxN,int maxM,int maxK, int pN, int pM, int pK,
549 int P,int NG,int *N, int* oN) {
550 int x,y,z,i;
551 int in_i,out_i,in_z,out_z,in_y,out_y;
553 for (i=0;i<P;i++) { /*index cube along long axis*/
554 in_i = i*maxN*maxM*maxK;
555 out_i = oN[i];
556 for (z=0;z<pK;z++) { /*3. z l*/
557 in_z = in_i + z*maxN*maxM;
558 out_z = out_i + z*NG*pM;
559 for (y=0;y<pM;y++) { /*2. y k*/
560 in_y = in_z + y*maxN;
561 out_y = out_z + y*NG;
562 for (x=0;x<N[i];x++) { /*1. x j*/
563 lout[in_y+x] = lin[out_y+x];
564 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
565 /*before split data contiguos - thus if different processor get different amount oN is different*/
572 /*make axis contiguous again (after AllToAll) and also do local transpose*/
573 /*transpose mayor and major dimension
574 variables see above
575 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
576 N,M,K local dimensions
577 KG global size*/
578 static void joinAxesTrans13(t_complex* lin,const t_complex* lout,
579 int maxN,int maxM,int maxK,int pN, int pM, int pK,
580 int P,int KG, int* K, int* oK)
582 int i,x,y,z;
583 int in_i,out_i,in_x,out_x,in_z,out_z;
585 for (i=0;i<P;i++) { /*index cube along long axis*/
586 in_i = oK[i];
587 out_i = i*maxM*maxN*maxK;
588 for (x=0;x<pN;x++) { /*1.j*/
589 in_x = in_i + x*KG*pM;
590 out_x = out_i + x;
591 for (z=0;z<K[i];z++) { /*3.l*/
592 in_z = in_x + z;
593 out_z = out_x + z*maxM*maxN;
594 for (y=0;y<pM;y++) { /*2.k*/
595 lin[in_z+y*KG] = lout[out_z+y*maxN];
602 /*make axis contiguous again (after AllToAll) and also do local transpose
603 tranpose mayor and middle dimension
604 variables see above
605 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
606 N,M,K local size
607 MG, global size*/
608 static void joinAxesTrans12(t_complex* lin,const t_complex* lout,int maxN,int maxM,int maxK,int pN, int pM, int pK,
609 int P,int MG, int* M, int* oM) {
610 int i,z,y,x;
611 int in_i,out_i,in_z,out_z,in_x,out_x;
613 for (i=0;i<P;i++) { /*index cube along long axis*/
614 in_i = oM[i];
615 out_i = i*maxM*maxN*maxK;
616 for (z=0;z<pK;z++) {
617 in_z = in_i + z*MG*pN;
618 out_z = out_i + z*maxM*maxN;
619 for (x=0;x<pN;x++) {
620 in_x = in_z + x*MG;
621 out_x = out_z + x;
622 for (y=0;y<M[i];y++) {
623 lin[in_x+y] = lout[out_x+y*maxN];
631 static void rotate(int x[]) {
632 int t=x[0];
633 /* x[0]=x[2];
634 x[2]=x[1];
635 x[1]=t;*/
636 x[0]=x[1];
637 x[1]=x[2];
638 x[2]=t;
641 /*compute the offset to compare or print transposed local data in original input coordinates
642 xs matrix dimension size, xl dimension length, xc decomposition offset
643 s: step in computation = number of transposes*/
644 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
645 /* int direction = plan->direction;
646 int fftorder = plan->fftorder;*/
648 int o;
649 int pos[3],i;
650 int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
651 *C=plan->C, *rC=plan->rC;
653 NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
655 if (!(plan->flags&FFT5D_ORDER_YZ)) {
656 switch (s) {
657 case 0: o=XYZ; break;
658 case 1: o=ZYX; break;
659 case 2: o=YZX; break;
660 default: assert(0);
662 } else {
663 switch (s) {
664 case 0: o=XYZ; break;
665 case 1: o=YXZ; break;
666 case 2: o=ZXY; break;
667 default: assert(0);
671 switch (o) {
672 case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
673 case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
674 case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
675 case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
676 case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
677 case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
679 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
681 /*xs, xl give dimension size and data length in local transposed coordinate system
682 for 0(/1/2): x(/y/z) in original coordinate system*/
683 for (i=0;i<3;i++) {
684 switch (pos[i]) {
685 case 1: xs[i]=1; xc[i]=0; xl[i]=C[s];break;
686 case 2: xs[i]=C[s]; xc[i]=oM[s]; xl[i]=pM[s];break;
687 case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
690 /*input order is different for test program to match FFTW order
691 (important for complex to real)*/
692 if (plan->flags&FFT5D_BACKWARD) {
693 rotate(xs);
694 rotate(xl);
695 rotate(xc);
696 rotate(NG);
697 if (plan->flags&FFT5D_ORDER_YZ) {
698 rotate(xs);
699 rotate(xl);
700 rotate(xc);
701 rotate(NG);
704 if (plan->flags&FFT5D_REALCOMPLEX && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || (plan->flags&FFT5D_BACKWARD && s==2))) {
705 xl[0] = rC[s];
709 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
710 int x,y,z,l;
711 int *coor = plan->coor;
712 int xs[3],xl[3],xc[3],NG[3];
713 int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
714 compute_offsets(plan,xs,xl,xc,NG,s);
715 fprintf(debug,txt,coor[0],coor[1],s);
716 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
717 for(z=0;z<xl[2];z++) {
718 for(y=0;y<xl[1];y++) {
719 fprintf(debug,"%d %d: ",coor[0],coor[1]);
720 for (x=0;x<xl[0];x++) {
721 for (l=0;l<ll;l++) {
722 fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
724 fprintf(debug,",");
726 fprintf(debug,"\n");
731 void fft5d_execute(fft5d_plan plan,fft5d_time times) {
732 t_complex *lin = plan->lin;
733 t_complex *lout = plan->lout;
735 gmx_fft_t *p1d=plan->p1d;
736 #ifdef FFT5D_MPI_TRANSPOSE
737 FFTW(plan) *mpip=plan->mpip;
738 #endif
739 #ifdef GMX_MPI
740 MPI_Comm *cart=plan->cart;
741 #endif
743 double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
744 int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
745 *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
746 int s=0;
749 #ifdef GMX_FFT_FFTW3
750 if (plan->p3d) {
751 if (times!=0)
752 time=MPI_Wtime();
753 FFTW(execute)(plan->p3d);
754 if (times!=0)
755 times->fft+=MPI_Wtime()-time;
756 return;
758 #endif
760 /*lin: x,y,z*/
761 if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: copy in lin\n", s, plan);
762 for (s=0;s<2;s++) {
763 if (times!=0)
764 time=MPI_Wtime();
766 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0) {
767 gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
768 } else {
769 gmx_fft_many_1d( p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin,lout);
771 if (times!=0)
772 time_fft+=MPI_Wtime()-time;
774 if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
776 #ifdef GMX_MPI
777 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] !=0 && P[s]>1 )
779 if (times!=0)
780 time=MPI_Wtime();
781 /*prepare for AllToAll
782 1. (most outer) axes (x) is split into P[s] parts of size N[s]
783 for sending*/
784 splitaxes(lin,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s]);
786 if (times!=0)
788 time_local+=MPI_Wtime()-time;
790 /*send, recv*/
791 time=MPI_Wtime();
794 #ifdef FFT5D_MPI_TRANSPOSE
795 FFTW(execute)(mpip[s]);
796 #else
797 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
798 MPI_Alltoall(lin,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
799 else
800 MPI_Alltoall(lin,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
801 #endif /*FFT5D_MPI_TRANSPOSE*/
802 if (times!=0)
803 time_mpi[s]=MPI_Wtime()-time;
805 #endif /*GMX_MPI*/
808 if (times!=0)
809 time=MPI_Wtime();
810 /*bring back in matrix form
811 thus make new 1. axes contiguos
812 also local transpose 1 and 2/3 */
813 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
814 joinAxesTrans13(lin,lout,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
815 else
816 joinAxesTrans12(lin,lout,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
817 if (times!=0)
818 time_local+=MPI_Wtime()-time;
820 if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
822 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
825 if (times!=0)
826 time=MPI_Wtime();
827 if (plan->flags&FFT5D_INPLACE) lout=lin;
828 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
829 gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
830 } else {
831 gmx_fft_many_1d( p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin,lout);
834 if (times!=0)
835 time_fft+=MPI_Wtime()-time;
836 if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
837 /*if (debug) print_localdata(lout, "%d %d: FFT in y\n", N1, M, K0, YZX, coor);*/
839 if (times!=0) {
840 times->fft+=time_fft;
841 times->local+=time_local;
842 times->mpi2+=time_mpi[1];
843 times->mpi1+=time_mpi[0];
847 void fft5d_destroy(fft5d_plan plan) {
848 int s;
849 for (s=0;s<3;s++) {
850 gmx_many_fft_destroy(plan->p1d[s]);
851 if (plan->iNin[s]) {
852 free(plan->iNin[s]);
853 plan->iNin[s]=0;
855 if (plan->oNin[s]) {
856 free(plan->oNin[s]);
857 plan->oNin[s]=0;
859 if (plan->iNout[s]) {
860 free(plan->iNout[s]);
861 plan->iNout[s]=0;
863 if (plan->oNout[s]) {
864 free(plan->oNout[s]);
865 plan->oNout[s]=0;
868 #ifdef GMX_FFT_FFTW3
869 FFTW_LOCK;
870 #ifdef FFT5D_MPI_TRANSPOS
871 for (s=0;s<2;s++)
872 FFTW(destroy_plan)(plan->mpip[s]);
873 #endif /* FFT5D_MPI_TRANSPOS */
874 #endif /* GMX_FFT_FFTW3 */
876 /*We can't free lin/lout here - is allocated by gmx_calloc_aligned which can't be freed*/
879 #ifdef FFT5D_THREADS
880 FFTW(cleanup_threads)();
881 #endif
883 free(plan);
886 /*Is this better than direct access of plan? enough data?
887 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
888 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
889 *N1=plan->N[0];
890 *M0=plan->M[0];
891 *K1=plan->K[0];
892 *K0=plan->N[1];
894 *coor=plan->coor;
898 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
899 of processor dimensions*/
900 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout) {
901 MPI_Comm cart[2]={0};
902 #ifdef GMX_MPI
903 int size=1,prank=0;
904 int P[2];
905 int coor[2];
906 int wrap[]={0,0};
907 MPI_Comm gcart;
908 int rdim1[] = {0,1}, rdim2[] = {1,0};
910 MPI_Comm_size(comm,&size);
911 MPI_Comm_rank(comm,&prank);
913 if (P0==0) P0 = lfactor(size);
914 if (size%P0!=0) {
915 if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
916 P0 = lfactor(size);
919 P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
921 /*Difference between x-y-z regarding 2d decomposition is whether they are
922 distributed along axis 1, 2 or both*/
924 MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
925 MPI_Cart_get(gcart,2,P,wrap,coor);
926 MPI_Cart_sub(gcart, rdim1 , &cart[0]);
927 MPI_Cart_sub(gcart, rdim2 , &cart[1]);
928 #endif
929 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout);
934 /*prints in original coordinate system of data (as the input to FFT)*/
935 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
936 int xs[3],xl[3],xc[3],NG[3];
937 int x,y,z,l;
938 int *coor = plan->coor;
939 int ll=2; /*compare ll values per element (has to be 2 for complex)*/
940 if (plan->flags&FFT5D_REALCOMPLEX && plan->flags&FFT5D_BACKWARD)
942 ll=1;
945 compute_offsets(plan,xs,xl,xc,NG,2);
946 if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
947 for (z=0;z<xl[2];z++) {
948 for(y=0;y<xl[1];y++) {
949 if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
950 for (x=0;x<xl[0];x++) {
951 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
952 real a,b;
953 a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
954 if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
955 if (!bothLocal)
956 b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
957 else
958 b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
959 if (plan->flags&FFT5D_DEBUG) {
960 printf("%f %f, ",a,b);
961 } else {
962 if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
963 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
965 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
968 if (plan->flags&FFT5D_DEBUG) printf(",");
970 if (plan->flags&FFT5D_DEBUG) printf("\n");