1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
13 #define GMX_PARALLEL_ENV_INITIALIZED 1
16 #define GMX_PARALLEL_ENV_INITIALIZED gmx_parallel_env_initialized()
35 #ifndef __FLT_EPSILON__
36 #define __FLT_EPSILON__ FLT_EPSILON
37 #define __DBL_EPSILON__ DBL_EPSILON
44 #include "gmx_fatal.h"
49 /* none of the fftw3 calls, except execute(), are thread-safe, so
50 we need to serialize them with this mutex. */
51 static tMPI_Thread_mutex_t big_fftw_mutex
=TMPI_THREAD_MUTEX_INITIALIZER
;
53 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex);
54 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex);
55 #else /* GMX_THREADS */
58 #endif /* GMX_THREADS */
59 #endif /* GMX_FFT_FFTW3 */
61 static double fft5d_fmax(double a
, double b
){
65 /* largest factor smaller than sqrt */
66 static int lfactor(int z
) {
73 static int l2factor(int z
) {
80 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
81 static int lpfactor(int z
) {
84 return fft5d_fmax(lpfactor(f
),lpfactor(z
/f
));
88 #ifdef HAVE_GETTIMEOFDAY
93 return tv
.tv_sec
+tv
.tv_usec
*1e-6;
102 static int vmax(int* a
, int s
) {
106 if (a
[i
]>max
) max
=a
[i
];
112 copied here from fftgrid, because:
113 1. function there not publically available
114 2. not sure whether we keep fftgrid
115 3. less dependencies for fft5d
117 Only used for non-fftw case
120 gmx_calloc_aligned(size_t size
)
124 /*We initialize by zero for Valgrind
125 For non-divisible case we communicate more than the data.
126 If we don't initialize the data we communicate uninitialized data*/
127 p0
= calloc(size
+32,1);
131 gmx_fatal(FARGS
,"Failed to allocated %u bytes of aligned memory.",size
+32);
134 p
= (void *) (((size_t) p0
+ 32) & (~((size_t) 31)));
136 /* Yeah, yeah, we cannot free this pointer, but who cares... */
141 /* NxMxK the size of the data
142 * comm communicator to use for fft5d
143 * P0 number of processor in 1st axes (can be null for automatic)
144 * lin is allocated by fft5d because size of array is only known after planning phase */
145 fft5d_plan
fft5d_plan_3d(int NG
, int MG
, int KG
, MPI_Comm comm
[2], int flags
, t_complex
** rlin
, t_complex
** rlout
)
148 int P
[2],bMaster
,prank
[2],i
;
150 int *N0
=0, *N1
=0, *M0
=0, *M1
=0, *K0
=0, *K1
=0, *oN0
=0, *oN1
=0, *oM0
=0, *oM1
=0, *oK0
=0, *oK1
=0;
151 int N
[3],M
[3],K
[3],pN
[3],pM
[3],pK
[3],oM
[3],oK
[3],*iNin
[3]={0},*oNin
[3]={0},*iNout
[3]={0},*oNout
[3]={0};
152 int C
[3],rC
[3],nP
[2];
154 t_complex
*lin
=0,*lout
=0;
158 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
160 if (GMX_PARALLEL_ENV_INITIALIZED
&& comm
[0] != 0)
162 MPI_Comm_size(comm
[0],&P
[0]);
163 MPI_Comm_rank(comm
[0],&prank
[0]);
172 if (GMX_PARALLEL_ENV_INITIALIZED
&& comm
[1] != 0)
174 MPI_Comm_size(comm
[1],&P
[1]);
175 MPI_Comm_rank(comm
[1],&prank
[1]);
184 bMaster
=(prank
[0]==0&&prank
[1]==0);
189 fprintf(debug
,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
190 P
[0],P
[1],prank
[0],prank
[1]);
195 fprintf(debug
,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
196 NG
,MG
,KG
,P
[0],P
[1],(flags
&FFT5D_REALCOMPLEX
)>0,(flags
&FFT5D_BACKWARD
)>0,(flags
&FFT5D_ORDER_YZ
)>0,(flags
&FFT5D_DEBUG
)>0);
197 /* The check below is not correct, one prime factor 11 or 13 is ok.
198 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
199 printf("WARNING: FFT very slow with prime factors larger 7\n");
200 printf("Change FFT size or in case you cannot change it look at\n");
201 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
206 if (NG
==0 || MG
==0 || KG
==0) {
207 if (bMaster
) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
211 rNG
=NG
;rMG
=MG
;rKG
=KG
;
213 if (flags
&FFT5D_REALCOMPLEX
) {
214 if (!(flags
&FFT5D_BACKWARD
)) NG
= NG
/2+1;
216 if (!(flags
&FFT5D_ORDER_YZ
)) MG
=MG
/2+1;
222 /*for transpose we need to know the size for each processor not only our own size*/
224 N0
= (int*)malloc(P
[0]*sizeof(int)); N1
= (int*)malloc(P
[1]*sizeof(int));
225 M0
= (int*)malloc(P
[0]*sizeof(int)); M1
= (int*)malloc(P
[1]*sizeof(int));
226 K0
= (int*)malloc(P
[0]*sizeof(int)); K1
= (int*)malloc(P
[1]*sizeof(int));
227 oN0
= (int*)malloc(P
[0]*sizeof(int));oN1
= (int*)malloc(P
[1]*sizeof(int));
228 oM0
= (int*)malloc(P
[0]*sizeof(int));oM1
= (int*)malloc(P
[1]*sizeof(int));
229 oK0
= (int*)malloc(P
[0]*sizeof(int));oK1
= (int*)malloc(P
[1]*sizeof(int));
235 oN0
[i
]=i
*ceil((double)NG
/P
[0]);
236 oM0
[i
]=i
*ceil((double)MG
/P
[0]);
237 oK0
[i
]=i
*ceil((double)KG
/P
[0]);
247 oN1
[i
]=i
*ceil((double)NG
/P
[1]);
248 oM1
[i
]=i
*ceil((double)MG
/P
[1]);
249 oK1
[i
]=i
*ceil((double)KG
/P
[1]);
256 for (i
=0;i
<P
[0]-1;i
++)
258 N0
[i
]=oN0
[i
+1]-oN0
[i
];
259 M0
[i
]=oM0
[i
+1]-oM0
[i
];
260 K0
[i
]=oK0
[i
+1]-oK0
[i
];
262 N0
[P
[0]-1]=NG
-oN0
[P
[0]-1];
263 M0
[P
[0]-1]=MG
-oM0
[P
[0]-1];
264 K0
[P
[0]-1]=KG
-oK0
[P
[0]-1];
265 for (i
=0;i
<P
[1]-1;i
++)
267 N1
[i
]=oN1
[i
+1]-oN1
[i
];
268 M1
[i
]=oM1
[i
+1]-oM1
[i
];
269 K1
[i
]=oK1
[i
+1]-oK1
[i
];
271 N1
[P
[1]-1]=NG
-oN1
[P
[1]-1];
272 M1
[P
[1]-1]=MG
-oM1
[P
[1]-1];
273 K1
[P
[1]-1]=KG
-oK1
[P
[1]-1];
275 /* for step 1-3 the local N,M,K sizes of the transposed system
276 C: contiguous dimension, and nP: number of processor in subcommunicator
280 pM
[0] = M0
[prank
[0]];
281 oM
[0] = oM0
[prank
[0]];
282 pK
[0] = K1
[prank
[1]];
283 oK
[0] = oK1
[prank
[1]];
286 if (!(flags
&FFT5D_ORDER_YZ
)) {
287 N
[0] = vmax(N1
,P
[1]);
289 K
[0] = vmax(K1
,P
[1]);
290 pN
[0] = N1
[prank
[1]];
296 N
[1] = vmax(K0
,P
[0]);
297 pN
[1] = K0
[prank
[0]];
302 M
[1] = vmax(M0
,P
[0]);
303 pM
[1] = M0
[prank
[0]];
304 oM
[1] = oM0
[prank
[0]];
306 pK
[1] = N1
[prank
[1]];
307 oK
[1] = oN1
[prank
[1]];
313 M
[2] = vmax(K0
,P
[0]);
314 pM
[2] = K0
[prank
[0]];
315 oM
[2] = oK0
[prank
[0]];
316 K
[2] = vmax(N1
,P
[1]);
317 pK
[2] = N1
[prank
[1]];
318 oK
[2] = oN1
[prank
[1]];
319 free(N0
); free(oN0
); /*these are not used for this order*/
320 free(M1
); free(oM1
); /*the rest is freed in destroy*/
322 N
[0] = vmax(N0
,P
[0]);
323 M
[0] = vmax(M0
,P
[0]);
325 pN
[0] = N0
[prank
[0]];
331 N
[1] = vmax(M1
,P
[1]);
332 pN
[1] = M1
[prank
[1]];
338 pM
[1] = N0
[prank
[0]];
339 oM
[1] = oN0
[prank
[0]];
340 K
[1] = vmax(K1
,P
[1]);
341 pK
[1] = K1
[prank
[1]];
342 oK
[1] = oK1
[prank
[1]];
348 M
[2] = vmax(N0
,P
[0]);
349 pM
[2] = N0
[prank
[0]];
350 oM
[2] = oN0
[prank
[0]];
351 K
[2] = vmax(M1
,P
[1]);
352 pK
[2] = M1
[prank
[1]];
353 oK
[2] = oM1
[prank
[1]];
354 free(N1
); free(oN1
); /*these are not used for this order*/
355 free(K0
); free(oK0
); /*the rest is freed in destroy*/
359 Difference between x-y-z regarding 2d decomposition is whether they are
360 distributed along axis 1, 2 or both
363 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
364 lsize
= fft5d_fmax(N
[0]*M
[0]*K
[0]*nP
[0],fft5d_fmax(N
[1]*M
[1]*K
[1]*nP
[1],C
[2]*M
[2]*K
[2]));
365 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
366 if (!(flags
&FFT5D_NOMALLOC
)) {
367 lin
= (t_complex
*)gmx_calloc_aligned(sizeof(t_complex
) * lsize
);
368 lout
= (t_complex
*)gmx_calloc_aligned(sizeof(t_complex
) * lsize
);
374 plan
= (fft5d_plan
)calloc(1,sizeof(struct fft5d_plan_t
));
377 #ifdef FFT5D_THREADS /*requires fftw with openmp and openmp*/
378 FFTW(init_threads
)();
384 nthreads
= omp_get_num_threads();
387 printf("Running on %d threads\n",nthreads
);
388 FFTW(plan_with_nthreads
)(nthreads
);
392 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but insead only 1D plans */
393 if ((!(flags
&FFT5D_INPLACE
)) && (!(P
[0]>1 || P
[1]>1))) { /*don't do 3d plan in parallel or if in_place requested */
394 int fftwflags
=FFTW_DESTROY_INPUT
;
396 int inNG
=NG
,outMG
=MG
,outKG
=KG
;
399 if (!(flags
&FFT5D_NOMEASURE
)) fftwflags
|=FFTW_MEASURE
;
400 if (flags
&FFT5D_REALCOMPLEX
) {
401 if (!(flags
&FFT5D_BACKWARD
)) { /*input pointer is not complex*/
403 } else { /*output pointer is not complex*/
404 if (!(flags
&FFT5D_ORDER_YZ
)) outMG
*=2;
409 if (!(flags
&FFT5D_BACKWARD
)) {
414 dims
[0].is
= inNG
*MG
; /*N M K*/
417 if (!(flags
&FFT5D_ORDER_YZ
)) {
418 dims
[0].os
= MG
; /*M K N*/
422 dims
[0].os
= 1; /*K N M*/
427 if (!(flags
&FFT5D_ORDER_YZ
)) {
436 dims
[0].os
= outMG
*KG
;
448 dims
[0].os
= outKG
*NG
;
453 if ((flags
&FFT5D_REALCOMPLEX
) && !(flags
&FFT5D_BACKWARD
)) {
454 plan
->p3d
= FFTW(plan_guru_dft_r2c
)(/*rank*/ 3, dims
,
455 /*howmany*/ 0, /*howmany_dims*/0 ,
456 (real
*)lin
, (FFTW(complex) *)lout
,
457 /*flags*/ fftwflags
);
458 } else if ((flags
&FFT5D_REALCOMPLEX
) && (flags
&FFT5D_BACKWARD
)) {
459 plan
->p3d
= FFTW(plan_guru_dft_c2r
)(/*rank*/ 3, dims
,
460 /*howmany*/ 0, /*howmany_dims*/0 ,
461 (FFTW(complex) *)lin
, (real
*)lout
,
462 /*flags*/ fftwflags
);
464 plan
->p3d
= FFTW(plan_guru_dft
)(/*rank*/ 3, dims
,
465 /*howmany*/ 0, /*howmany_dims*/0 ,
466 (FFTW(complex) *)lin
, (FFTW(complex) *)lout
,
467 /*sign*/ (flags
&FFT5D_BACKWARD
)?1:-1, /*flags*/ fftwflags
);
471 if (!plan
->p3d
) { /* for decomposition and if 3d plan did not work */
472 #endif /* GMX_FFT_FFTW3 */
476 fprintf(debug
,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
477 s
,rC
[s
],M
[s
],pK
[s
],C
[s
],lsize
);
479 if ((flags
&FFT5D_REALCOMPLEX
) && ((!(flags
&FFT5D_BACKWARD
) && s
==0) || ((flags
&FFT5D_BACKWARD
) && s
==2))) {
480 gmx_fft_init_many_1d_real( &plan
->p1d
[s
], rC
[s
], pM
[s
]*pK
[s
], (flags
&FFT5D_NOMEASURE
)?GMX_FFT_FLAG_CONSERVATIVE
:0 );
482 gmx_fft_init_many_1d ( &plan
->p1d
[s
], C
[s
], pM
[s
]*pK
[s
], (flags
&FFT5D_NOMEASURE
)?GMX_FFT_FLAG_CONSERVATIVE
:0 );
488 if ((flags
&FFT5D_ORDER_YZ
)) { /*plan->cart is in the order of transposes */
489 plan
->cart
[0]=comm
[0]; plan
->cart
[1]=comm
[1];
491 plan
->cart
[1]=comm
[0]; plan
->cart
[0]=comm
[1];
493 #ifdef FFT5D_MPI_TRANSPOSE
496 if ((s
==0 && !(flags
&FFT5D_ORDER_YZ
)) || (s
==1 && (flags
&FFT5D_ORDER_YZ
)))
497 plan
->mpip
[s
] = FFTW(mpi_plan_many_transpose
)(nP
[s
], nP
[s
], N
[s
]*K
[s
]*pM
[s
]*2, 1, 1, (real
*)lin
, (real
*)lout
, plan
->cart
[s
], FFTW_PATIENT
);
499 plan
->mpip
[s
] = FFTW(mpi_plan_many_transpose
)(nP
[s
], nP
[s
], N
[s
]*pK
[s
]*M
[s
]*2, 1, 1, (real
*)lin
, (real
*)lout
, plan
->cart
[s
], FFTW_PATIENT
);
508 plan
->NG
=NG
;plan
->MG
=MG
;plan
->KG
=KG
;
511 plan
->N
[s
]=N
[s
];plan
->M
[s
]=M
[s
];plan
->K
[s
]=K
[s
];plan
->pN
[s
]=pN
[s
];plan
->pM
[s
]=pM
[s
];plan
->pK
[s
]=pK
[s
];
512 plan
->oM
[s
]=oM
[s
];plan
->oK
[s
]=oK
[s
];
513 plan
->C
[s
]=C
[s
];plan
->rC
[s
]=rC
[s
];
514 plan
->iNin
[s
]=iNin
[s
];plan
->oNin
[s
]=oNin
[s
];plan
->iNout
[s
]=iNout
[s
];plan
->oNout
[s
]=oNout
[s
];
517 plan
->P
[s
]=nP
[s
];plan
->coor
[s
]=prank
[s
];
520 /* plan->fftorder=fftorder;
521 plan->direction=direction;
522 plan->realcomplex=realcomplex;
542 /*here x,y,z and N,M,K is in rotated coordinate system!!
543 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
544 maxN,maxM,maxK is max size of local data
545 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
546 NG, MG, KG is size of global data*/
547 static void splitaxes(t_complex
* lout
,const t_complex
* lin
,
548 int maxN
,int maxM
,int maxK
, int pN
, int pM
, int pK
,
549 int P
,int NG
,int *N
, int* oN
) {
551 int in_i
,out_i
,in_z
,out_z
,in_y
,out_y
;
553 for (i
=0;i
<P
;i
++) { /*index cube along long axis*/
554 in_i
= i
*maxN
*maxM
*maxK
;
556 for (z
=0;z
<pK
;z
++) { /*3. z l*/
557 in_z
= in_i
+ z
*maxN
*maxM
;
558 out_z
= out_i
+ z
*NG
*pM
;
559 for (y
=0;y
<pM
;y
++) { /*2. y k*/
560 in_y
= in_z
+ y
*maxN
;
561 out_y
= out_z
+ y
*NG
;
562 for (x
=0;x
<N
[i
];x
++) { /*1. x j*/
563 lout
[in_y
+x
] = lin
[out_y
+x
];
564 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
565 /*before split data contiguos - thus if different processor get different amount oN is different*/
572 /*make axis contiguous again (after AllToAll) and also do local transpose*/
573 /*transpose mayor and major dimension
575 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
576 N,M,K local dimensions
578 static void joinAxesTrans13(t_complex
* lin
,const t_complex
* lout
,
579 int maxN
,int maxM
,int maxK
,int pN
, int pM
, int pK
,
580 int P
,int KG
, int* K
, int* oK
)
583 int in_i
,out_i
,in_x
,out_x
,in_z
,out_z
;
585 for (i
=0;i
<P
;i
++) { /*index cube along long axis*/
587 out_i
= i
*maxM
*maxN
*maxK
;
588 for (x
=0;x
<pN
;x
++) { /*1.j*/
589 in_x
= in_i
+ x
*KG
*pM
;
591 for (z
=0;z
<K
[i
];z
++) { /*3.l*/
593 out_z
= out_x
+ z
*maxM
*maxN
;
594 for (y
=0;y
<pM
;y
++) { /*2.k*/
595 lin
[in_z
+y
*KG
] = lout
[out_z
+y
*maxN
];
602 /*make axis contiguous again (after AllToAll) and also do local transpose
603 tranpose mayor and middle dimension
605 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
608 static void joinAxesTrans12(t_complex
* lin
,const t_complex
* lout
,int maxN
,int maxM
,int maxK
,int pN
, int pM
, int pK
,
609 int P
,int MG
, int* M
, int* oM
) {
611 int in_i
,out_i
,in_z
,out_z
,in_x
,out_x
;
613 for (i
=0;i
<P
;i
++) { /*index cube along long axis*/
615 out_i
= i
*maxM
*maxN
*maxK
;
617 in_z
= in_i
+ z
*MG
*pN
;
618 out_z
= out_i
+ z
*maxM
*maxN
;
622 for (y
=0;y
<M
[i
];y
++) {
623 lin
[in_x
+y
] = lout
[out_x
+y
*maxN
];
631 static void rotate(int x
[]) {
641 /*compute the offset to compare or print transposed local data in original input coordinates
642 xs matrix dimension size, xl dimension length, xc decomposition offset
643 s: step in computation = number of transposes*/
644 static void compute_offsets(fft5d_plan plan
, int xs
[], int xl
[], int xc
[], int NG
[], int s
) {
645 /* int direction = plan->direction;
646 int fftorder = plan->fftorder;*/
650 int *pM
=plan
->pM
, *pK
=plan
->pK
, *oM
=plan
->oM
, *oK
=plan
->oK
,
651 *C
=plan
->C
, *rC
=plan
->rC
;
653 NG
[0]=plan
->NG
;NG
[1]=plan
->MG
;NG
[2]=plan
->KG
;
655 if (!(plan
->flags
&FFT5D_ORDER_YZ
)) {
657 case 0: o
=XYZ
; break;
658 case 1: o
=ZYX
; break;
659 case 2: o
=YZX
; break;
664 case 0: o
=XYZ
; break;
665 case 1: o
=YXZ
; break;
666 case 2: o
=ZXY
; break;
672 case XYZ
:pos
[0]=1;pos
[1]=2;pos
[2]=3;break;
673 case XZY
:pos
[0]=1;pos
[1]=3;pos
[2]=2;break;
674 case YXZ
:pos
[0]=2;pos
[1]=1;pos
[2]=3;break;
675 case YZX
:pos
[0]=3;pos
[1]=1;pos
[2]=2;break;
676 case ZXY
:pos
[0]=2;pos
[1]=3;pos
[2]=1;break;
677 case ZYX
:pos
[0]=3;pos
[1]=2;pos
[2]=1;break;
679 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
681 /*xs, xl give dimension size and data length in local transposed coordinate system
682 for 0(/1/2): x(/y/z) in original coordinate system*/
685 case 1: xs
[i
]=1; xc
[i
]=0; xl
[i
]=C
[s
];break;
686 case 2: xs
[i
]=C
[s
]; xc
[i
]=oM
[s
]; xl
[i
]=pM
[s
];break;
687 case 3: xs
[i
]=C
[s
]*pM
[s
];xc
[i
]=oK
[s
]; xl
[i
]=pK
[s
];break;
690 /*input order is different for test program to match FFTW order
691 (important for complex to real)*/
692 if (plan
->flags
&FFT5D_BACKWARD
) {
697 if (plan
->flags
&FFT5D_ORDER_YZ
) {
704 if (plan
->flags
&FFT5D_REALCOMPLEX
&& ((!(plan
->flags
&FFT5D_BACKWARD
) && s
==0) || (plan
->flags
&FFT5D_BACKWARD
&& s
==2))) {
709 static void print_localdata(const t_complex
* lin
, const char* txt
, int s
, fft5d_plan plan
) {
711 int *coor
= plan
->coor
;
712 int xs
[3],xl
[3],xc
[3],NG
[3];
713 int ll
=(plan
->flags
&FFT5D_REALCOMPLEX
)?1:2;
714 compute_offsets(plan
,xs
,xl
,xc
,NG
,s
);
715 fprintf(debug
,txt
,coor
[0],coor
[1],s
);
716 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
717 for(z
=0;z
<xl
[2];z
++) {
718 for(y
=0;y
<xl
[1];y
++) {
719 fprintf(debug
,"%d %d: ",coor
[0],coor
[1]);
720 for (x
=0;x
<xl
[0];x
++) {
722 fprintf(debug
,"%f ",((real
*)lin
)[(z
*xs
[2]+y
*xs
[1])*2+(x
*xs
[0])*ll
+l
]);
731 void fft5d_execute(fft5d_plan plan
,fft5d_time times
) {
732 t_complex
*lin
= plan
->lin
;
733 t_complex
*lout
= plan
->lout
;
735 gmx_fft_t
*p1d
=plan
->p1d
;
736 #ifdef FFT5D_MPI_TRANSPOSE
737 FFTW(plan
) *mpip
=plan
->mpip
;
740 MPI_Comm
*cart
=plan
->cart
;
743 double time_fft
=0,time_local
=0,time_mpi
[2]={0},time
=0;
744 int *N
=plan
->N
,*M
=plan
->M
,*K
=plan
->K
,*pN
=plan
->pN
,*pM
=plan
->pM
,*pK
=plan
->pK
,
745 *C
=plan
->C
,*P
=plan
->P
,**iNin
=plan
->iNin
,**oNin
=plan
->oNin
,**iNout
=plan
->iNout
,**oNout
=plan
->oNout
;
753 FFTW(execute
)(plan
->p3d
);
755 times
->fft
+=MPI_Wtime()-time
;
761 if (plan
->flags
&FFT5D_DEBUG
) print_localdata(lin
, "%d %d: copy in lin\n", s
, plan
);
766 if ((plan
->flags
&FFT5D_REALCOMPLEX
) && !(plan
->flags
&FFT5D_BACKWARD
) && s
==0) {
767 gmx_fft_many_1d_real(p1d
[s
],(plan
->flags
&FFT5D_BACKWARD
)?GMX_FFT_COMPLEX_TO_REAL
:GMX_FFT_REAL_TO_COMPLEX
,lin
,lout
);
769 gmx_fft_many_1d( p1d
[s
],(plan
->flags
&FFT5D_BACKWARD
)?GMX_FFT_BACKWARD
:GMX_FFT_FORWARD
, lin
,lout
);
772 time_fft
+=MPI_Wtime()-time
;
774 if (plan
->flags
&FFT5D_DEBUG
) print_localdata(lout
, "%d %d: FFT %d\n", s
, plan
);
777 if (GMX_PARALLEL_ENV_INITIALIZED
&& cart
[s
] !=0 && P
[s
]>1 )
781 /*prepare for AllToAll
782 1. (most outer) axes (x) is split into P[s] parts of size N[s]
784 splitaxes(lin
,lout
,N
[s
],M
[s
],K
[s
], pN
[s
],pM
[s
],pK
[s
],P
[s
],C
[s
],iNout
[s
],oNout
[s
]);
788 time_local
+=MPI_Wtime()-time
;
794 #ifdef FFT5D_MPI_TRANSPOSE
795 FFTW(execute
)(mpip
[s
]);
797 if ((s
==0 && !(plan
->flags
&FFT5D_ORDER_YZ
)) || (s
==1 && (plan
->flags
&FFT5D_ORDER_YZ
)))
798 MPI_Alltoall(lin
,N
[s
]*pM
[s
]*K
[s
]*sizeof(t_complex
)/sizeof(real
),GMX_MPI_REAL
,lout
,N
[s
]*pM
[s
]*K
[s
]*sizeof(t_complex
)/sizeof(real
),GMX_MPI_REAL
,cart
[s
]);
800 MPI_Alltoall(lin
,N
[s
]*M
[s
]*pK
[s
]*sizeof(t_complex
)/sizeof(real
),GMX_MPI_REAL
,lout
,N
[s
]*M
[s
]*pK
[s
]*sizeof(t_complex
)/sizeof(real
),GMX_MPI_REAL
,cart
[s
]);
801 #endif /*FFT5D_MPI_TRANSPOSE*/
803 time_mpi
[s
]=MPI_Wtime()-time
;
810 /*bring back in matrix form
811 thus make new 1. axes contiguos
812 also local transpose 1 and 2/3 */
813 if ((s
==0 && !(plan
->flags
&FFT5D_ORDER_YZ
)) || (s
==1 && (plan
->flags
&FFT5D_ORDER_YZ
)))
814 joinAxesTrans13(lin
,lout
,N
[s
],pM
[s
],K
[s
],pN
[s
],pM
[s
],pK
[s
],P
[s
],C
[s
+1],iNin
[s
+1],oNin
[s
+1]);
816 joinAxesTrans12(lin
,lout
,N
[s
],M
[s
],pK
[s
],pN
[s
],pM
[s
],pK
[s
],P
[s
],C
[s
+1],iNin
[s
+1],oNin
[s
+1]);
818 time_local
+=MPI_Wtime()-time
;
820 if (plan
->flags
&FFT5D_DEBUG
) print_localdata(lin
, "%d %d: tranposed %d\n", s
+1, plan
);
822 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
827 if (plan
->flags
&FFT5D_INPLACE
) lout
=lin
;
828 if ((plan
->flags
&FFT5D_REALCOMPLEX
) && (plan
->flags
&FFT5D_BACKWARD
)) {
829 gmx_fft_many_1d_real(p1d
[s
],(plan
->flags
&FFT5D_BACKWARD
)?GMX_FFT_COMPLEX_TO_REAL
:GMX_FFT_REAL_TO_COMPLEX
,lin
,lout
);
831 gmx_fft_many_1d( p1d
[s
],(plan
->flags
&FFT5D_BACKWARD
)?GMX_FFT_BACKWARD
:GMX_FFT_FORWARD
, lin
,lout
);
835 time_fft
+=MPI_Wtime()-time
;
836 if (plan
->flags
&FFT5D_DEBUG
) print_localdata(lout
, "%d %d: FFT %d\n", s
, plan
);
837 /*if (debug) print_localdata(lout, "%d %d: FFT in y\n", N1, M, K0, YZX, coor);*/
840 times
->fft
+=time_fft
;
841 times
->local
+=time_local
;
842 times
->mpi2
+=time_mpi
[1];
843 times
->mpi1
+=time_mpi
[0];
847 void fft5d_destroy(fft5d_plan plan
) {
850 gmx_many_fft_destroy(plan
->p1d
[s
]);
859 if (plan
->iNout
[s
]) {
860 free(plan
->iNout
[s
]);
863 if (plan
->oNout
[s
]) {
864 free(plan
->oNout
[s
]);
870 #ifdef FFT5D_MPI_TRANSPOS
872 FFTW(destroy_plan
)(plan
->mpip
[s
]);
873 #endif /* FFT5D_MPI_TRANSPOS */
874 #endif /* GMX_FFT_FFTW3 */
876 /*We can't free lin/lout here - is allocated by gmx_calloc_aligned which can't be freed*/
880 FFTW(cleanup_threads
)();
886 /*Is this better than direct access of plan? enough data?
887 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
888 void fft5d_local_size(fft5d_plan plan
,int* N1
,int* M0
,int* K0
,int* K1
,int** coor
) {
898 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
899 of processor dimensions*/
900 fft5d_plan
fft5d_plan_3d_cart(int NG
, int MG
, int KG
, MPI_Comm comm
, int P0
, int flags
, t_complex
** rlin
, t_complex
** rlout
) {
901 MPI_Comm cart
[2]={0};
908 int rdim1
[] = {0,1}, rdim2
[] = {1,0};
910 MPI_Comm_size(comm
,&size
);
911 MPI_Comm_rank(comm
,&prank
);
913 if (P0
==0) P0
= lfactor(size
);
915 if (prank
==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size
,P0
);
919 P
[0] = P0
; P
[1]=size
/P0
; /*number of processors in the two dimensions*/
921 /*Difference between x-y-z regarding 2d decomposition is whether they are
922 distributed along axis 1, 2 or both*/
924 MPI_Cart_create(comm
,2,P
,wrap
,1,&gcart
); /*parameter 4: value 1: reorder*/
925 MPI_Cart_get(gcart
,2,P
,wrap
,coor
);
926 MPI_Cart_sub(gcart
, rdim1
, &cart
[0]);
927 MPI_Cart_sub(gcart
, rdim2
, &cart
[1]);
929 return fft5d_plan_3d(NG
, MG
, KG
, cart
, flags
, rlin
, rlout
);
934 /*prints in original coordinate system of data (as the input to FFT)*/
935 void fft5d_compare_data(const t_complex
* lin
, const t_complex
* in
, fft5d_plan plan
, int bothLocal
, int normalize
) {
936 int xs
[3],xl
[3],xc
[3],NG
[3];
938 int *coor
= plan
->coor
;
939 int ll
=2; /*compare ll values per element (has to be 2 for complex)*/
940 if (plan
->flags
&FFT5D_REALCOMPLEX
&& plan
->flags
&FFT5D_BACKWARD
)
945 compute_offsets(plan
,xs
,xl
,xc
,NG
,2);
946 if (plan
->flags
&FFT5D_DEBUG
) printf("Compare2\n");
947 for (z
=0;z
<xl
[2];z
++) {
948 for(y
=0;y
<xl
[1];y
++) {
949 if (plan
->flags
&FFT5D_DEBUG
) printf("%d %d: ",coor
[0],coor
[1]);
950 for (x
=0;x
<xl
[0];x
++) {
951 for (l
=0;l
<ll
;l
++) { /*loop over real/complex parts*/
953 a
=((real
*)lin
)[(z
*xs
[2]+y
*xs
[1])*2+x
*xs
[0]*ll
+l
];
954 if (normalize
) a
/=plan
->rC
[0]*plan
->rC
[1]*plan
->rC
[2];
956 b
=((real
*)in
)[((z
+xc
[2])*NG
[0]*NG
[1]+(y
+xc
[1])*NG
[0])*2+(x
+xc
[0])*ll
+l
];
958 b
=((real
*)in
)[(z
*xs
[2]+y
*xs
[1])*2+x
*xs
[0]*ll
+l
];
959 if (plan
->flags
&FFT5D_DEBUG
) {
960 printf("%f %f, ",a
,b
);
962 if (fabs(a
-b
)>2*NG
[0]*NG
[1]*NG
[2]*GMX_REAL_EPS
) {
963 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor
[0],coor
[1],x
,y
,z
,a
,b
);
965 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
968 if (plan
->flags
&FFT5D_DEBUG
) printf(",");
970 if (plan
->flags
&FFT5D_DEBUG
) printf("\n");