1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 * This source code is part of
7 * GROningen MAchine for Chemical Simulations
10 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12 * Copyright (c) 2001-2008, The GROMACS development team,
13 * check out http://www.gromacs.org for more information.
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version 2
18 * of the License, or (at your option) any later version.
20 * If you want to redistribute modifications, please consider that
21 * scientific software is very special. Version control is crucial -
22 * bugs must be traceable. We will be happy to consider code for
23 * inclusion in the official distribution, but derived work must not
24 * be called official GROMACS. Details are found in the README & COPYING
25 * files - if they are missing, get the official version at www.gromacs.org.
27 * To help us fund GROMACS development, we humbly ask that you cite
28 * the papers on the package - you can find them in the top README file.
30 * For more info, check our website at http://www.gromacs.org
33 * Groningen Machine for Chemical Simulation
45 #define GMX_PARALLEL_ENV_INITIALIZED 1
48 #define GMX_PARALLEL_ENV_INITIALIZED gmx_parallel_env_initialized()
60 /* requires fftw compiled with openmp */
61 #define FFT5D_FFTW_THREADS
69 #ifndef __FLT_EPSILON__
70 #define __FLT_EPSILON__ FLT_EPSILON
71 #define __DBL_EPSILON__ DBL_EPSILON
78 #include "gmx_fatal.h"
83 /* none of the fftw3 calls, except execute(), are thread-safe, so
84 we need to serialize them with this mutex. */
85 static tMPI_Thread_mutex_t big_fftw_mutex
=TMPI_THREAD_MUTEX_INITIALIZER
;
87 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex);
88 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex);
89 #else /* GMX_THREADS */
92 #endif /* GMX_THREADS */
93 #endif /* GMX_FFT_FFTW3 */
95 static double fft5d_fmax(double a
, double b
){
99 /* largest factor smaller than sqrt */
100 static int lfactor(int z
) {
103 if (z
%i
==0) return i
;
107 static int l2factor(int z
) {
111 if (z
%i
==0) return i
;
114 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
115 static int lpfactor(int z
) {
118 return fft5d_fmax(lpfactor(f
),lpfactor(z
/f
));
122 #ifdef HAVE_GETTIMEOFDAY
123 #include <sys/time.h>
127 return tv
.tv_sec
+tv
.tv_usec
*1e-6;
136 static int vmax(int* a
, int s
) {
140 if (a
[i
]>max
) max
=a
[i
];
146 copied here from fftgrid, because:
147 1. function there not publically available
148 2. not sure whether we keep fftgrid
149 3. less dependencies for fft5d
151 Only used for non-fftw case
154 gmx_calloc_aligned(size_t size
)
158 /*We initialize by zero for Valgrind
159 For non-divisible case we communicate more than the data.
160 If we don't initialize the data we communicate uninitialized data*/
161 p0
= calloc(size
+32,1);
165 gmx_fatal(FARGS
,"Failed to allocated %u bytes of aligned memory.",size
+32);
168 p
= (void *) (((size_t) p0
+ 32) & (~((size_t) 31)));
170 /* Yeah, yeah, we cannot free this pointer, but who cares... */
175 /* NxMxK the size of the data
176 * comm communicator to use for fft5d
177 * P0 number of processor in 1st axes (can be null for automatic)
178 * lin is allocated by fft5d because size of array is only known after planning phase */
179 fft5d_plan
fft5d_plan_3d(int NG
, int MG
, int KG
, MPI_Comm comm
[2], int flags
, t_complex
** rlin
, t_complex
** rlout
)
182 int P
[2],bMaster
,prank
[2],i
;
184 int *N0
=0, *N1
=0, *M0
=0, *M1
=0, *K0
=0, *K1
=0, *oN0
=0, *oN1
=0, *oM0
=0, *oM1
=0, *oK0
=0, *oK1
=0;
185 int N
[3],M
[3],K
[3],pN
[3],pM
[3],pK
[3],oM
[3],oK
[3],*iNin
[3]={0},*oNin
[3]={0},*iNout
[3]={0},*oNout
[3]={0};
186 int C
[3],rC
[3],nP
[2];
188 t_complex
*lin
=0,*lout
=0;
192 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
194 if (GMX_PARALLEL_ENV_INITIALIZED
&& comm
[0] != MPI_COMM_NULL
)
196 MPI_Comm_size(comm
[0],&P
[0]);
197 MPI_Comm_rank(comm
[0],&prank
[0]);
206 if (GMX_PARALLEL_ENV_INITIALIZED
&& comm
[1] != MPI_COMM_NULL
)
208 MPI_Comm_size(comm
[1],&P
[1]);
209 MPI_Comm_rank(comm
[1],&prank
[1]);
218 bMaster
=(prank
[0]==0&&prank
[1]==0);
223 fprintf(debug
,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
224 P
[0],P
[1],prank
[0],prank
[1]);
229 fprintf(debug
,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
230 NG
,MG
,KG
,P
[0],P
[1],(flags
&FFT5D_REALCOMPLEX
)>0,(flags
&FFT5D_BACKWARD
)>0,(flags
&FFT5D_ORDER_YZ
)>0,(flags
&FFT5D_DEBUG
)>0);
231 /* The check below is not correct, one prime factor 11 or 13 is ok.
232 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
233 printf("WARNING: FFT very slow with prime factors larger 7\n");
234 printf("Change FFT size or in case you cannot change it look at\n");
235 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
240 if (NG
==0 || MG
==0 || KG
==0) {
241 if (bMaster
) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
245 rNG
=NG
;rMG
=MG
;rKG
=KG
;
247 if (flags
&FFT5D_REALCOMPLEX
) {
248 if (!(flags
&FFT5D_BACKWARD
)) NG
= NG
/2+1;
250 if (!(flags
&FFT5D_ORDER_YZ
)) MG
=MG
/2+1;
256 /*for transpose we need to know the size for each processor not only our own size*/
258 N0
= (int*)malloc(P
[0]*sizeof(int)); N1
= (int*)malloc(P
[1]*sizeof(int));
259 M0
= (int*)malloc(P
[0]*sizeof(int)); M1
= (int*)malloc(P
[1]*sizeof(int));
260 K0
= (int*)malloc(P
[0]*sizeof(int)); K1
= (int*)malloc(P
[1]*sizeof(int));
261 oN0
= (int*)malloc(P
[0]*sizeof(int));oN1
= (int*)malloc(P
[1]*sizeof(int));
262 oM0
= (int*)malloc(P
[0]*sizeof(int));oM1
= (int*)malloc(P
[1]*sizeof(int));
263 oK0
= (int*)malloc(P
[0]*sizeof(int));oK1
= (int*)malloc(P
[1]*sizeof(int));
269 oN0
[i
]=i
*ceil((double)NG
/P
[0]);
270 oM0
[i
]=i
*ceil((double)MG
/P
[0]);
271 oK0
[i
]=i
*ceil((double)KG
/P
[0]);
281 oN1
[i
]=i
*ceil((double)NG
/P
[1]);
282 oM1
[i
]=i
*ceil((double)MG
/P
[1]);
283 oK1
[i
]=i
*ceil((double)KG
/P
[1]);
290 for (i
=0;i
<P
[0]-1;i
++)
292 N0
[i
]=oN0
[i
+1]-oN0
[i
];
293 M0
[i
]=oM0
[i
+1]-oM0
[i
];
294 K0
[i
]=oK0
[i
+1]-oK0
[i
];
296 N0
[P
[0]-1]=NG
-oN0
[P
[0]-1];
297 M0
[P
[0]-1]=MG
-oM0
[P
[0]-1];
298 K0
[P
[0]-1]=KG
-oK0
[P
[0]-1];
299 for (i
=0;i
<P
[1]-1;i
++)
301 N1
[i
]=oN1
[i
+1]-oN1
[i
];
302 M1
[i
]=oM1
[i
+1]-oM1
[i
];
303 K1
[i
]=oK1
[i
+1]-oK1
[i
];
305 N1
[P
[1]-1]=NG
-oN1
[P
[1]-1];
306 M1
[P
[1]-1]=MG
-oM1
[P
[1]-1];
307 K1
[P
[1]-1]=KG
-oK1
[P
[1]-1];
309 /* for step 1-3 the local N,M,K sizes of the transposed system
310 C: contiguous dimension, and nP: number of processor in subcommunicator
314 pM
[0] = M0
[prank
[0]];
315 oM
[0] = oM0
[prank
[0]];
316 pK
[0] = K1
[prank
[1]];
317 oK
[0] = oK1
[prank
[1]];
320 if (!(flags
&FFT5D_ORDER_YZ
)) {
321 N
[0] = vmax(N1
,P
[1]);
323 K
[0] = vmax(K1
,P
[1]);
324 pN
[0] = N1
[prank
[1]];
330 N
[1] = vmax(K0
,P
[0]);
331 pN
[1] = K0
[prank
[0]];
336 M
[1] = vmax(M0
,P
[0]);
337 pM
[1] = M0
[prank
[0]];
338 oM
[1] = oM0
[prank
[0]];
340 pK
[1] = N1
[prank
[1]];
341 oK
[1] = oN1
[prank
[1]];
347 M
[2] = vmax(K0
,P
[0]);
348 pM
[2] = K0
[prank
[0]];
349 oM
[2] = oK0
[prank
[0]];
350 K
[2] = vmax(N1
,P
[1]);
351 pK
[2] = N1
[prank
[1]];
352 oK
[2] = oN1
[prank
[1]];
353 free(N0
); free(oN0
); /*these are not used for this order*/
354 free(M1
); free(oM1
); /*the rest is freed in destroy*/
356 N
[0] = vmax(N0
,P
[0]);
357 M
[0] = vmax(M0
,P
[0]);
359 pN
[0] = N0
[prank
[0]];
365 N
[1] = vmax(M1
,P
[1]);
366 pN
[1] = M1
[prank
[1]];
372 pM
[1] = N0
[prank
[0]];
373 oM
[1] = oN0
[prank
[0]];
374 K
[1] = vmax(K1
,P
[1]);
375 pK
[1] = K1
[prank
[1]];
376 oK
[1] = oK1
[prank
[1]];
382 M
[2] = vmax(N0
,P
[0]);
383 pM
[2] = N0
[prank
[0]];
384 oM
[2] = oN0
[prank
[0]];
385 K
[2] = vmax(M1
,P
[1]);
386 pK
[2] = M1
[prank
[1]];
387 oK
[2] = oM1
[prank
[1]];
388 free(N1
); free(oN1
); /*these are not used for this order*/
389 free(K0
); free(oK0
); /*the rest is freed in destroy*/
393 Difference between x-y-z regarding 2d decomposition is whether they are
394 distributed along axis 1, 2 or both
397 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
398 lsize
= fft5d_fmax(N
[0]*M
[0]*K
[0]*nP
[0],fft5d_fmax(N
[1]*M
[1]*K
[1]*nP
[1],C
[2]*M
[2]*K
[2]));
399 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
400 if (!(flags
&FFT5D_NOMALLOC
)) {
401 lin
= (t_complex
*)gmx_calloc_aligned(sizeof(t_complex
) * lsize
);
402 lout
= (t_complex
*)gmx_calloc_aligned(sizeof(t_complex
) * lsize
);
408 plan
= (fft5d_plan
)calloc(1,sizeof(struct fft5d_plan_t
));
412 #ifdef FFT5D_FFTW_THREADS
413 FFTW(init_threads
)();
419 nthreads
= omp_get_num_threads();
422 if (prank
[0] == 0 && prank
[1] == 0)
424 printf("Running fftw on %d threads\n",nthreads
);
426 FFTW(plan_with_nthreads
)(nthreads
);
430 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but insead only 1D plans */
431 if ((!(flags
&FFT5D_INPLACE
)) && (!(P
[0]>1 || P
[1]>1))) { /*don't do 3d plan in parallel or if in_place requested */
432 int fftwflags
=FFTW_DESTROY_INPUT
;
434 int inNG
=NG
,outMG
=MG
,outKG
=KG
;
437 if (!(flags
&FFT5D_NOMEASURE
)) fftwflags
|=FFTW_MEASURE
;
438 if (flags
&FFT5D_REALCOMPLEX
) {
439 if (!(flags
&FFT5D_BACKWARD
)) { /*input pointer is not complex*/
441 } else { /*output pointer is not complex*/
442 if (!(flags
&FFT5D_ORDER_YZ
)) outMG
*=2;
447 if (!(flags
&FFT5D_BACKWARD
)) {
452 dims
[0].is
= inNG
*MG
; /*N M K*/
455 if (!(flags
&FFT5D_ORDER_YZ
)) {
456 dims
[0].os
= MG
; /*M K N*/
460 dims
[0].os
= 1; /*K N M*/
465 if (!(flags
&FFT5D_ORDER_YZ
)) {
474 dims
[0].os
= outMG
*KG
;
486 dims
[0].os
= outKG
*NG
;
491 if ((flags
&FFT5D_REALCOMPLEX
) && !(flags
&FFT5D_BACKWARD
)) {
492 plan
->p3d
= FFTW(plan_guru_dft_r2c
)(/*rank*/ 3, dims
,
493 /*howmany*/ 0, /*howmany_dims*/0 ,
494 (real
*)lin
, (FFTW(complex) *)lout
,
495 /*flags*/ fftwflags
);
496 } else if ((flags
&FFT5D_REALCOMPLEX
) && (flags
&FFT5D_BACKWARD
)) {
497 plan
->p3d
= FFTW(plan_guru_dft_c2r
)(/*rank*/ 3, dims
,
498 /*howmany*/ 0, /*howmany_dims*/0 ,
499 (FFTW(complex) *)lin
, (real
*)lout
,
500 /*flags*/ fftwflags
);
502 plan
->p3d
= FFTW(plan_guru_dft
)(/*rank*/ 3, dims
,
503 /*howmany*/ 0, /*howmany_dims*/0 ,
504 (FFTW(complex) *)lin
, (FFTW(complex) *)lout
,
505 /*sign*/ (flags
&FFT5D_BACKWARD
)?1:-1, /*flags*/ fftwflags
);
509 if (!plan
->p3d
) { /* for decomposition and if 3d plan did not work */
510 #endif /* GMX_FFT_FFTW3 */
514 fprintf(debug
,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
515 s
,rC
[s
],M
[s
],pK
[s
],C
[s
],lsize
);
517 if ((flags
&FFT5D_REALCOMPLEX
) && ((!(flags
&FFT5D_BACKWARD
) && s
==0) || ((flags
&FFT5D_BACKWARD
) && s
==2))) {
518 gmx_fft_init_many_1d_real( &plan
->p1d
[s
], rC
[s
], pM
[s
]*pK
[s
], (flags
&FFT5D_NOMEASURE
)?GMX_FFT_FLAG_CONSERVATIVE
:0 );
520 gmx_fft_init_many_1d ( &plan
->p1d
[s
], C
[s
], pM
[s
]*pK
[s
], (flags
&FFT5D_NOMEASURE
)?GMX_FFT_FLAG_CONSERVATIVE
:0 );
526 if ((flags
&FFT5D_ORDER_YZ
)) { /*plan->cart is in the order of transposes */
527 plan
->cart
[0]=comm
[0]; plan
->cart
[1]=comm
[1];
529 plan
->cart
[1]=comm
[0]; plan
->cart
[0]=comm
[1];
531 #ifdef FFT5D_MPI_TRANSPOSE
534 if ((s
==0 && !(flags
&FFT5D_ORDER_YZ
)) || (s
==1 && (flags
&FFT5D_ORDER_YZ
)))
535 plan
->mpip
[s
] = FFTW(mpi_plan_many_transpose
)(nP
[s
], nP
[s
], N
[s
]*K
[s
]*pM
[s
]*2, 1, 1, (real
*)lin
, (real
*)lout
, plan
->cart
[s
], FFTW_PATIENT
);
537 plan
->mpip
[s
] = FFTW(mpi_plan_many_transpose
)(nP
[s
], nP
[s
], N
[s
]*pK
[s
]*M
[s
]*2, 1, 1, (real
*)lin
, (real
*)lout
, plan
->cart
[s
], FFTW_PATIENT
);
546 plan
->NG
=NG
;plan
->MG
=MG
;plan
->KG
=KG
;
549 plan
->N
[s
]=N
[s
];plan
->M
[s
]=M
[s
];plan
->K
[s
]=K
[s
];plan
->pN
[s
]=pN
[s
];plan
->pM
[s
]=pM
[s
];plan
->pK
[s
]=pK
[s
];
550 plan
->oM
[s
]=oM
[s
];plan
->oK
[s
]=oK
[s
];
551 plan
->C
[s
]=C
[s
];plan
->rC
[s
]=rC
[s
];
552 plan
->iNin
[s
]=iNin
[s
];plan
->oNin
[s
]=oNin
[s
];plan
->iNout
[s
]=iNout
[s
];plan
->oNout
[s
]=oNout
[s
];
555 plan
->P
[s
]=nP
[s
];plan
->coor
[s
]=prank
[s
];
558 /* plan->fftorder=fftorder;
559 plan->direction=direction;
560 plan->realcomplex=realcomplex;
580 /*here x,y,z and N,M,K is in rotated coordinate system!!
581 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
582 maxN,maxM,maxK is max size of local data
583 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
584 NG, MG, KG is size of global data*/
585 static void splitaxes(t_complex
* lout
,const t_complex
* lin
,
586 int maxN
,int maxM
,int maxK
, int pN
, int pM
, int pK
,
587 int P
,int NG
,int *N
, int* oN
)
590 int in_i
,out_i
,in_z
,out_z
,in_y
,out_y
;
595 /* In the thread parallel case we want to loop over z and i
596 * in a single for loop to allow for better load balancing.
598 #pragma omp parallel for private(z,in_z,out_z,i,in_i,out_i,y,in_y,out_y,x) schedule(static)
599 for (zi
=0; zi
<pK
*P
; zi
++)
604 for (z
=0; z
<pK
; z
++) /*3. z l*/
610 #ifndef FFT5D_THREADS
611 for (i
=0; i
<P
; i
++) /*index cube along long axis*/
614 in_i
= in_z
+ i
*maxN
*maxM
*maxK
;
615 out_i
= out_z
+ oN
[i
];
616 for (y
=0;y
<pM
;y
++) { /*2. y k*/
617 in_y
= in_i
+ y
*maxN
;
618 out_y
= out_i
+ y
*NG
;
619 for (x
=0;x
<N
[i
];x
++) { /*1. x j*/
620 lout
[in_y
+x
] = lin
[out_y
+x
];
621 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
622 /*before split data contiguos - thus if different processor get different amount oN is different*/
629 /*make axis contiguous again (after AllToAll) and also do local transpose*/
630 /*transpose mayor and major dimension
632 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
633 N,M,K local dimensions
635 static void joinAxesTrans13(t_complex
* lin
,const t_complex
* lout
,
636 int maxN
,int maxM
,int maxK
,int pN
, int pM
, int pK
,
637 int P
,int KG
, int* K
, int* oK
)
640 int in_i
,out_i
,in_x
,out_x
,in_z
,out_z
;
645 /* In the thread parallel case we want to loop over x and i
646 * in a single for loop to allow for better load balancing.
648 #pragma omp parallel for private(x,in_x,out_x,i,in_i,out_i,z,in_z,out_z,y) schedule(static)
649 for (xi
=0; xi
<pN
*P
; xi
++)
654 for (x
=0;x
<pN
;x
++) /*1.j*/
660 #ifndef FFT5D_THREADS
661 for (i
=0;i
<P
;i
++) /*index cube along long axis*/
665 out_i
= out_x
+ i
*maxM
*maxN
*maxK
;
666 for (z
=0;z
<K
[i
];z
++) /*3.l*/
669 out_z
= out_i
+ z
*maxM
*maxN
;
670 for (y
=0;y
<pM
;y
++) { /*2.k*/
671 lin
[in_z
+y
*KG
] = lout
[out_z
+y
*maxN
];
678 /*make axis contiguous again (after AllToAll) and also do local transpose
679 tranpose mayor and middle dimension
681 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
684 static void joinAxesTrans12(t_complex
* lin
,const t_complex
* lout
,int maxN
,int maxM
,int maxK
,int pN
, int pM
, int pK
,
685 int P
,int MG
, int* M
, int* oM
) {
687 int in_i
,out_i
,in_z
,out_z
,in_x
,out_x
;
692 /* In the thread parallel case we want to loop over z and i
693 * in a single for loop to allow for better load balancing.
695 #pragma omp parallel for private(i,in_i,out_i,z,in_z,out_z,in_x,out_x,x,y) schedule(static)
696 for (zi
=0; zi
<pK
*P
; zi
++)
707 #ifndef FFT5D_THREADS
708 for (i
=0; i
<P
; i
++) /*index cube along long axis*/
712 out_i
= out_z
+ i
*maxM
*maxN
*maxK
;
716 for (y
=0;y
<M
[i
];y
++) {
717 lin
[in_x
+y
] = lout
[out_x
+y
*maxN
];
725 static void rotate(int x
[]) {
735 /*compute the offset to compare or print transposed local data in original input coordinates
736 xs matrix dimension size, xl dimension length, xc decomposition offset
737 s: step in computation = number of transposes*/
738 static void compute_offsets(fft5d_plan plan
, int xs
[], int xl
[], int xc
[], int NG
[], int s
) {
739 /* int direction = plan->direction;
740 int fftorder = plan->fftorder;*/
744 int *pM
=plan
->pM
, *pK
=plan
->pK
, *oM
=plan
->oM
, *oK
=plan
->oK
,
745 *C
=plan
->C
, *rC
=plan
->rC
;
747 NG
[0]=plan
->NG
;NG
[1]=plan
->MG
;NG
[2]=plan
->KG
;
749 if (!(plan
->flags
&FFT5D_ORDER_YZ
)) {
751 case 0: o
=XYZ
; break;
752 case 1: o
=ZYX
; break;
753 case 2: o
=YZX
; break;
758 case 0: o
=XYZ
; break;
759 case 1: o
=YXZ
; break;
760 case 2: o
=ZXY
; break;
766 case XYZ
:pos
[0]=1;pos
[1]=2;pos
[2]=3;break;
767 case XZY
:pos
[0]=1;pos
[1]=3;pos
[2]=2;break;
768 case YXZ
:pos
[0]=2;pos
[1]=1;pos
[2]=3;break;
769 case YZX
:pos
[0]=3;pos
[1]=1;pos
[2]=2;break;
770 case ZXY
:pos
[0]=2;pos
[1]=3;pos
[2]=1;break;
771 case ZYX
:pos
[0]=3;pos
[1]=2;pos
[2]=1;break;
773 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
775 /*xs, xl give dimension size and data length in local transposed coordinate system
776 for 0(/1/2): x(/y/z) in original coordinate system*/
779 case 1: xs
[i
]=1; xc
[i
]=0; xl
[i
]=C
[s
];break;
780 case 2: xs
[i
]=C
[s
]; xc
[i
]=oM
[s
]; xl
[i
]=pM
[s
];break;
781 case 3: xs
[i
]=C
[s
]*pM
[s
];xc
[i
]=oK
[s
]; xl
[i
]=pK
[s
];break;
784 /*input order is different for test program to match FFTW order
785 (important for complex to real)*/
786 if (plan
->flags
&FFT5D_BACKWARD
) {
791 if (plan
->flags
&FFT5D_ORDER_YZ
) {
798 if (plan
->flags
&FFT5D_REALCOMPLEX
&& ((!(plan
->flags
&FFT5D_BACKWARD
) && s
==0) || (plan
->flags
&FFT5D_BACKWARD
&& s
==2))) {
803 static void print_localdata(const t_complex
* lin
, const char* txt
, int s
, fft5d_plan plan
) {
805 int *coor
= plan
->coor
;
806 int xs
[3],xl
[3],xc
[3],NG
[3];
807 int ll
=(plan
->flags
&FFT5D_REALCOMPLEX
)?1:2;
808 compute_offsets(plan
,xs
,xl
,xc
,NG
,s
);
809 fprintf(debug
,txt
,coor
[0],coor
[1],s
);
810 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
811 for(z
=0;z
<xl
[2];z
++) {
812 for(y
=0;y
<xl
[1];y
++) {
813 fprintf(debug
,"%d %d: ",coor
[0],coor
[1]);
814 for (x
=0;x
<xl
[0];x
++) {
816 fprintf(debug
,"%f ",((real
*)lin
)[(z
*xs
[2]+y
*xs
[1])*2+(x
*xs
[0])*ll
+l
]);
825 void fft5d_execute(fft5d_plan plan
,fft5d_time times
) {
826 t_complex
*lin
= plan
->lin
;
827 t_complex
*lout
= plan
->lout
;
829 gmx_fft_t
*p1d
=plan
->p1d
;
830 #ifdef FFT5D_MPI_TRANSPOSE
831 FFTW(plan
) *mpip
=plan
->mpip
;
834 MPI_Comm
*cart
=plan
->cart
;
837 double time_fft
=0,time_local
=0,time_mpi
[2]={0},time
=0;
838 int *N
=plan
->N
,*M
=plan
->M
,*K
=plan
->K
,*pN
=plan
->pN
,*pM
=plan
->pM
,*pK
=plan
->pK
,
839 *C
=plan
->C
,*P
=plan
->P
,**iNin
=plan
->iNin
,**oNin
=plan
->oNin
,**iNout
=plan
->iNout
,**oNout
=plan
->oNout
;
847 FFTW(execute
)(plan
->p3d
);
849 times
->fft
+=MPI_Wtime()-time
;
855 if (plan
->flags
&FFT5D_DEBUG
) print_localdata(lin
, "%d %d: copy in lin\n", s
, plan
);
860 if ((plan
->flags
&FFT5D_REALCOMPLEX
) && !(plan
->flags
&FFT5D_BACKWARD
) && s
==0) {
861 gmx_fft_many_1d_real(p1d
[s
],(plan
->flags
&FFT5D_BACKWARD
)?GMX_FFT_COMPLEX_TO_REAL
:GMX_FFT_REAL_TO_COMPLEX
,lin
,lout
);
863 gmx_fft_many_1d( p1d
[s
],(plan
->flags
&FFT5D_BACKWARD
)?GMX_FFT_BACKWARD
:GMX_FFT_FORWARD
, lin
,lout
);
866 time_fft
+=MPI_Wtime()-time
;
868 if (plan
->flags
&FFT5D_DEBUG
) print_localdata(lout
, "%d %d: FFT %d\n", s
, plan
);
871 if (GMX_PARALLEL_ENV_INITIALIZED
&& cart
[s
]!=MPI_COMM_NULL
&& P
[s
]>1)
875 /*prepare for AllToAll
876 1. (most outer) axes (x) is split into P[s] parts of size N[s]
878 splitaxes(lin
,lout
,N
[s
],M
[s
],K
[s
], pN
[s
],pM
[s
],pK
[s
],P
[s
],C
[s
],iNout
[s
],oNout
[s
]);
882 time_local
+=MPI_Wtime()-time
;
888 #ifdef FFT5D_MPI_TRANSPOSE
889 FFTW(execute
)(mpip
[s
]);
891 if ((s
==0 && !(plan
->flags
&FFT5D_ORDER_YZ
)) || (s
==1 && (plan
->flags
&FFT5D_ORDER_YZ
)))
892 MPI_Alltoall(lin
,N
[s
]*pM
[s
]*K
[s
]*sizeof(t_complex
)/sizeof(real
),GMX_MPI_REAL
,lout
,N
[s
]*pM
[s
]*K
[s
]*sizeof(t_complex
)/sizeof(real
),GMX_MPI_REAL
,cart
[s
]);
894 MPI_Alltoall(lin
,N
[s
]*M
[s
]*pK
[s
]*sizeof(t_complex
)/sizeof(real
),GMX_MPI_REAL
,lout
,N
[s
]*M
[s
]*pK
[s
]*sizeof(t_complex
)/sizeof(real
),GMX_MPI_REAL
,cart
[s
]);
895 #endif /*FFT5D_MPI_TRANSPOSE*/
897 time_mpi
[s
]=MPI_Wtime()-time
;
904 /*bring back in matrix form
905 thus make new 1. axes contiguos
906 also local transpose 1 and 2/3 */
907 if ((s
==0 && !(plan
->flags
&FFT5D_ORDER_YZ
)) || (s
==1 && (plan
->flags
&FFT5D_ORDER_YZ
)))
908 joinAxesTrans13(lin
,lout
,N
[s
],pM
[s
],K
[s
],pN
[s
],pM
[s
],pK
[s
],P
[s
],C
[s
+1],iNin
[s
+1],oNin
[s
+1]);
910 joinAxesTrans12(lin
,lout
,N
[s
],M
[s
],pK
[s
],pN
[s
],pM
[s
],pK
[s
],P
[s
],C
[s
+1],iNin
[s
+1],oNin
[s
+1]);
912 time_local
+=MPI_Wtime()-time
;
914 if (plan
->flags
&FFT5D_DEBUG
) print_localdata(lin
, "%d %d: tranposed %d\n", s
+1, plan
);
916 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
921 if (plan
->flags
&FFT5D_INPLACE
) lout
=lin
;
922 if ((plan
->flags
&FFT5D_REALCOMPLEX
) && (plan
->flags
&FFT5D_BACKWARD
)) {
923 gmx_fft_many_1d_real(p1d
[s
],(plan
->flags
&FFT5D_BACKWARD
)?GMX_FFT_COMPLEX_TO_REAL
:GMX_FFT_REAL_TO_COMPLEX
,lin
,lout
);
925 gmx_fft_many_1d( p1d
[s
],(plan
->flags
&FFT5D_BACKWARD
)?GMX_FFT_BACKWARD
:GMX_FFT_FORWARD
, lin
,lout
);
929 time_fft
+=MPI_Wtime()-time
;
930 if (plan
->flags
&FFT5D_DEBUG
) print_localdata(lout
, "%d %d: FFT %d\n", s
, plan
);
931 /*if (debug) print_localdata(lout, "%d %d: FFT in y\n", N1, M, K0, YZX, coor);*/
934 times
->fft
+=time_fft
;
935 times
->local
+=time_local
;
936 times
->mpi2
+=time_mpi
[1];
937 times
->mpi1
+=time_mpi
[0];
941 void fft5d_destroy(fft5d_plan plan
) {
944 gmx_many_fft_destroy(plan
->p1d
[s
]);
953 if (plan
->iNout
[s
]) {
954 free(plan
->iNout
[s
]);
957 if (plan
->oNout
[s
]) {
958 free(plan
->oNout
[s
]);
964 #ifdef FFT5D_MPI_TRANSPOS
966 FFTW(destroy_plan
)(plan
->mpip
[s
]);
967 #endif /* FFT5D_MPI_TRANSPOS */
968 #endif /* GMX_FFT_FFTW3 */
970 /*We can't free lin/lout here - is allocated by gmx_calloc_aligned which can't be freed*/
974 #ifdef FFT5D_FFTW_THREADS
975 FFTW(cleanup_threads
)();
982 /*Is this better than direct access of plan? enough data?
983 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
984 void fft5d_local_size(fft5d_plan plan
,int* N1
,int* M0
,int* K0
,int* K1
,int** coor
) {
994 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
995 of processor dimensions*/
996 fft5d_plan
fft5d_plan_3d_cart(int NG
, int MG
, int KG
, MPI_Comm comm
, int P0
, int flags
, t_complex
** rlin
, t_complex
** rlout
) {
997 MPI_Comm cart
[2]={0};
1004 int rdim1
[] = {0,1}, rdim2
[] = {1,0};
1006 MPI_Comm_size(comm
,&size
);
1007 MPI_Comm_rank(comm
,&prank
);
1009 if (P0
==0) P0
= lfactor(size
);
1011 if (prank
==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size
,P0
);
1015 P
[0] = P0
; P
[1]=size
/P0
; /*number of processors in the two dimensions*/
1017 /*Difference between x-y-z regarding 2d decomposition is whether they are
1018 distributed along axis 1, 2 or both*/
1020 MPI_Cart_create(comm
,2,P
,wrap
,1,&gcart
); /*parameter 4: value 1: reorder*/
1021 MPI_Cart_get(gcart
,2,P
,wrap
,coor
);
1022 MPI_Cart_sub(gcart
, rdim1
, &cart
[0]);
1023 MPI_Cart_sub(gcart
, rdim2
, &cart
[1]);
1025 return fft5d_plan_3d(NG
, MG
, KG
, cart
, flags
, rlin
, rlout
);
1030 /*prints in original coordinate system of data (as the input to FFT)*/
1031 void fft5d_compare_data(const t_complex
* lin
, const t_complex
* in
, fft5d_plan plan
, int bothLocal
, int normalize
) {
1032 int xs
[3],xl
[3],xc
[3],NG
[3];
1034 int *coor
= plan
->coor
;
1035 int ll
=2; /*compare ll values per element (has to be 2 for complex)*/
1036 if (plan
->flags
&FFT5D_REALCOMPLEX
&& plan
->flags
&FFT5D_BACKWARD
)
1041 compute_offsets(plan
,xs
,xl
,xc
,NG
,2);
1042 if (plan
->flags
&FFT5D_DEBUG
) printf("Compare2\n");
1043 for (z
=0;z
<xl
[2];z
++) {
1044 for(y
=0;y
<xl
[1];y
++) {
1045 if (plan
->flags
&FFT5D_DEBUG
) printf("%d %d: ",coor
[0],coor
[1]);
1046 for (x
=0;x
<xl
[0];x
++) {
1047 for (l
=0;l
<ll
;l
++) { /*loop over real/complex parts*/
1049 a
=((real
*)lin
)[(z
*xs
[2]+y
*xs
[1])*2+x
*xs
[0]*ll
+l
];
1050 if (normalize
) a
/=plan
->rC
[0]*plan
->rC
[1]*plan
->rC
[2];
1052 b
=((real
*)in
)[((z
+xc
[2])*NG
[0]*NG
[1]+(y
+xc
[1])*NG
[0])*2+(x
+xc
[0])*ll
+l
];
1054 b
=((real
*)in
)[(z
*xs
[2]+y
*xs
[1])*2+x
*xs
[0]*ll
+l
];
1055 if (plan
->flags
&FFT5D_DEBUG
) {
1056 printf("%f %f, ",a
,b
);
1058 if (fabs(a
-b
)>2*NG
[0]*NG
[1]*NG
[2]*GMX_REAL_EPS
) {
1059 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor
[0],coor
[1],x
,y
,z
,a
,b
);
1061 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1064 if (plan
->flags
&FFT5D_DEBUG
) printf(",");
1066 if (plan
->flags
&FFT5D_DEBUG
) printf("\n");