1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2 * $Id: gmx_matrix.c,v 1.4 2008/12/02 18:27:57 spoel Exp $
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2008, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * Groningen Machine for Chemical Simulation
46 #define GMX_PARALLEL_ENV_INITIALIZED 1
49 #define GMX_PARALLEL_ENV_INITIALIZED gmx_parallel_env_initialized()
61 /* requires fftw compiled with openmp */
62 #define FFT5D_FFTW_THREADS
70 #ifndef __FLT_EPSILON__
71 #define __FLT_EPSILON__ FLT_EPSILON
72 #define __DBL_EPSILON__ DBL_EPSILON
79 #include "gmx_fatal.h"
84 /* none of the fftw3 calls, except execute(), are thread-safe, so
85 we need to serialize them with this mutex. */
86 static tMPI_Thread_mutex_t big_fftw_mutex
=TMPI_THREAD_MUTEX_INITIALIZER
;
88 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex);
89 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex);
90 #else /* GMX_THREADS */
93 #endif /* GMX_THREADS */
94 #endif /* GMX_FFT_FFTW3 */
96 static double fft5d_fmax(double a
, double b
){
100 /* largest factor smaller than sqrt */
101 static int lfactor(int z
) {
104 if (z
%i
==0) return i
;
108 static int l2factor(int z
) {
112 if (z
%i
==0) return i
;
115 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
116 static int lpfactor(int z
) {
119 return fft5d_fmax(lpfactor(f
),lpfactor(z
/f
));
123 #ifdef HAVE_GETTIMEOFDAY
124 #include <sys/time.h>
128 return tv
.tv_sec
+tv
.tv_usec
*1e-6;
137 static int vmax(int* a
, int s
) {
141 if (a
[i
]>max
) max
=a
[i
];
147 copied here from fftgrid, because:
148 1. function there not publically available
149 2. not sure whether we keep fftgrid
150 3. less dependencies for fft5d
152 Only used for non-fftw case
155 gmx_calloc_aligned(size_t size
)
159 /*We initialize by zero for Valgrind
160 For non-divisible case we communicate more than the data.
161 If we don't initialize the data we communicate uninitialized data*/
162 p0
= calloc(size
+32,1);
166 gmx_fatal(FARGS
,"Failed to allocated %u bytes of aligned memory.",size
+32);
169 p
= (void *) (((size_t) p0
+ 32) & (~((size_t) 31)));
171 /* Yeah, yeah, we cannot free this pointer, but who cares... */
176 /* NxMxK the size of the data
177 * comm communicator to use for fft5d
178 * P0 number of processor in 1st axes (can be null for automatic)
179 * lin is allocated by fft5d because size of array is only known after planning phase */
180 fft5d_plan
fft5d_plan_3d(int NG
, int MG
, int KG
, MPI_Comm comm
[2], int flags
, t_complex
** rlin
, t_complex
** rlout
)
183 int P
[2],bMaster
,prank
[2],i
;
185 int *N0
=0, *N1
=0, *M0
=0, *M1
=0, *K0
=0, *K1
=0, *oN0
=0, *oN1
=0, *oM0
=0, *oM1
=0, *oK0
=0, *oK1
=0;
186 int N
[3],M
[3],K
[3],pN
[3],pM
[3],pK
[3],oM
[3],oK
[3],*iNin
[3]={0},*oNin
[3]={0},*iNout
[3]={0},*oNout
[3]={0};
187 int C
[3],rC
[3],nP
[2];
189 t_complex
*lin
=0,*lout
=0;
193 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
195 if (GMX_PARALLEL_ENV_INITIALIZED
&& comm
[0] != 0)
197 MPI_Comm_size(comm
[0],&P
[0]);
198 MPI_Comm_rank(comm
[0],&prank
[0]);
207 if (GMX_PARALLEL_ENV_INITIALIZED
&& comm
[1] != 0)
209 MPI_Comm_size(comm
[1],&P
[1]);
210 MPI_Comm_rank(comm
[1],&prank
[1]);
219 bMaster
=(prank
[0]==0&&prank
[1]==0);
224 fprintf(debug
,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
225 P
[0],P
[1],prank
[0],prank
[1]);
230 fprintf(debug
,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
231 NG
,MG
,KG
,P
[0],P
[1],(flags
&FFT5D_REALCOMPLEX
)>0,(flags
&FFT5D_BACKWARD
)>0,(flags
&FFT5D_ORDER_YZ
)>0,(flags
&FFT5D_DEBUG
)>0);
232 /* The check below is not correct, one prime factor 11 or 13 is ok.
233 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
234 printf("WARNING: FFT very slow with prime factors larger 7\n");
235 printf("Change FFT size or in case you cannot change it look at\n");
236 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
241 if (NG
==0 || MG
==0 || KG
==0) {
242 if (bMaster
) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
246 rNG
=NG
;rMG
=MG
;rKG
=KG
;
248 if (flags
&FFT5D_REALCOMPLEX
) {
249 if (!(flags
&FFT5D_BACKWARD
)) NG
= NG
/2+1;
251 if (!(flags
&FFT5D_ORDER_YZ
)) MG
=MG
/2+1;
257 /*for transpose we need to know the size for each processor not only our own size*/
259 N0
= (int*)malloc(P
[0]*sizeof(int)); N1
= (int*)malloc(P
[1]*sizeof(int));
260 M0
= (int*)malloc(P
[0]*sizeof(int)); M1
= (int*)malloc(P
[1]*sizeof(int));
261 K0
= (int*)malloc(P
[0]*sizeof(int)); K1
= (int*)malloc(P
[1]*sizeof(int));
262 oN0
= (int*)malloc(P
[0]*sizeof(int));oN1
= (int*)malloc(P
[1]*sizeof(int));
263 oM0
= (int*)malloc(P
[0]*sizeof(int));oM1
= (int*)malloc(P
[1]*sizeof(int));
264 oK0
= (int*)malloc(P
[0]*sizeof(int));oK1
= (int*)malloc(P
[1]*sizeof(int));
270 oN0
[i
]=i
*ceil((double)NG
/P
[0]);
271 oM0
[i
]=i
*ceil((double)MG
/P
[0]);
272 oK0
[i
]=i
*ceil((double)KG
/P
[0]);
282 oN1
[i
]=i
*ceil((double)NG
/P
[1]);
283 oM1
[i
]=i
*ceil((double)MG
/P
[1]);
284 oK1
[i
]=i
*ceil((double)KG
/P
[1]);
291 for (i
=0;i
<P
[0]-1;i
++)
293 N0
[i
]=oN0
[i
+1]-oN0
[i
];
294 M0
[i
]=oM0
[i
+1]-oM0
[i
];
295 K0
[i
]=oK0
[i
+1]-oK0
[i
];
297 N0
[P
[0]-1]=NG
-oN0
[P
[0]-1];
298 M0
[P
[0]-1]=MG
-oM0
[P
[0]-1];
299 K0
[P
[0]-1]=KG
-oK0
[P
[0]-1];
300 for (i
=0;i
<P
[1]-1;i
++)
302 N1
[i
]=oN1
[i
+1]-oN1
[i
];
303 M1
[i
]=oM1
[i
+1]-oM1
[i
];
304 K1
[i
]=oK1
[i
+1]-oK1
[i
];
306 N1
[P
[1]-1]=NG
-oN1
[P
[1]-1];
307 M1
[P
[1]-1]=MG
-oM1
[P
[1]-1];
308 K1
[P
[1]-1]=KG
-oK1
[P
[1]-1];
310 /* for step 1-3 the local N,M,K sizes of the transposed system
311 C: contiguous dimension, and nP: number of processor in subcommunicator
315 pM
[0] = M0
[prank
[0]];
316 oM
[0] = oM0
[prank
[0]];
317 pK
[0] = K1
[prank
[1]];
318 oK
[0] = oK1
[prank
[1]];
321 if (!(flags
&FFT5D_ORDER_YZ
)) {
322 N
[0] = vmax(N1
,P
[1]);
324 K
[0] = vmax(K1
,P
[1]);
325 pN
[0] = N1
[prank
[1]];
331 N
[1] = vmax(K0
,P
[0]);
332 pN
[1] = K0
[prank
[0]];
337 M
[1] = vmax(M0
,P
[0]);
338 pM
[1] = M0
[prank
[0]];
339 oM
[1] = oM0
[prank
[0]];
341 pK
[1] = N1
[prank
[1]];
342 oK
[1] = oN1
[prank
[1]];
348 M
[2] = vmax(K0
,P
[0]);
349 pM
[2] = K0
[prank
[0]];
350 oM
[2] = oK0
[prank
[0]];
351 K
[2] = vmax(N1
,P
[1]);
352 pK
[2] = N1
[prank
[1]];
353 oK
[2] = oN1
[prank
[1]];
354 free(N0
); free(oN0
); /*these are not used for this order*/
355 free(M1
); free(oM1
); /*the rest is freed in destroy*/
357 N
[0] = vmax(N0
,P
[0]);
358 M
[0] = vmax(M0
,P
[0]);
360 pN
[0] = N0
[prank
[0]];
366 N
[1] = vmax(M1
,P
[1]);
367 pN
[1] = M1
[prank
[1]];
373 pM
[1] = N0
[prank
[0]];
374 oM
[1] = oN0
[prank
[0]];
375 K
[1] = vmax(K1
,P
[1]);
376 pK
[1] = K1
[prank
[1]];
377 oK
[1] = oK1
[prank
[1]];
383 M
[2] = vmax(N0
,P
[0]);
384 pM
[2] = N0
[prank
[0]];
385 oM
[2] = oN0
[prank
[0]];
386 K
[2] = vmax(M1
,P
[1]);
387 pK
[2] = M1
[prank
[1]];
388 oK
[2] = oM1
[prank
[1]];
389 free(N1
); free(oN1
); /*these are not used for this order*/
390 free(K0
); free(oK0
); /*the rest is freed in destroy*/
394 Difference between x-y-z regarding 2d decomposition is whether they are
395 distributed along axis 1, 2 or both
398 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
399 lsize
= fft5d_fmax(N
[0]*M
[0]*K
[0]*nP
[0],fft5d_fmax(N
[1]*M
[1]*K
[1]*nP
[1],C
[2]*M
[2]*K
[2]));
400 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
401 if (!(flags
&FFT5D_NOMALLOC
)) {
402 lin
= (t_complex
*)gmx_calloc_aligned(sizeof(t_complex
) * lsize
);
403 lout
= (t_complex
*)gmx_calloc_aligned(sizeof(t_complex
) * lsize
);
409 plan
= (fft5d_plan
)calloc(1,sizeof(struct fft5d_plan_t
));
413 #ifdef FFT5D_FFTW_THREADS
414 FFTW(init_threads
)();
420 nthreads
= omp_get_num_threads();
423 if (prank
[0] == 0 && prank
[1] == 0)
425 printf("Running fftw on %d threads\n",nthreads
);
427 FFTW(plan_with_nthreads
)(nthreads
);
431 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but insead only 1D plans */
432 if ((!(flags
&FFT5D_INPLACE
)) && (!(P
[0]>1 || P
[1]>1))) { /*don't do 3d plan in parallel or if in_place requested */
433 int fftwflags
=FFTW_DESTROY_INPUT
;
435 int inNG
=NG
,outMG
=MG
,outKG
=KG
;
438 if (!(flags
&FFT5D_NOMEASURE
)) fftwflags
|=FFTW_MEASURE
;
439 if (flags
&FFT5D_REALCOMPLEX
) {
440 if (!(flags
&FFT5D_BACKWARD
)) { /*input pointer is not complex*/
442 } else { /*output pointer is not complex*/
443 if (!(flags
&FFT5D_ORDER_YZ
)) outMG
*=2;
448 if (!(flags
&FFT5D_BACKWARD
)) {
453 dims
[0].is
= inNG
*MG
; /*N M K*/
456 if (!(flags
&FFT5D_ORDER_YZ
)) {
457 dims
[0].os
= MG
; /*M K N*/
461 dims
[0].os
= 1; /*K N M*/
466 if (!(flags
&FFT5D_ORDER_YZ
)) {
475 dims
[0].os
= outMG
*KG
;
487 dims
[0].os
= outKG
*NG
;
492 if ((flags
&FFT5D_REALCOMPLEX
) && !(flags
&FFT5D_BACKWARD
)) {
493 plan
->p3d
= FFTW(plan_guru_dft_r2c
)(/*rank*/ 3, dims
,
494 /*howmany*/ 0, /*howmany_dims*/0 ,
495 (real
*)lin
, (FFTW(complex) *)lout
,
496 /*flags*/ fftwflags
);
497 } else if ((flags
&FFT5D_REALCOMPLEX
) && (flags
&FFT5D_BACKWARD
)) {
498 plan
->p3d
= FFTW(plan_guru_dft_c2r
)(/*rank*/ 3, dims
,
499 /*howmany*/ 0, /*howmany_dims*/0 ,
500 (FFTW(complex) *)lin
, (real
*)lout
,
501 /*flags*/ fftwflags
);
503 plan
->p3d
= FFTW(plan_guru_dft
)(/*rank*/ 3, dims
,
504 /*howmany*/ 0, /*howmany_dims*/0 ,
505 (FFTW(complex) *)lin
, (FFTW(complex) *)lout
,
506 /*sign*/ (flags
&FFT5D_BACKWARD
)?1:-1, /*flags*/ fftwflags
);
510 if (!plan
->p3d
) { /* for decomposition and if 3d plan did not work */
511 #endif /* GMX_FFT_FFTW3 */
515 fprintf(debug
,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
516 s
,rC
[s
],M
[s
],pK
[s
],C
[s
],lsize
);
518 if ((flags
&FFT5D_REALCOMPLEX
) && ((!(flags
&FFT5D_BACKWARD
) && s
==0) || ((flags
&FFT5D_BACKWARD
) && s
==2))) {
519 gmx_fft_init_many_1d_real( &plan
->p1d
[s
], rC
[s
], pM
[s
]*pK
[s
], (flags
&FFT5D_NOMEASURE
)?GMX_FFT_FLAG_CONSERVATIVE
:0 );
521 gmx_fft_init_many_1d ( &plan
->p1d
[s
], C
[s
], pM
[s
]*pK
[s
], (flags
&FFT5D_NOMEASURE
)?GMX_FFT_FLAG_CONSERVATIVE
:0 );
527 if ((flags
&FFT5D_ORDER_YZ
)) { /*plan->cart is in the order of transposes */
528 plan
->cart
[0]=comm
[0]; plan
->cart
[1]=comm
[1];
530 plan
->cart
[1]=comm
[0]; plan
->cart
[0]=comm
[1];
532 #ifdef FFT5D_MPI_TRANSPOSE
535 if ((s
==0 && !(flags
&FFT5D_ORDER_YZ
)) || (s
==1 && (flags
&FFT5D_ORDER_YZ
)))
536 plan
->mpip
[s
] = FFTW(mpi_plan_many_transpose
)(nP
[s
], nP
[s
], N
[s
]*K
[s
]*pM
[s
]*2, 1, 1, (real
*)lin
, (real
*)lout
, plan
->cart
[s
], FFTW_PATIENT
);
538 plan
->mpip
[s
] = FFTW(mpi_plan_many_transpose
)(nP
[s
], nP
[s
], N
[s
]*pK
[s
]*M
[s
]*2, 1, 1, (real
*)lin
, (real
*)lout
, plan
->cart
[s
], FFTW_PATIENT
);
547 plan
->NG
=NG
;plan
->MG
=MG
;plan
->KG
=KG
;
550 plan
->N
[s
]=N
[s
];plan
->M
[s
]=M
[s
];plan
->K
[s
]=K
[s
];plan
->pN
[s
]=pN
[s
];plan
->pM
[s
]=pM
[s
];plan
->pK
[s
]=pK
[s
];
551 plan
->oM
[s
]=oM
[s
];plan
->oK
[s
]=oK
[s
];
552 plan
->C
[s
]=C
[s
];plan
->rC
[s
]=rC
[s
];
553 plan
->iNin
[s
]=iNin
[s
];plan
->oNin
[s
]=oNin
[s
];plan
->iNout
[s
]=iNout
[s
];plan
->oNout
[s
]=oNout
[s
];
556 plan
->P
[s
]=nP
[s
];plan
->coor
[s
]=prank
[s
];
559 /* plan->fftorder=fftorder;
560 plan->direction=direction;
561 plan->realcomplex=realcomplex;
581 /*here x,y,z and N,M,K is in rotated coordinate system!!
582 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
583 maxN,maxM,maxK is max size of local data
584 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
585 NG, MG, KG is size of global data*/
586 static void splitaxes(t_complex
* lout
,const t_complex
* lin
,
587 int maxN
,int maxM
,int maxK
, int pN
, int pM
, int pK
,
588 int P
,int NG
,int *N
, int* oN
) {
590 int in_i
,out_i
,in_z
,out_z
,in_y
,out_y
;
592 for (i
=0;i
<P
;i
++) { /*index cube along long axis*/
593 in_i
= i
*maxN
*maxM
*maxK
;
596 #pragma omp parallel for private(in_z,out_z,y,in_y,out_y,x)
598 for (z
=0;z
<pK
;z
++) { /*3. z l*/
599 in_z
= in_i
+ z
*maxN
*maxM
;
600 out_z
= out_i
+ z
*NG
*pM
;
601 for (y
=0;y
<pM
;y
++) { /*2. y k*/
602 in_y
= in_z
+ y
*maxN
;
603 out_y
= out_z
+ y
*NG
;
604 for (x
=0;x
<N
[i
];x
++) { /*1. x j*/
605 lout
[in_y
+x
] = lin
[out_y
+x
];
606 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
607 /*before split data contiguos - thus if different processor get different amount oN is different*/
614 /*make axis contiguous again (after AllToAll) and also do local transpose*/
615 /*transpose mayor and major dimension
617 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
618 N,M,K local dimensions
620 static void joinAxesTrans13(t_complex
* lin
,const t_complex
* lout
,
621 int maxN
,int maxM
,int maxK
,int pN
, int pM
, int pK
,
622 int P
,int KG
, int* K
, int* oK
)
625 int in_i
,out_i
,in_x
,out_x
,in_z
,out_z
;
627 for (i
=0;i
<P
;i
++) { /*index cube along long axis*/
629 out_i
= i
*maxM
*maxN
*maxK
;
631 #pragma omp parallel for private(in_x,out_x,z,in_z,out_z,y)
633 for (x
=0;x
<pN
;x
++) { /*1.j*/
634 in_x
= in_i
+ x
*KG
*pM
;
636 for (z
=0;z
<K
[i
];z
++) { /*3.l*/
638 out_z
= out_x
+ z
*maxM
*maxN
;
639 for (y
=0;y
<pM
;y
++) { /*2.k*/
640 lin
[in_z
+y
*KG
] = lout
[out_z
+y
*maxN
];
647 /*make axis contiguous again (after AllToAll) and also do local transpose
648 tranpose mayor and middle dimension
650 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
653 static void joinAxesTrans12(t_complex
* lin
,const t_complex
* lout
,int maxN
,int maxM
,int maxK
,int pN
, int pM
, int pK
,
654 int P
,int MG
, int* M
, int* oM
) {
656 int in_i
,out_i
,in_z
,out_z
,in_x
,out_x
;
658 for (i
=0;i
<P
;i
++) { /*index cube along long axis*/
660 out_i
= i
*maxM
*maxN
*maxK
;
662 #pragma omp parallel for private(in_z,out_z,in_x,out_x,x,y)
665 in_z
= in_i
+ z
*MG
*pN
;
666 out_z
= out_i
+ z
*maxM
*maxN
;
670 for (y
=0;y
<M
[i
];y
++) {
671 lin
[in_x
+y
] = lout
[out_x
+y
*maxN
];
679 static void rotate(int x
[]) {
689 /*compute the offset to compare or print transposed local data in original input coordinates
690 xs matrix dimension size, xl dimension length, xc decomposition offset
691 s: step in computation = number of transposes*/
692 static void compute_offsets(fft5d_plan plan
, int xs
[], int xl
[], int xc
[], int NG
[], int s
) {
693 /* int direction = plan->direction;
694 int fftorder = plan->fftorder;*/
698 int *pM
=plan
->pM
, *pK
=plan
->pK
, *oM
=plan
->oM
, *oK
=plan
->oK
,
699 *C
=plan
->C
, *rC
=plan
->rC
;
701 NG
[0]=plan
->NG
;NG
[1]=plan
->MG
;NG
[2]=plan
->KG
;
703 if (!(plan
->flags
&FFT5D_ORDER_YZ
)) {
705 case 0: o
=XYZ
; break;
706 case 1: o
=ZYX
; break;
707 case 2: o
=YZX
; break;
712 case 0: o
=XYZ
; break;
713 case 1: o
=YXZ
; break;
714 case 2: o
=ZXY
; break;
720 case XYZ
:pos
[0]=1;pos
[1]=2;pos
[2]=3;break;
721 case XZY
:pos
[0]=1;pos
[1]=3;pos
[2]=2;break;
722 case YXZ
:pos
[0]=2;pos
[1]=1;pos
[2]=3;break;
723 case YZX
:pos
[0]=3;pos
[1]=1;pos
[2]=2;break;
724 case ZXY
:pos
[0]=2;pos
[1]=3;pos
[2]=1;break;
725 case ZYX
:pos
[0]=3;pos
[1]=2;pos
[2]=1;break;
727 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
729 /*xs, xl give dimension size and data length in local transposed coordinate system
730 for 0(/1/2): x(/y/z) in original coordinate system*/
733 case 1: xs
[i
]=1; xc
[i
]=0; xl
[i
]=C
[s
];break;
734 case 2: xs
[i
]=C
[s
]; xc
[i
]=oM
[s
]; xl
[i
]=pM
[s
];break;
735 case 3: xs
[i
]=C
[s
]*pM
[s
];xc
[i
]=oK
[s
]; xl
[i
]=pK
[s
];break;
738 /*input order is different for test program to match FFTW order
739 (important for complex to real)*/
740 if (plan
->flags
&FFT5D_BACKWARD
) {
745 if (plan
->flags
&FFT5D_ORDER_YZ
) {
752 if (plan
->flags
&FFT5D_REALCOMPLEX
&& ((!(plan
->flags
&FFT5D_BACKWARD
) && s
==0) || (plan
->flags
&FFT5D_BACKWARD
&& s
==2))) {
757 static void print_localdata(const t_complex
* lin
, const char* txt
, int s
, fft5d_plan plan
) {
759 int *coor
= plan
->coor
;
760 int xs
[3],xl
[3],xc
[3],NG
[3];
761 int ll
=(plan
->flags
&FFT5D_REALCOMPLEX
)?1:2;
762 compute_offsets(plan
,xs
,xl
,xc
,NG
,s
);
763 fprintf(debug
,txt
,coor
[0],coor
[1],s
);
764 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
765 for(z
=0;z
<xl
[2];z
++) {
766 for(y
=0;y
<xl
[1];y
++) {
767 fprintf(debug
,"%d %d: ",coor
[0],coor
[1]);
768 for (x
=0;x
<xl
[0];x
++) {
770 fprintf(debug
,"%f ",((real
*)lin
)[(z
*xs
[2]+y
*xs
[1])*2+(x
*xs
[0])*ll
+l
]);
779 void fft5d_execute(fft5d_plan plan
,fft5d_time times
) {
780 t_complex
*lin
= plan
->lin
;
781 t_complex
*lout
= plan
->lout
;
783 gmx_fft_t
*p1d
=plan
->p1d
;
784 #ifdef FFT5D_MPI_TRANSPOSE
785 FFTW(plan
) *mpip
=plan
->mpip
;
788 MPI_Comm
*cart
=plan
->cart
;
791 double time_fft
=0,time_local
=0,time_mpi
[2]={0},time
=0;
792 int *N
=plan
->N
,*M
=plan
->M
,*K
=plan
->K
,*pN
=plan
->pN
,*pM
=plan
->pM
,*pK
=plan
->pK
,
793 *C
=plan
->C
,*P
=plan
->P
,**iNin
=plan
->iNin
,**oNin
=plan
->oNin
,**iNout
=plan
->iNout
,**oNout
=plan
->oNout
;
801 FFTW(execute
)(plan
->p3d
);
803 times
->fft
+=MPI_Wtime()-time
;
809 if (plan
->flags
&FFT5D_DEBUG
) print_localdata(lin
, "%d %d: copy in lin\n", s
, plan
);
814 if ((plan
->flags
&FFT5D_REALCOMPLEX
) && !(plan
->flags
&FFT5D_BACKWARD
) && s
==0) {
815 gmx_fft_many_1d_real(p1d
[s
],(plan
->flags
&FFT5D_BACKWARD
)?GMX_FFT_COMPLEX_TO_REAL
:GMX_FFT_REAL_TO_COMPLEX
,lin
,lout
);
817 gmx_fft_many_1d( p1d
[s
],(plan
->flags
&FFT5D_BACKWARD
)?GMX_FFT_BACKWARD
:GMX_FFT_FORWARD
, lin
,lout
);
820 time_fft
+=MPI_Wtime()-time
;
822 if (plan
->flags
&FFT5D_DEBUG
) print_localdata(lout
, "%d %d: FFT %d\n", s
, plan
);
825 if (GMX_PARALLEL_ENV_INITIALIZED
&& cart
[s
] !=0 && P
[s
]>1 )
829 /*prepare for AllToAll
830 1. (most outer) axes (x) is split into P[s] parts of size N[s]
832 splitaxes(lin
,lout
,N
[s
],M
[s
],K
[s
], pN
[s
],pM
[s
],pK
[s
],P
[s
],C
[s
],iNout
[s
],oNout
[s
]);
836 time_local
+=MPI_Wtime()-time
;
842 #ifdef FFT5D_MPI_TRANSPOSE
843 FFTW(execute
)(mpip
[s
]);
845 if ((s
==0 && !(plan
->flags
&FFT5D_ORDER_YZ
)) || (s
==1 && (plan
->flags
&FFT5D_ORDER_YZ
)))
846 MPI_Alltoall(lin
,N
[s
]*pM
[s
]*K
[s
]*sizeof(t_complex
)/sizeof(real
),GMX_MPI_REAL
,lout
,N
[s
]*pM
[s
]*K
[s
]*sizeof(t_complex
)/sizeof(real
),GMX_MPI_REAL
,cart
[s
]);
848 MPI_Alltoall(lin
,N
[s
]*M
[s
]*pK
[s
]*sizeof(t_complex
)/sizeof(real
),GMX_MPI_REAL
,lout
,N
[s
]*M
[s
]*pK
[s
]*sizeof(t_complex
)/sizeof(real
),GMX_MPI_REAL
,cart
[s
]);
849 #endif /*FFT5D_MPI_TRANSPOSE*/
851 time_mpi
[s
]=MPI_Wtime()-time
;
858 /*bring back in matrix form
859 thus make new 1. axes contiguos
860 also local transpose 1 and 2/3 */
861 if ((s
==0 && !(plan
->flags
&FFT5D_ORDER_YZ
)) || (s
==1 && (plan
->flags
&FFT5D_ORDER_YZ
)))
862 joinAxesTrans13(lin
,lout
,N
[s
],pM
[s
],K
[s
],pN
[s
],pM
[s
],pK
[s
],P
[s
],C
[s
+1],iNin
[s
+1],oNin
[s
+1]);
864 joinAxesTrans12(lin
,lout
,N
[s
],M
[s
],pK
[s
],pN
[s
],pM
[s
],pK
[s
],P
[s
],C
[s
+1],iNin
[s
+1],oNin
[s
+1]);
866 time_local
+=MPI_Wtime()-time
;
868 if (plan
->flags
&FFT5D_DEBUG
) print_localdata(lin
, "%d %d: tranposed %d\n", s
+1, plan
);
870 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
875 if (plan
->flags
&FFT5D_INPLACE
) lout
=lin
;
876 if ((plan
->flags
&FFT5D_REALCOMPLEX
) && (plan
->flags
&FFT5D_BACKWARD
)) {
877 gmx_fft_many_1d_real(p1d
[s
],(plan
->flags
&FFT5D_BACKWARD
)?GMX_FFT_COMPLEX_TO_REAL
:GMX_FFT_REAL_TO_COMPLEX
,lin
,lout
);
879 gmx_fft_many_1d( p1d
[s
],(plan
->flags
&FFT5D_BACKWARD
)?GMX_FFT_BACKWARD
:GMX_FFT_FORWARD
, lin
,lout
);
883 time_fft
+=MPI_Wtime()-time
;
884 if (plan
->flags
&FFT5D_DEBUG
) print_localdata(lout
, "%d %d: FFT %d\n", s
, plan
);
885 /*if (debug) print_localdata(lout, "%d %d: FFT in y\n", N1, M, K0, YZX, coor);*/
888 times
->fft
+=time_fft
;
889 times
->local
+=time_local
;
890 times
->mpi2
+=time_mpi
[1];
891 times
->mpi1
+=time_mpi
[0];
895 void fft5d_destroy(fft5d_plan plan
) {
898 gmx_many_fft_destroy(plan
->p1d
[s
]);
907 if (plan
->iNout
[s
]) {
908 free(plan
->iNout
[s
]);
911 if (plan
->oNout
[s
]) {
912 free(plan
->oNout
[s
]);
918 #ifdef FFT5D_MPI_TRANSPOS
920 FFTW(destroy_plan
)(plan
->mpip
[s
]);
921 #endif /* FFT5D_MPI_TRANSPOS */
922 #endif /* GMX_FFT_FFTW3 */
924 /*We can't free lin/lout here - is allocated by gmx_calloc_aligned which can't be freed*/
928 #ifdef FFT5D_FFTW_THREADS
929 FFTW(cleanup_threads
)();
936 /*Is this better than direct access of plan? enough data?
937 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
938 void fft5d_local_size(fft5d_plan plan
,int* N1
,int* M0
,int* K0
,int* K1
,int** coor
) {
948 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
949 of processor dimensions*/
950 fft5d_plan
fft5d_plan_3d_cart(int NG
, int MG
, int KG
, MPI_Comm comm
, int P0
, int flags
, t_complex
** rlin
, t_complex
** rlout
) {
951 MPI_Comm cart
[2]={0};
958 int rdim1
[] = {0,1}, rdim2
[] = {1,0};
960 MPI_Comm_size(comm
,&size
);
961 MPI_Comm_rank(comm
,&prank
);
963 if (P0
==0) P0
= lfactor(size
);
965 if (prank
==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size
,P0
);
969 P
[0] = P0
; P
[1]=size
/P0
; /*number of processors in the two dimensions*/
971 /*Difference between x-y-z regarding 2d decomposition is whether they are
972 distributed along axis 1, 2 or both*/
974 MPI_Cart_create(comm
,2,P
,wrap
,1,&gcart
); /*parameter 4: value 1: reorder*/
975 MPI_Cart_get(gcart
,2,P
,wrap
,coor
);
976 MPI_Cart_sub(gcart
, rdim1
, &cart
[0]);
977 MPI_Cart_sub(gcart
, rdim2
, &cart
[1]);
979 return fft5d_plan_3d(NG
, MG
, KG
, cart
, flags
, rlin
, rlout
);
984 /*prints in original coordinate system of data (as the input to FFT)*/
985 void fft5d_compare_data(const t_complex
* lin
, const t_complex
* in
, fft5d_plan plan
, int bothLocal
, int normalize
) {
986 int xs
[3],xl
[3],xc
[3],NG
[3];
988 int *coor
= plan
->coor
;
989 int ll
=2; /*compare ll values per element (has to be 2 for complex)*/
990 if (plan
->flags
&FFT5D_REALCOMPLEX
&& plan
->flags
&FFT5D_BACKWARD
)
995 compute_offsets(plan
,xs
,xl
,xc
,NG
,2);
996 if (plan
->flags
&FFT5D_DEBUG
) printf("Compare2\n");
997 for (z
=0;z
<xl
[2];z
++) {
998 for(y
=0;y
<xl
[1];y
++) {
999 if (plan
->flags
&FFT5D_DEBUG
) printf("%d %d: ",coor
[0],coor
[1]);
1000 for (x
=0;x
<xl
[0];x
++) {
1001 for (l
=0;l
<ll
;l
++) { /*loop over real/complex parts*/
1003 a
=((real
*)lin
)[(z
*xs
[2]+y
*xs
[1])*2+x
*xs
[0]*ll
+l
];
1004 if (normalize
) a
/=plan
->rC
[0]*plan
->rC
[1]*plan
->rC
[2];
1006 b
=((real
*)in
)[((z
+xc
[2])*NG
[0]*NG
[1]+(y
+xc
[1])*NG
[0])*2+(x
+xc
[0])*ll
+l
];
1008 b
=((real
*)in
)[(z
*xs
[2]+y
*xs
[1])*2+x
*xs
[0]*ll
+l
];
1009 if (plan
->flags
&FFT5D_DEBUG
) {
1010 printf("%f %f, ",a
,b
);
1012 if (fabs(a
-b
)>2*NG
[0]*NG
[1]*NG
[2]*GMX_REAL_EPS
) {
1013 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor
[0],coor
[1],x
,y
,z
,a
,b
);
1015 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1018 if (plan
->flags
&FFT5D_DEBUG
) printf(",");
1020 if (plan
->flags
&FFT5D_DEBUG
) printf("\n");