Made sure all source files have some copyright header and include config.h
[gromacs/rigid-bodies.git] / src / mdlib / fft5d.c
blob3248346ab95a855d50ed7d42520aa5eccde56ff9
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2 * $Id: gmx_matrix.c,v 1.4 2008/12/02 18:27:57 spoel Exp $
3 *
4 * This source code is part of
5 *
6 * G R O M A C S
7 *
8 * GROningen MAchine for Chemical Simulations
9 *
10 * VERSION 4.5
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2008, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
33 * And Hey:
34 * Groningen Machine for Chemical Simulation
36 #ifdef HAVE_CONFIG_H
37 #include <config.h>
38 #endif
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
45 #ifdef NOGMX
46 #define GMX_PARALLEL_ENV_INITIALIZED 1
47 #else
48 #include "main.h"
49 #define GMX_PARALLEL_ENV_INITIALIZED gmx_parallel_env_initialized()
50 #endif
52 #ifdef GMX_LIB_MPI
53 #include <mpi.h>
54 #endif
55 #ifdef GMX_THREADS
56 #include "tmpi.h"
57 #endif
59 #ifdef FFT5D_THREADS
60 #include <omp.h>
61 /* requires fftw compiled with openmp */
62 #define FFT5D_FFTW_THREADS
63 #endif
65 #include "fft5d.h"
66 #include <float.h>
67 #include <math.h>
68 #include <assert.h>
70 #ifndef __FLT_EPSILON__
71 #define __FLT_EPSILON__ FLT_EPSILON
72 #define __DBL_EPSILON__ DBL_EPSILON
73 #endif
75 #ifdef NOGMX
76 FILE* debug=0;
77 #endif
79 #include "gmx_fatal.h"
82 #ifdef GMX_FFT_FFTW3
83 #ifdef GMX_THREADS
84 /* none of the fftw3 calls, except execute(), are thread-safe, so
85 we need to serialize them with this mutex. */
86 static tMPI_Thread_mutex_t big_fftw_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
88 #define FFTW_LOCK tMPI_Thread_mutex_lock(&big_fftw_mutex);
89 #define FFTW_UNLOCK tMPI_Thread_mutex_unlock(&big_fftw_mutex);
90 #else /* GMX_THREADS */
91 #define FFTW_LOCK
92 #define FFTW_UNLOCK
93 #endif /* GMX_THREADS */
94 #endif /* GMX_FFT_FFTW3 */
96 static double fft5d_fmax(double a, double b){
97 return (a>b)?a:b;
100 /* largest factor smaller than sqrt */
101 static int lfactor(int z) {
102 int i;
103 for (i=sqrt(z);;i--)
104 if (z%i==0) return i;
107 /* largest factor */
108 static int l2factor(int z) {
109 int i;
110 if (z==1) return 1;
111 for (i=z/2;;i--)
112 if (z%i==0) return i;
115 /* largest prime factor: WARNING: slow recursion, only use for small numbers */
116 static int lpfactor(int z) {
117 int f = l2factor(z);
118 if (f==1) return z;
119 return fft5d_fmax(lpfactor(f),lpfactor(z/f));
122 #ifndef GMX_MPI
123 #ifdef HAVE_GETTIMEOFDAY
124 #include <sys/time.h>
125 double MPI_Wtime() {
126 struct timeval tv;
127 gettimeofday(&tv,0);
128 return tv.tv_sec+tv.tv_usec*1e-6;
130 #else
131 double MPI_Wtime() {
132 return 0.0;
134 #endif
135 #endif
137 static int vmax(int* a, int s) {
138 int i,max=0;
139 for (i=0;i<s;i++)
141 if (a[i]>max) max=a[i];
143 return max;
147 copied here from fftgrid, because:
148 1. function there not publically available
149 2. not sure whether we keep fftgrid
150 3. less dependencies for fft5d
152 Only used for non-fftw case
154 static void *
155 gmx_calloc_aligned(size_t size)
157 void *p0,*p;
159 /*We initialize by zero for Valgrind
160 For non-divisible case we communicate more than the data.
161 If we don't initialize the data we communicate uninitialized data*/
162 p0 = calloc(size+32,1);
164 if(p0 == NULL)
166 gmx_fatal(FARGS,"Failed to allocated %u bytes of aligned memory.",size+32);
169 p = (void *) (((size_t) p0 + 32) & (~((size_t) 31)));
171 /* Yeah, yeah, we cannot free this pointer, but who cares... */
172 return p;
176 /* NxMxK the size of the data
177 * comm communicator to use for fft5d
178 * P0 number of processor in 1st axes (can be null for automatic)
179 * lin is allocated by fft5d because size of array is only known after planning phase */
180 fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout)
183 int P[2],bMaster,prank[2],i;
184 int rNG,rMG,rKG;
185 int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
186 int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
187 int C[3],rC[3],nP[2];
188 int lsize;
189 t_complex *lin=0,*lout=0;
190 fft5d_plan plan;
191 int s;
193 /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
194 #ifdef GMX_MPI
195 if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != 0)
197 MPI_Comm_size(comm[0],&P[0]);
198 MPI_Comm_rank(comm[0],&prank[0]);
200 else
201 #endif
203 P[0] = 1;
204 prank[0] = 0;
206 #ifdef GMX_MPI
207 if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != 0)
209 MPI_Comm_size(comm[1],&P[1]);
210 MPI_Comm_rank(comm[1],&prank[1]);
212 else
213 #endif
215 P[1] = 1;
216 prank[1] = 0;
219 bMaster=(prank[0]==0&&prank[1]==0);
222 if (debug)
224 fprintf(debug,"FFT5D: Using %dx%d processor grid, rank %d,%d\n",
225 P[0],P[1],prank[0],prank[1]);
228 if (bMaster) {
229 if (debug)
230 fprintf(debug,"FFT5D: N: %d, M: %d, K: %d, P: %dx%d, real2complex: %d, backward: %d, order yz: %d, debug %d\n",
231 NG,MG,KG,P[0],P[1],(flags&FFT5D_REALCOMPLEX)>0,(flags&FFT5D_BACKWARD)>0,(flags&FFT5D_ORDER_YZ)>0,(flags&FFT5D_DEBUG)>0);
232 /* The check below is not correct, one prime factor 11 or 13 is ok.
233 if (fft5d_fmax(fft5d_fmax(lpfactor(NG),lpfactor(MG)),lpfactor(KG))>7) {
234 printf("WARNING: FFT very slow with prime factors larger 7\n");
235 printf("Change FFT size or in case you cannot change it look at\n");
236 printf("http://www.fftw.org/fftw3_doc/Generating-your-own-code.html\n");
241 if (NG==0 || MG==0 || KG==0) {
242 if (bMaster) printf("FFT5D: FATAL: Datasize cannot be zero in any dimension\n");
243 return 0;
246 rNG=NG;rMG=MG;rKG=KG;
248 if (flags&FFT5D_REALCOMPLEX) {
249 if (!(flags&FFT5D_BACKWARD)) NG = NG/2+1;
250 else {
251 if (!(flags&FFT5D_ORDER_YZ)) MG=MG/2+1;
252 else KG=KG/2+1;
257 /*for transpose we need to know the size for each processor not only our own size*/
259 N0 = (int*)malloc(P[0]*sizeof(int)); N1 = (int*)malloc(P[1]*sizeof(int));
260 M0 = (int*)malloc(P[0]*sizeof(int)); M1 = (int*)malloc(P[1]*sizeof(int));
261 K0 = (int*)malloc(P[0]*sizeof(int)); K1 = (int*)malloc(P[1]*sizeof(int));
262 oN0 = (int*)malloc(P[0]*sizeof(int));oN1 = (int*)malloc(P[1]*sizeof(int));
263 oM0 = (int*)malloc(P[0]*sizeof(int));oM1 = (int*)malloc(P[1]*sizeof(int));
264 oK0 = (int*)malloc(P[0]*sizeof(int));oK1 = (int*)malloc(P[1]*sizeof(int));
266 for (i=0;i<P[0];i++)
268 #define EVENDIST
269 #ifndef EVENDIST
270 oN0[i]=i*ceil((double)NG/P[0]);
271 oM0[i]=i*ceil((double)MG/P[0]);
272 oK0[i]=i*ceil((double)KG/P[0]);
273 #else
274 oN0[i]=(NG*i)/P[0];
275 oM0[i]=(MG*i)/P[0];
276 oK0[i]=(KG*i)/P[0];
277 #endif
279 for (i=0;i<P[1];i++)
281 #ifndef EVENDIST
282 oN1[i]=i*ceil((double)NG/P[1]);
283 oM1[i]=i*ceil((double)MG/P[1]);
284 oK1[i]=i*ceil((double)KG/P[1]);
285 #else
286 oN1[i]=(NG*i)/P[1];
287 oM1[i]=(MG*i)/P[1];
288 oK1[i]=(KG*i)/P[1];
289 #endif
291 for (i=0;i<P[0]-1;i++)
293 N0[i]=oN0[i+1]-oN0[i];
294 M0[i]=oM0[i+1]-oM0[i];
295 K0[i]=oK0[i+1]-oK0[i];
297 N0[P[0]-1]=NG-oN0[P[0]-1];
298 M0[P[0]-1]=MG-oM0[P[0]-1];
299 K0[P[0]-1]=KG-oK0[P[0]-1];
300 for (i=0;i<P[1]-1;i++)
302 N1[i]=oN1[i+1]-oN1[i];
303 M1[i]=oM1[i+1]-oM1[i];
304 K1[i]=oK1[i+1]-oK1[i];
306 N1[P[1]-1]=NG-oN1[P[1]-1];
307 M1[P[1]-1]=MG-oM1[P[1]-1];
308 K1[P[1]-1]=KG-oK1[P[1]-1];
310 /* for step 1-3 the local N,M,K sizes of the transposed system
311 C: contiguous dimension, and nP: number of processor in subcommunicator
312 for that step */
315 pM[0] = M0[prank[0]];
316 oM[0] = oM0[prank[0]];
317 pK[0] = K1[prank[1]];
318 oK[0] = oK1[prank[1]];
319 C[0] = NG;
320 rC[0] = rNG;
321 if (!(flags&FFT5D_ORDER_YZ)) {
322 N[0] = vmax(N1,P[1]);
323 M[0] = M0[prank[0]];
324 K[0] = vmax(K1,P[1]);
325 pN[0] = N1[prank[1]];
326 iNout[0] = N1;
327 oNout[0] = oN1;
328 nP[0] = P[1];
329 C[1] = KG;
330 rC[1] =rKG;
331 N[1] = vmax(K0,P[0]);
332 pN[1] = K0[prank[0]];
333 iNin[1] = K1;
334 oNin[1] = oK1;
335 iNout[1] = K0;
336 oNout[1] = oK0;
337 M[1] = vmax(M0,P[0]);
338 pM[1] = M0[prank[0]];
339 oM[1] = oM0[prank[0]];
340 K[1] = N1[prank[1]];
341 pK[1] = N1[prank[1]];
342 oK[1] = oN1[prank[1]];
343 nP[1] = P[0];
344 C[2] = MG;
345 rC[2] = rMG;
346 iNin[2] = M0;
347 oNin[2] = oM0;
348 M[2] = vmax(K0,P[0]);
349 pM[2] = K0[prank[0]];
350 oM[2] = oK0[prank[0]];
351 K[2] = vmax(N1,P[1]);
352 pK[2] = N1[prank[1]];
353 oK[2] = oN1[prank[1]];
354 free(N0); free(oN0); /*these are not used for this order*/
355 free(M1); free(oM1); /*the rest is freed in destroy*/
356 } else {
357 N[0] = vmax(N0,P[0]);
358 M[0] = vmax(M0,P[0]);
359 K[0] = K1[prank[1]];
360 pN[0] = N0[prank[0]];
361 iNout[0] = N0;
362 oNout[0] = oN0;
363 nP[0] = P[0];
364 C[1] = MG;
365 rC[1] =rMG;
366 N[1] = vmax(M1,P[1]);
367 pN[1] = M1[prank[1]];
368 iNin[1] = M0;
369 oNin[1] = oM0;
370 iNout[1] = M1;
371 oNout[1] = oM1;
372 M[1] = N0[prank[0]];
373 pM[1] = N0[prank[0]];
374 oM[1] = oN0[prank[0]];
375 K[1] = vmax(K1,P[1]);
376 pK[1] = K1[prank[1]];
377 oK[1] = oK1[prank[1]];
378 nP[1] = P[1];
379 C[2] = KG;
380 rC[2] = rKG;
381 iNin[2] = K1;
382 oNin[2] = oK1;
383 M[2] = vmax(N0,P[0]);
384 pM[2] = N0[prank[0]];
385 oM[2] = oN0[prank[0]];
386 K[2] = vmax(M1,P[1]);
387 pK[2] = M1[prank[1]];
388 oK[2] = oM1[prank[1]];
389 free(N1); free(oN1); /*these are not used for this order*/
390 free(K0); free(oK0); /*the rest is freed in destroy*/
394 Difference between x-y-z regarding 2d decomposition is whether they are
395 distributed along axis 1, 2 or both
398 /* int lsize = fmax(N[0]*M[0]*K[0]*nP[0],N[1]*M[1]*K[1]*nP[1]); */
399 lsize = fft5d_fmax(N[0]*M[0]*K[0]*nP[0],fft5d_fmax(N[1]*M[1]*K[1]*nP[1],C[2]*M[2]*K[2]));
400 /* int lsize = fmax(C[0]*M[0]*K[0],fmax(C[1]*M[1]*K[1],C[2]*M[2]*K[2])); */
401 if (!(flags&FFT5D_NOMALLOC)) {
402 lin = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
403 lout = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
404 } else {
405 lin = *rlin;
406 lout = *rlout;
409 plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
412 #ifdef FFT5D_THREADS
413 #ifdef FFT5D_FFTW_THREADS
414 FFTW(init_threads)();
415 int nthreads;
416 #pragma omp parallel
418 #pragma omp master
420 nthreads = omp_get_num_threads();
423 if (prank[0] == 0 && prank[1] == 0)
425 printf("Running fftw on %d threads\n",nthreads);
427 FFTW(plan_with_nthreads)(nthreads);
428 #endif
429 #endif
431 #ifdef GMX_FFT_FFTW3 /*if not FFTW - then we don't do a 3d plan but insead only 1D plans */
432 if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1))) { /*don't do 3d plan in parallel or if in_place requested */
433 int fftwflags=FFTW_DESTROY_INPUT;
434 fftw_iodim dims[3];
435 int inNG=NG,outMG=MG,outKG=KG;
437 FFTW_LOCK;
438 if (!(flags&FFT5D_NOMEASURE)) fftwflags|=FFTW_MEASURE;
439 if (flags&FFT5D_REALCOMPLEX) {
440 if (!(flags&FFT5D_BACKWARD)) { /*input pointer is not complex*/
441 inNG*=2;
442 } else { /*output pointer is not complex*/
443 if (!(flags&FFT5D_ORDER_YZ)) outMG*=2;
444 else outKG*=2;
448 if (!(flags&FFT5D_BACKWARD)) {
449 dims[0].n = KG;
450 dims[1].n = MG;
451 dims[2].n = rNG;
453 dims[0].is = inNG*MG; /*N M K*/
454 dims[1].is = inNG;
455 dims[2].is = 1;
456 if (!(flags&FFT5D_ORDER_YZ)) {
457 dims[0].os = MG; /*M K N*/
458 dims[1].os = 1;
459 dims[2].os = MG*KG;
460 } else {
461 dims[0].os = 1; /*K N M*/
462 dims[1].os = KG*NG;
463 dims[2].os = KG;
465 } else {
466 if (!(flags&FFT5D_ORDER_YZ)) {
467 dims[0].n = NG;
468 dims[1].n = KG;
469 dims[2].n = rMG;
471 dims[0].is = 1;
472 dims[1].is = NG*MG;
473 dims[2].is = NG;
475 dims[0].os = outMG*KG;
476 dims[1].os = outMG;
477 dims[2].os = 1;
478 } else {
479 dims[0].n = MG;
480 dims[1].n = NG;
481 dims[2].n = rKG;
483 dims[0].is = NG;
484 dims[1].is = 1;
485 dims[2].is = NG*MG;
487 dims[0].os = outKG*NG;
488 dims[1].os = outKG;
489 dims[2].os = 1;
492 if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
493 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
494 /*howmany*/ 0, /*howmany_dims*/0 ,
495 (real*)lin, (FFTW(complex) *)lout,
496 /*flags*/ fftwflags);
497 } else if ((flags&FFT5D_REALCOMPLEX) && (flags&FFT5D_BACKWARD)) {
498 plan->p3d = FFTW(plan_guru_dft_c2r)(/*rank*/ 3, dims,
499 /*howmany*/ 0, /*howmany_dims*/0 ,
500 (FFTW(complex) *)lin, (real*)lout,
501 /*flags*/ fftwflags);
502 } else {
503 plan->p3d = FFTW(plan_guru_dft)(/*rank*/ 3, dims,
504 /*howmany*/ 0, /*howmany_dims*/0 ,
505 (FFTW(complex) *)lin, (FFTW(complex) *)lout,
506 /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
508 FFTW_UNLOCK;
510 if (!plan->p3d) { /* for decomposition and if 3d plan did not work */
511 #endif /* GMX_FFT_FFTW3 */
512 for (s=0;s<3;s++) {
513 if (debug)
515 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
516 s,rC[s],M[s],pK[s],C[s],lsize);
518 if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
519 gmx_fft_init_many_1d_real( &plan->p1d[s], rC[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
520 } else {
521 gmx_fft_init_many_1d ( &plan->p1d[s], C[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
524 #ifdef GMX_FFT_FFTW3
526 #endif
527 if ((flags&FFT5D_ORDER_YZ)) { /*plan->cart is in the order of transposes */
528 plan->cart[0]=comm[0]; plan->cart[1]=comm[1];
529 } else {
530 plan->cart[1]=comm[0]; plan->cart[0]=comm[1];
532 #ifdef FFT5D_MPI_TRANSPOSE
533 FFTW_LOCK
534 for (s=0;s<2;s++) {
535 if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ)))
536 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
537 else
538 plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
540 FFTW_UNLOCK
541 #endif
544 plan->lin=lin;
545 plan->lout=lout;
547 plan->NG=NG;plan->MG=MG;plan->KG=KG;
549 for (s=0;s<3;s++) {
550 plan->N[s]=N[s];plan->M[s]=M[s];plan->K[s]=K[s];plan->pN[s]=pN[s];plan->pM[s]=pM[s];plan->pK[s]=pK[s];
551 plan->oM[s]=oM[s];plan->oK[s]=oK[s];
552 plan->C[s]=C[s];plan->rC[s]=rC[s];
553 plan->iNin[s]=iNin[s];plan->oNin[s]=oNin[s];plan->iNout[s]=iNout[s];plan->oNout[s]=oNout[s];
555 for (s=0;s<2;s++) {
556 plan->P[s]=nP[s];plan->coor[s]=prank[s];
559 /* plan->fftorder=fftorder;
560 plan->direction=direction;
561 plan->realcomplex=realcomplex;
563 plan->flags=flags;
564 *rlin=lin;
565 *rlout=lout;
566 return plan;
570 enum order {
571 XYZ,
572 XZY,
573 YXZ,
574 YZX,
575 ZXY,
581 /*here x,y,z and N,M,K is in rotated coordinate system!!
582 x (and N) is mayor (consecutive) dimension, y (M) middle and z (K) major
583 maxN,maxM,maxK is max size of local data
584 pN, pM, pK is local size specific to current processor (only different to max if not divisible)
585 NG, MG, KG is size of global data*/
586 static void splitaxes(t_complex* lout,const t_complex* lin,
587 int maxN,int maxM,int maxK, int pN, int pM, int pK,
588 int P,int NG,int *N, int* oN) {
589 int x,y,z,i;
590 int in_i,out_i,in_z,out_z,in_y,out_y;
592 for (i=0;i<P;i++) { /*index cube along long axis*/
593 in_i = i*maxN*maxM*maxK;
594 out_i = oN[i];
595 #ifdef FFT5D_THREADS
596 #pragma omp parallel for private(in_z,out_z,y,in_y,out_y,x)
597 #endif
598 for (z=0;z<pK;z++) { /*3. z l*/
599 in_z = in_i + z*maxN*maxM;
600 out_z = out_i + z*NG*pM;
601 for (y=0;y<pM;y++) { /*2. y k*/
602 in_y = in_z + y*maxN;
603 out_y = out_z + y*NG;
604 for (x=0;x<N[i];x++) { /*1. x j*/
605 lout[in_y+x] = lin[out_y+x];
606 /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
607 /*before split data contiguos - thus if different processor get different amount oN is different*/
614 /*make axis contiguous again (after AllToAll) and also do local transpose*/
615 /*transpose mayor and major dimension
616 variables see above
617 the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
618 N,M,K local dimensions
619 KG global size*/
620 static void joinAxesTrans13(t_complex* lin,const t_complex* lout,
621 int maxN,int maxM,int maxK,int pN, int pM, int pK,
622 int P,int KG, int* K, int* oK)
624 int i,x,y,z;
625 int in_i,out_i,in_x,out_x,in_z,out_z;
627 for (i=0;i<P;i++) { /*index cube along long axis*/
628 in_i = oK[i];
629 out_i = i*maxM*maxN*maxK;
630 #ifdef FFT5D_THREADS
631 #pragma omp parallel for private(in_x,out_x,z,in_z,out_z,y)
632 #endif
633 for (x=0;x<pN;x++) { /*1.j*/
634 in_x = in_i + x*KG*pM;
635 out_x = out_i + x;
636 for (z=0;z<K[i];z++) { /*3.l*/
637 in_z = in_x + z;
638 out_z = out_x + z*maxM*maxN;
639 for (y=0;y<pM;y++) { /*2.k*/
640 lin[in_z+y*KG] = lout[out_z+y*maxN];
647 /*make axis contiguous again (after AllToAll) and also do local transpose
648 tranpose mayor and middle dimension
649 variables see above
650 the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
651 N,M,K local size
652 MG, global size*/
653 static void joinAxesTrans12(t_complex* lin,const t_complex* lout,int maxN,int maxM,int maxK,int pN, int pM, int pK,
654 int P,int MG, int* M, int* oM) {
655 int i,z,y,x;
656 int in_i,out_i,in_z,out_z,in_x,out_x;
658 for (i=0;i<P;i++) { /*index cube along long axis*/
659 in_i = oM[i];
660 out_i = i*maxM*maxN*maxK;
661 #ifdef FFT5D_THREADS
662 #pragma omp parallel for private(in_z,out_z,in_x,out_x,x,y)
663 #endif
664 for (z=0;z<pK;z++) {
665 in_z = in_i + z*MG*pN;
666 out_z = out_i + z*maxM*maxN;
667 for (x=0;x<pN;x++) {
668 in_x = in_z + x*MG;
669 out_x = out_z + x;
670 for (y=0;y<M[i];y++) {
671 lin[in_x+y] = lout[out_x+y*maxN];
679 static void rotate(int x[]) {
680 int t=x[0];
681 /* x[0]=x[2];
682 x[2]=x[1];
683 x[1]=t;*/
684 x[0]=x[1];
685 x[1]=x[2];
686 x[2]=t;
689 /*compute the offset to compare or print transposed local data in original input coordinates
690 xs matrix dimension size, xl dimension length, xc decomposition offset
691 s: step in computation = number of transposes*/
692 static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int NG[], int s) {
693 /* int direction = plan->direction;
694 int fftorder = plan->fftorder;*/
696 int o;
697 int pos[3],i;
698 int *pM=plan->pM, *pK=plan->pK, *oM=plan->oM, *oK=plan->oK,
699 *C=plan->C, *rC=plan->rC;
701 NG[0]=plan->NG;NG[1]=plan->MG;NG[2]=plan->KG;
703 if (!(plan->flags&FFT5D_ORDER_YZ)) {
704 switch (s) {
705 case 0: o=XYZ; break;
706 case 1: o=ZYX; break;
707 case 2: o=YZX; break;
708 default: assert(0);
710 } else {
711 switch (s) {
712 case 0: o=XYZ; break;
713 case 1: o=YXZ; break;
714 case 2: o=ZXY; break;
715 default: assert(0);
719 switch (o) {
720 case XYZ:pos[0]=1;pos[1]=2;pos[2]=3;break;
721 case XZY:pos[0]=1;pos[1]=3;pos[2]=2;break;
722 case YXZ:pos[0]=2;pos[1]=1;pos[2]=3;break;
723 case YZX:pos[0]=3;pos[1]=1;pos[2]=2;break;
724 case ZXY:pos[0]=2;pos[1]=3;pos[2]=1;break;
725 case ZYX:pos[0]=3;pos[1]=2;pos[2]=1;break;
727 /*if (debug) printf("pos: %d %d %d\n",pos[0],pos[1],pos[2]);*/
729 /*xs, xl give dimension size and data length in local transposed coordinate system
730 for 0(/1/2): x(/y/z) in original coordinate system*/
731 for (i=0;i<3;i++) {
732 switch (pos[i]) {
733 case 1: xs[i]=1; xc[i]=0; xl[i]=C[s];break;
734 case 2: xs[i]=C[s]; xc[i]=oM[s]; xl[i]=pM[s];break;
735 case 3: xs[i]=C[s]*pM[s];xc[i]=oK[s]; xl[i]=pK[s];break;
738 /*input order is different for test program to match FFTW order
739 (important for complex to real)*/
740 if (plan->flags&FFT5D_BACKWARD) {
741 rotate(xs);
742 rotate(xl);
743 rotate(xc);
744 rotate(NG);
745 if (plan->flags&FFT5D_ORDER_YZ) {
746 rotate(xs);
747 rotate(xl);
748 rotate(xc);
749 rotate(NG);
752 if (plan->flags&FFT5D_REALCOMPLEX && ((!(plan->flags&FFT5D_BACKWARD) && s==0) || (plan->flags&FFT5D_BACKWARD && s==2))) {
753 xl[0] = rC[s];
757 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
758 int x,y,z,l;
759 int *coor = plan->coor;
760 int xs[3],xl[3],xc[3],NG[3];
761 int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
762 compute_offsets(plan,xs,xl,xc,NG,s);
763 fprintf(debug,txt,coor[0],coor[1],s);
764 /*printf("xs: %d %d %d, xl: %d %d %d\n",xs[0],xs[1],xs[2],xl[0],xl[1],xl[2]);*/
765 for(z=0;z<xl[2];z++) {
766 for(y=0;y<xl[1];y++) {
767 fprintf(debug,"%d %d: ",coor[0],coor[1]);
768 for (x=0;x<xl[0];x++) {
769 for (l=0;l<ll;l++) {
770 fprintf(debug,"%f ",((real*)lin)[(z*xs[2]+y*xs[1])*2+(x*xs[0])*ll+l]);
772 fprintf(debug,",");
774 fprintf(debug,"\n");
779 void fft5d_execute(fft5d_plan plan,fft5d_time times) {
780 t_complex *lin = plan->lin;
781 t_complex *lout = plan->lout;
783 gmx_fft_t *p1d=plan->p1d;
784 #ifdef FFT5D_MPI_TRANSPOSE
785 FFTW(plan) *mpip=plan->mpip;
786 #endif
787 #ifdef GMX_MPI
788 MPI_Comm *cart=plan->cart;
789 #endif
791 double time_fft=0,time_local=0,time_mpi[2]={0},time=0;
792 int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
793 *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
794 int s=0;
797 #ifdef GMX_FFT_FFTW3
798 if (plan->p3d) {
799 if (times!=0)
800 time=MPI_Wtime();
801 FFTW(execute)(plan->p3d);
802 if (times!=0)
803 times->fft+=MPI_Wtime()-time;
804 return;
806 #endif
808 /*lin: x,y,z*/
809 if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: copy in lin\n", s, plan);
810 for (s=0;s<2;s++) {
811 if (times!=0)
812 time=MPI_Wtime();
814 if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0) {
815 gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
816 } else {
817 gmx_fft_many_1d( p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin,lout);
819 if (times!=0)
820 time_fft+=MPI_Wtime()-time;
822 if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
824 #ifdef GMX_MPI
825 if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] !=0 && P[s]>1 )
827 if (times!=0)
828 time=MPI_Wtime();
829 /*prepare for AllToAll
830 1. (most outer) axes (x) is split into P[s] parts of size N[s]
831 for sending*/
832 splitaxes(lin,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s]);
834 if (times!=0)
836 time_local+=MPI_Wtime()-time;
838 /*send, recv*/
839 time=MPI_Wtime();
842 #ifdef FFT5D_MPI_TRANSPOSE
843 FFTW(execute)(mpip[s]);
844 #else
845 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
846 MPI_Alltoall(lin,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
847 else
848 MPI_Alltoall(lin,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
849 #endif /*FFT5D_MPI_TRANSPOSE*/
850 if (times!=0)
851 time_mpi[s]=MPI_Wtime()-time;
853 #endif /*GMX_MPI*/
856 if (times!=0)
857 time=MPI_Wtime();
858 /*bring back in matrix form
859 thus make new 1. axes contiguos
860 also local transpose 1 and 2/3 */
861 if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
862 joinAxesTrans13(lin,lout,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
863 else
864 joinAxesTrans12(lin,lout,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
865 if (times!=0)
866 time_local+=MPI_Wtime()-time;
868 if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
870 /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
873 if (times!=0)
874 time=MPI_Wtime();
875 if (plan->flags&FFT5D_INPLACE) lout=lin;
876 if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
877 gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
878 } else {
879 gmx_fft_many_1d( p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD, lin,lout);
882 if (times!=0)
883 time_fft+=MPI_Wtime()-time;
884 if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
885 /*if (debug) print_localdata(lout, "%d %d: FFT in y\n", N1, M, K0, YZX, coor);*/
887 if (times!=0) {
888 times->fft+=time_fft;
889 times->local+=time_local;
890 times->mpi2+=time_mpi[1];
891 times->mpi1+=time_mpi[0];
895 void fft5d_destroy(fft5d_plan plan) {
896 int s;
897 for (s=0;s<3;s++) {
898 gmx_many_fft_destroy(plan->p1d[s]);
899 if (plan->iNin[s]) {
900 free(plan->iNin[s]);
901 plan->iNin[s]=0;
903 if (plan->oNin[s]) {
904 free(plan->oNin[s]);
905 plan->oNin[s]=0;
907 if (plan->iNout[s]) {
908 free(plan->iNout[s]);
909 plan->iNout[s]=0;
911 if (plan->oNout[s]) {
912 free(plan->oNout[s]);
913 plan->oNout[s]=0;
916 #ifdef GMX_FFT_FFTW3
917 FFTW_LOCK;
918 #ifdef FFT5D_MPI_TRANSPOS
919 for (s=0;s<2;s++)
920 FFTW(destroy_plan)(plan->mpip[s]);
921 #endif /* FFT5D_MPI_TRANSPOS */
922 #endif /* GMX_FFT_FFTW3 */
924 /*We can't free lin/lout here - is allocated by gmx_calloc_aligned which can't be freed*/
927 #ifdef FFT5D_THREADS
928 #ifdef FFT5D_FFTW_THREADS
929 FFTW(cleanup_threads)();
930 #endif
931 #endif
933 free(plan);
936 /*Is this better than direct access of plan? enough data?
937 here 0,1 reference divided by which processor grid dimension (not FFT step!)*/
938 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor) {
939 *N1=plan->N[0];
940 *M0=plan->M[0];
941 *K1=plan->K[0];
942 *K0=plan->N[1];
944 *coor=plan->coor;
948 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting
949 of processor dimensions*/
950 fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout) {
951 MPI_Comm cart[2]={0};
952 #ifdef GMX_MPI
953 int size=1,prank=0;
954 int P[2];
955 int coor[2];
956 int wrap[]={0,0};
957 MPI_Comm gcart;
958 int rdim1[] = {0,1}, rdim2[] = {1,0};
960 MPI_Comm_size(comm,&size);
961 MPI_Comm_rank(comm,&prank);
963 if (P0==0) P0 = lfactor(size);
964 if (size%P0!=0) {
965 if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
966 P0 = lfactor(size);
969 P[0] = P0; P[1]=size/P0; /*number of processors in the two dimensions*/
971 /*Difference between x-y-z regarding 2d decomposition is whether they are
972 distributed along axis 1, 2 or both*/
974 MPI_Cart_create(comm,2,P,wrap,1,&gcart); /*parameter 4: value 1: reorder*/
975 MPI_Cart_get(gcart,2,P,wrap,coor);
976 MPI_Cart_sub(gcart, rdim1 , &cart[0]);
977 MPI_Cart_sub(gcart, rdim2 , &cart[1]);
978 #endif
979 return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout);
984 /*prints in original coordinate system of data (as the input to FFT)*/
985 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normalize) {
986 int xs[3],xl[3],xc[3],NG[3];
987 int x,y,z,l;
988 int *coor = plan->coor;
989 int ll=2; /*compare ll values per element (has to be 2 for complex)*/
990 if (plan->flags&FFT5D_REALCOMPLEX && plan->flags&FFT5D_BACKWARD)
992 ll=1;
995 compute_offsets(plan,xs,xl,xc,NG,2);
996 if (plan->flags&FFT5D_DEBUG) printf("Compare2\n");
997 for (z=0;z<xl[2];z++) {
998 for(y=0;y<xl[1];y++) {
999 if (plan->flags&FFT5D_DEBUG) printf("%d %d: ",coor[0],coor[1]);
1000 for (x=0;x<xl[0];x++) {
1001 for (l=0;l<ll;l++) { /*loop over real/complex parts*/
1002 real a,b;
1003 a=((real*)lin)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1004 if (normalize) a/=plan->rC[0]*plan->rC[1]*plan->rC[2];
1005 if (!bothLocal)
1006 b=((real*)in)[((z+xc[2])*NG[0]*NG[1]+(y+xc[1])*NG[0])*2+(x+xc[0])*ll+l];
1007 else
1008 b=((real*)in)[(z*xs[2]+y*xs[1])*2+x*xs[0]*ll+l];
1009 if (plan->flags&FFT5D_DEBUG) {
1010 printf("%f %f, ",a,b);
1011 } else {
1012 if (fabs(a-b)>2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS) {
1013 printf("result incorrect on %d,%d at %d,%d,%d: FFT5D:%f reference:%f\n",coor[0],coor[1],x,y,z,a,b);
1015 /* assert(fabs(a-b)<2*NG[0]*NG[1]*NG[2]*GMX_REAL_EPS);*/
1018 if (plan->flags&FFT5D_DEBUG) printf(",");
1020 if (plan->flags&FFT5D_DEBUG) printf("\n");