1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * Gromacs 4.0 Copyright (c) 1991-2003
5 * David van der Spoel, Erik Lindahl, University of Groningen.
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
12 * To help us fund GROMACS development, we humbly ask that you cite
13 * the research papers on the package. Check out http://www.gromacs.org
16 * Grim Ragnarok Overthrows Midgard Amidst Cursing Silence
34 #include "gmx_fatal.h"
36 /* Since c has no support of templates, we get around the type restrictions
37 with macros. Our custom macro names obscure the float vs double interfaces
40 #define ACML_FFT1DX zfft1dx
41 #define ACML_FFT2DX zfft2dx
42 #define ACML_FFT3DX zfft3dy
43 #define ACML_FFT1DMX zfft1mx
45 #define ACML_RCFFT1D dzfft
46 #define ACML_CRFFT1D zdfft
47 #define ACML_RCFFT1DM dzfftm
48 #define ACML_CRFFT1DM zdfftm
50 #define acmlComplex doublecomplex
53 #define ACML_FFT1DX cfft1dx
54 #define ACML_FFT2DX cfft2dx
55 #define ACML_FFT3DX cfft3dy
56 #define ACML_FFT1DMX cfft1mx
58 #define ACML_RCFFT1D scfft
59 #define ACML_CRFFT1D csfft
60 #define ACML_RCFFT1DM scfftm
61 #define ACML_CRFFT1DM csfftm
63 #define acmlComplex complex
66 void cPrintArray( t_complex
* arr
, int outer
, int mid
, int inner
)
71 for( i
= 0; i
< outer
; ++i
)
73 for( j
= 0; j
< mid
; ++j
)
75 for( k
= 0; k
< inner
; ++k
)
77 printf("%f,%f ", arr
[i
*inner
*mid
+j
*inner
+k
].re
, arr
[i
*inner
*mid
+j
*inner
+k
].im
);
87 void rPrintArray( real
* arr
, int outer
, int mid
, int inner
)
92 for( i
= 0; i
< outer
; ++i
)
94 for( j
= 0; j
< mid
; ++j
)
96 for( k
= 0; k
< inner
; ++k
)
98 printf("%f ", arr
[i
*inner
*mid
+j
*inner
+k
] );
108 /* Contents of the AMD ACML FFT fft datatype.
110 * Note that this is one of several possible implementations of gmx_fft_t.
112 * The ACML _API_ supports 1D,2D, and 3D transforms, including real-to-complex.
113 * Unfortunately the actual library implementation does not support 2D&3D real
114 * transforms as of version 4.2 In addition, ACML outputs their hermitian data
115 * in such a form that it can't be fed straight back into their complex API's.
117 * To work around this we roll our own 2D and 3D real-to-complex transforms,
118 * using separate X/Y/Z handles defined to perform (ny*nz), (nx*nz), and
119 * (nx*ny) transforms at once when necessary. To perform strided multiple
120 * transforms out-of-place (i.e., without padding in the last dimension)
121 * on the fly we also need to separate the forward and backward
122 * handles for real-to-complex/complex-to-real data permutation.
124 * So, the handles are enumerated as follows:
126 * 1D FFT (real too): Index 0 is the handle for the entire FFT
127 * 2D complex FFT: Index 0 is the handle for the entire FFT
128 * 3D complex FFT: Index 0 is the handle for the entire FFT
129 * 2D, real FFT: 0=FFTx, 1=FFTrc, 2=FFTcr handle
130 * 3D, real FFT: 0=FFTx, 1=FFTy, 2=FFTrc, 3=FFTcr handle
132 * AMD people reading this: Learn from FFTW what a good interface looks like :-)
139 * r2c = index0, c2r = index1
144 int ndim
; /**< Number of dimensions in FFT */
145 int nx
; /**< Length of X transform */
146 int ny
; /**< Length of Y transform */
147 int nz
; /**< Length of Z transform */
148 int real_fft
; /**< 1 if real FFT, otherwise 0 */
151 /* ACML's real FFT support leaves the data in a swizzled 'hermitian' format. The
152 problem with this is that you can't just feed this data back into ACML :( A
153 pre-processing step is required to transform the hermitian swizzled complex
154 values into unswizzled complex values
155 -This function assumes that the complex conjugates are not expanded; the
156 calling function should not assume that they exist.
157 -This function pads all real numbers with 0's for imaginary components.
159 void hermitianUnPacking( float scale
, int row
, int col
, void* src
, int srcPitch
, void* dst
)
162 int loopCount
= (col
-1)/2;
163 int hermLength
= mid
+ 1;
167 /* Currently this function is not written to handle aliased input/output pointers */
168 assert( src
!= dst
);
170 /* These two pointers could be aliased on top of each other; be careful */
171 real
* realData
= (real
*)src
;
172 t_complex
* hermData
= (t_complex
*)dst
;
174 /* We have to expand the data from real to complex, which will clobber data if you start
175 * from the beginning. For this reason, we start at the last array and work backwards,
176 * however, we skip the very first row (nFFT == 0) because we still have data collision
177 * on that row. We copy the first row into a temporary buffer.
179 for( nFFT
= row
-1; nFFT
>= 0; --nFFT
)
181 realData
= (real
*)src
+ (nFFT
*srcPitch
);
182 hermData
= (t_complex
*)dst
+ (nFFT
*hermLength
);
184 /* The first complex number is real valued */
185 t_complex tmp
= { realData
[0]*scale
, 0 };
189 for(i
= 1; i
<= loopCount
; ++i
)
191 t_complex tmp
= { realData
[i
]*scale
, realData
[col
-i
]*scale
};
195 /* A little cleanup for even lengths */
196 if( loopCount
!= mid
)
198 /* The n/2 number is real valued */
199 t_complex tmp
= { realData
[mid
]*scale
, 0 };
207 /* ACML's real FFT support requires the data in a swizzled 'hermitian' format.
208 This is a non-standard format, and requires us to manually swizzle the data :(
209 A pre-processing step is required to transform the complex values into the
210 swizzled hermitian format
212 void hermitianPacking( float scale
, int row
, int col
, void* src
, void* dst
, int dstPitch
)
215 int loopCount
= (col
-1)/2;
216 int hermLength
= mid
+ 1;
223 /* Currently this function is not written to handle aliased input/output pointers */
224 assert( src
!= dst
);
226 for( nFFT
= 0; nFFT
< row
; ++nFFT
)
228 realData
= (real
*)dst
+ (nFFT
*dstPitch
);
229 hermData
= (t_complex
*)src
+ (nFFT
*hermLength
);
231 /* The first complex number is real valued */
232 realData
[0] = hermData
[0].re
* scale
;
234 /* ACML's complex to real function is documented as only handling 'forward'
235 * transforms, which mean we have to manually conjugate the data before doing
236 * the backwards transform. */
238 for(i
= 1; i
<= loopCount
; ++i
)
240 realData
[i
] = hermData
[i
].re
* scale
;
241 realData
[col
-i
] = -hermData
[i
].im
* scale
;
244 /* A little cleanup for even lengths */
245 if( loopCount
!= mid
)
247 /* The n/2 number is real valued */
248 realData
[mid
] = hermData
[mid
].re
* scale
;
256 /* Gromacs expects results that are returned to be in a packed form; No
257 complex conjugate's. This routine takes a 2D array of complex data,
258 and compresses it to fit into 'real' form by removing all the complex
259 conjugates and any known imaginary 0's. Data is expected in row-major
260 form, with ny describing the length of a row.
262 void compressConj2D( int nX
, int nY
, void* src
, void* dst
)
264 /* These two pointers are aliased on top of each other; be careful */
265 real
* realData
= (real
*)dst
;
266 t_complex
* complexData
;
268 int halfRows
= nX
/2 + 1;
269 int halfCols
= nY
/2 + 1;
270 int rowOdd
= nX
& 0x1;
271 int colOdd
= nY
& 0x1;
275 for( nRow
= 0; nRow
< nX
; ++nRow
)
277 complexData
= (t_complex
*)src
+ (nRow
*halfCols
);
280 for( nCol
= 0; nCol
< halfCols
; ++nCol
)
282 if(( nRow
== 0) && (nCol
== 0 ))
284 /* The complex number is real valued */
285 realData
[0] = complexData
[nCol
].re
;
288 /* Last column is real if even column length */
289 else if( (nRow
== 0) && ( !colOdd
&& ( nCol
== (halfCols
-1) ) ) )
291 /* The complex number is real valued */
292 realData
[0] = complexData
[nCol
].re
;
295 /* The middle value of the first column is real if we have an even number of rows and
296 * the last column of middle row is real if we have an even number of rows and columns */
297 else if( !rowOdd
&& ( ( nRow
== (halfRows
-1) ) && ( ( nCol
== 0 ) || ( !colOdd
&& ( nCol
== (halfCols
-1) ) ) ) ) )
299 /* The complex number is real valued */
300 realData
[0] = complexData
[nCol
].re
;
303 else if( (nCol
== 0) || ( !colOdd
&& ( nCol
== (halfCols
-1) ) ) )
305 /* Last half of real columns are conjugates */
306 if( nRow
< halfRows
)
308 /* Copy a complex number, which is two reals */
309 realData
[0] = complexData
[nCol
].re
;
310 realData
[1] = complexData
[nCol
].im
;
316 /* Copy a complex number, which is two reals */
317 realData
[0] = complexData
[nCol
].re
;
318 realData
[1] = complexData
[nCol
].im
;
327 /* Gromacs expects results that are returned to be in a packed form; No
328 complex conjugate's. This routine takes a 2D array of real data,
329 and uncompresses it to fit into 'complex' form by creating all the complex
330 conjugates and any known imaginary 0's. Data is expected in row-major
331 form, with ny describing the length of a row.
333 void unCompressConj2D( int nX
, int nY
, void* src
, void* dst
)
335 /* These two pointers are aliased on top of each other; be careful */
336 real
* realData
= (real
*)src
;
337 t_complex
* complexData
= (t_complex
*)dst
;
339 int halfRows
= nX
/2 + 1;
340 int halfCols
= nY
/2 + 1;
341 int rowOdd
= nX
& 0x1;
342 int colOdd
= nY
& 0x1;
345 for( nRow
= 0; nRow
< nX
; ++nRow
)
348 for( nCol
= 0; nCol
< halfCols
; ++nCol
)
350 int iIndex
= nRow
*halfCols
+ nCol
;
352 if(( nRow
== 0) && (nCol
== 0 ))
354 /* The complex number is real valued */
355 complexData
[iIndex
].re
= realData
[0];
356 complexData
[iIndex
].im
= 0;
359 /* Last column is real if even column length */
360 else if( (nRow
== 0) && ( !colOdd
&& ( nCol
== (halfCols
-1) ) ) )
362 /* The complex number is real valued */
363 complexData
[iIndex
].re
= realData
[0];
364 complexData
[iIndex
].im
= 0;
367 /* The middle value of the first column is real if we have an even number of rows and
368 * the last column of middle row is real if we have an even number of rows and columns */
369 else if( !rowOdd
&& ( ( nRow
== (halfRows
-1) ) && ( ( nCol
== 0 ) || ( !colOdd
&& ( nCol
== (halfCols
-1) ) ) ) ) )
371 /* The complex number is real valued */
372 complexData
[iIndex
].re
= realData
[0];
373 complexData
[iIndex
].im
= 0;
376 else if( (nCol
== 0) || (!colOdd
&& ( nCol
== (halfCols
-1) ) ) )
378 /* Last half of real columns are conjugates */
379 if( nRow
< halfRows
)
381 /* Copy a complex number, which is two reals */
382 complexData
[iIndex
].re
= realData
[0];
383 complexData
[iIndex
].im
= realData
[1];
388 int oIndex
= (nX
-nRow
)*halfCols
+ nCol
;
389 complexData
[iIndex
].re
= complexData
[oIndex
].re
;
390 complexData
[iIndex
].im
= -complexData
[oIndex
].im
;
395 /* Copy a complex number, which is two reals */
396 complexData
[iIndex
].re
= realData
[0];
397 complexData
[iIndex
].im
= realData
[1];
406 /* Support routine for the 1D array tranforms. ACML does not support a MODE
407 * flag on the real/complex interface. It assumes a 'forward' transform both
408 * directions, so that requires a manual conjugation of the imaginary comps. */
409 void negateConj( int len
, void* src
)
411 real
* imag
= (real
*)src
;
412 int half
= len
/2 + 1;
415 for( i
= half
; i
< len
; ++i
)
424 gmx_fft_init_1d(gmx_fft_t
* pfft
,
426 enum gmx_fft_flag flags
)
430 acmlComplex
* comm
= NULL
;
436 gmx_fatal(FARGS
,"Invalid opaque FFT datatype pointer.");
441 if( (fft
= malloc(sizeof(struct gmx_fft
))) == NULL
)
446 // Single precision requires 5*nx+100
447 // Double precision requires 3*nx+100
448 if( sizeof( acmlComplex
) == 16 )
449 commSize
= (3*nx
+100)*sizeof( acmlComplex
);
451 commSize
= (5*nx
+100)*sizeof( acmlComplex
);
453 // Allocate communication work array
454 if( (comm
= (acmlComplex
*)malloc( commSize
) ) == NULL
)
459 // Initialize communication work array
460 ACML_FFT1DX( 100, 1.0f
, TRUE
, nx
, NULL
, 1, NULL
, 1, comm
, &info
);
464 gmx_fatal(FARGS
,"Error initializing ACML FFT; status=%d", info
);
465 gmx_fft_destroy( fft
);
479 gmx_fft_init_1d_real(gmx_fft_t
* pfft
,
481 enum gmx_fft_flag flags
)
491 gmx_fatal(FARGS
,"Invalid opaque FFT datatype pointer.");
496 if( (fft
= malloc(sizeof(struct gmx_fft
))) == NULL
)
502 commSize
= (3*nx
+100)*sizeof( float );
504 // Allocate communication work array, r2c
505 if( (commRC
= (real
*)malloc( commSize
) ) == NULL
)
510 // Allocate communication work array, c2r
511 if( (commCR
= (real
*)malloc( commSize
) ) == NULL
)
516 // Initialize communication work array
517 ACML_RCFFT1D( 100, nx
, NULL
, (real
*)commRC
, &info
);
521 gmx_fatal(FARGS
,"Error initializing ACML FFT; status=%d", info
);
522 gmx_fft_destroy( fft
);
526 // Initialize communication work array
527 ACML_CRFFT1D( 100, nx
, NULL
, (real
*)commCR
, &info
);
531 gmx_fatal(FARGS
,"Error initializing ACML FFT; status=%d", info
);
532 gmx_fft_destroy( fft
);
536 /* Allocate scratch work array that ACML uses to splat from hermitian complex format to
537 * full complex format */
538 if( (fft
->realScratch
= (acmlComplex
*)malloc( nx
*sizeof( acmlComplex
) ) ) == NULL
)
546 fft
->comm
[0] = commRC
;
547 fft
->comm
[1] = commCR
;
554 gmx_fft_init_2d(gmx_fft_t
* pfft
,
557 enum gmx_fft_flag flags
)
561 acmlComplex
* comm
= NULL
;
567 gmx_fatal(FARGS
,"Invalid opaque FFT datatype pointer.");
572 if( (fft
= malloc(sizeof(struct gmx_fft
))) == NULL
)
577 // Single precision requires nx*ny+5*(nx+ny)
578 // Double precision requires nx*ny+3*(nx+ny)
579 if( sizeof( acmlComplex
) == 16 )
580 commSize
= (nx
*ny
+3*(nx
+ny
)+200)*sizeof( acmlComplex
);
582 commSize
= (nx
*ny
+5*(nx
+ny
)+200)*sizeof( acmlComplex
);
584 // Allocate communication work array
585 if( (comm
= (acmlComplex
*)malloc( commSize
) ) == NULL
)
590 // Initialize communication work array
591 ACML_FFT2DX( 100, 1.0f
, TRUE
, TRUE
, nx
, ny
, NULL
, 1, nx
, NULL
, 1, nx
, (acmlComplex
*)comm
, &info
);
595 gmx_fatal(FARGS
,"Error initializing ACML FFT; status=%d", info
);
596 gmx_fft_destroy( fft
);
611 gmx_fft_init_2d_real(gmx_fft_t
* pfft
,
614 enum gmx_fft_flag flags
)
618 acmlComplex
* comm
= NULL
;
626 gmx_fatal(FARGS
,"Invalid opaque FFT datatype pointer.");
631 if( (fft
= malloc(sizeof(struct gmx_fft
))) == NULL
)
638 /* Roll our own 2D real transform using multiple transforms in ACML,
639 * since the current ACML versions does not support our storage format,
640 * and all but the most recent don't even have 2D real FFTs.
643 // Single precision requires 5*nx+100
644 // Double precision requires 3*nx+100
645 if( sizeof( acmlComplex
) == 16 )
646 commSize
= (3*nx
+100)*sizeof( acmlComplex
);
648 commSize
= (5*nx
+100)*sizeof( acmlComplex
);
650 // Allocate communication work array
651 if( (comm
= (acmlComplex
*)malloc( commSize
) ) == NULL
)
656 // Initialize communication work array
657 ACML_FFT1DMX( 100, 1.0f
, FALSE
, nyc
, nx
, NULL
, nyc
, 1, NULL
, nyc
, 1, comm
, &info
);
661 gmx_fatal(FARGS
,"Error initializing ACML FFT; status=%d", info
);
662 gmx_fft_destroy( fft
);
666 commSize
= (3*ny
+100)*sizeof( real
);
667 // Allocate communication work array
668 if( (commRC
= (real
*)malloc( commSize
) ) == NULL
)
673 // TODO: Is there no MODE or PLAN for multiple hermetian sequences?
674 // Initialize communication work array
675 ACML_RCFFT1D( 100, ny
, NULL
, commRC
, &info
);
679 gmx_fatal(FARGS
,"Error initializing ACML FFT; status=%d", info
);
680 gmx_fft_destroy( fft
);
684 commSize
= (3*ny
+100)*sizeof( real
);
685 // Allocate communication work array
686 if( (commCR
= (real
*)malloc( commSize
) ) == NULL
)
691 // TODO: Is there no MODE or PLAN for multiple hermetian sequences?
692 // Initialize communication work array
693 ACML_CRFFT1D( 100, ny
, NULL
, commCR
, &info
);
697 gmx_fatal(FARGS
,"Error initializing ACML FFT; status=%d", info
);
698 gmx_fft_destroy( fft
);
702 /* Allocate scratch work array that ACML uses to splat from hermitian complex format to
703 * full complex format */
704 if( (fft
->realScratch
= (acmlComplex
*)malloc( (nx
*ny
)*sizeof( acmlComplex
) ) ) == NULL
)
714 fft
->comm
[1] = commRC
;
715 fft
->comm
[2] = commCR
;
722 gmx_fft_init_3d(gmx_fft_t
* pfft
,
726 enum gmx_fft_flag flags
)
730 acmlComplex
* comm
= NULL
;
736 gmx_fatal(FARGS
,"Invalid opaque FFT datatype pointer.");
741 if( (fft
= malloc(sizeof(struct gmx_fft
))) == NULL
)
746 commSize
= (nx
*ny
*nz
+4*(nx
+ny
+nz
)+300)*sizeof( acmlComplex
);
748 // Allocate communication work array
749 if( (comm
= (acmlComplex
*)malloc( commSize
) ) == NULL
)
754 ACML_FFT3DX( 100, 1.0f
, TRUE
, nx
, ny
, nz
, NULL
, 1, nx
, nx
*ny
, NULL
, 1, nx
, nx
*ny
, comm
, commSize
, &info
);
768 gmx_fft_init_3d_real(gmx_fft_t
* pfft
,
772 enum gmx_fft_flag flags
)
776 acmlComplex
* commX
= NULL
;
777 acmlComplex
* commY
= NULL
;
784 gmx_fatal(FARGS
,"Invalid opaque FFT datatype pointer.");
789 /* nzc = (nz/2 + 1); */
791 if( (fft
= malloc(sizeof(struct gmx_fft
))) == NULL
)
796 /* Roll our own 3D real transform using multiple transforms in ACML,
797 * since the current ACML versions does not support 2D
798 * or 3D real transforms.
802 * ny*nz complex-to-complex transforms, length nx
803 * transform distance: 1
804 * element strides: ny*nz
807 /* Single precision requires 5*nx+100
808 Double precision requires 3*nx+100
810 if( sizeof( acmlComplex
) == 16 )
811 commSize
= (3*nx
+100)*sizeof( acmlComplex
);
813 commSize
= (5*nx
+100)*sizeof( acmlComplex
);
815 /* Allocate communication work array */
816 if( (commX
= (acmlComplex
*)malloc( commSize
) ) == NULL
)
821 /* Initialize communication work array */
822 ACML_FFT1DMX( 100, 1.0f
, TRUE
, ny
*nz
, nx
, NULL
, ny
*nz
, 1, NULL
, ny
*nz
, 1, commX
, &info
);
826 gmx_fatal(FARGS
,"Error initializing ACML FFT; status=%d", info
);
827 gmx_fft_destroy( fft
);
832 * We cannot do all NX*NZ transforms at once, so define a handle to do
833 * NZ transforms, and then execute it NX times.
834 * nz complex-to-complex transforms, length ny
835 * transform distance: 1
836 * element strides: nz
838 /* Single precision requires 5*nx+100
839 Double precision requires 3*nx+100
841 if( sizeof( acmlComplex
) == 16 )
842 commSize
= (3*ny
+100)*sizeof( acmlComplex
);
844 commSize
= (5*ny
+100)*sizeof( acmlComplex
);
846 /* Allocate communication work array */
847 if( (commY
= (acmlComplex
*)malloc( commSize
) ) == NULL
)
852 /* Initialize communication work array */
853 /* We want to do multiple 1D FFT's in z-y plane, so we have to loop over x
854 * dimension recalculating z-y plane for each slice.
856 ACML_FFT1DMX( 100, 1.0f
, TRUE
, nz
, ny
, NULL
, nz
, 1, NULL
, nz
, 1, commY
, &info
);
860 gmx_fatal(FARGS
,"Error initializing ACML FFT; status=%d", info
);
861 gmx_fft_destroy( fft
);
866 * nx*ny real-to-complex transforms, length nz
867 * transform distance: nzc*2 -> nzc*2
871 commSize
= (3*nz
+100)*sizeof( real
);
872 /* Allocate communication work array */
873 if( (commRC
= (real
*)malloc( commSize
) ) == NULL
)
878 /* TODO: Is there no MODE or PLAN for multiple hermetian sequences? */
879 // Initialize communication work array
880 ACML_RCFFT1D( 100, nz
, NULL
, commRC
, &info
);
885 gmx_fatal(FARGS
,"Error initializing ACML FFT; status=%d", info
);
886 gmx_fft_destroy( fft
);
890 /* Out-of-place complex-to-real (affects distance) Z FFT:
891 * nx*ny real-to-complex transforms, length nz
892 * transform distance: nzc*2 -> nz
895 commSize
= (3*nz
+100)*sizeof( real
);
896 /* Allocate communication work array */
897 if( (commCR
= (real
*)malloc( commSize
) ) == NULL
)
902 // Initialize communication work array
903 ACML_CRFFT1D( 100, nz
, NULL
, commCR
, &info
);
907 gmx_fatal(FARGS
,"Error initializing ACML FFT; status=%d", info
);
908 gmx_fft_destroy( fft
);
912 /* Allocate scratch work array that ACML uses to splat from hermitian complex format to
913 * full complex format */
914 if( (fft
->realScratch
= (acmlComplex
*)malloc( (nx
*ny
*nz
)*sizeof( acmlComplex
) ) ) == NULL
)
924 fft
->comm
[0] = commX
;
925 fft
->comm
[1] = commY
;
926 fft
->comm
[2] = commRC
;
927 fft
->comm
[3] = commCR
;
934 gmx_fft_1d(gmx_fft_t fft
,
935 enum gmx_fft_direction dir
,
939 int inpl
= (in_data
== out_data
);
941 int mode
= (dir
== GMX_FFT_FORWARD
) ? -1 : 1;
943 if( (fft
->real_fft
== 1) || (fft
->ndim
!= 1) ||
944 ((dir
!= GMX_FFT_FORWARD
) && (dir
!= GMX_FFT_BACKWARD
)) )
946 gmx_fatal(FARGS
,"FFT plan mismatch - bad plan or direction.");
950 ACML_FFT1DX( mode
, 1.0f
, inpl
, fft
->nx
, in_data
, 1, out_data
, 1, fft
->comm
[0], &info
);
954 gmx_fatal(FARGS
,"Error executing AMD ACML FFT.");
962 gmx_fft_1d_real(gmx_fft_t fft
,
963 enum gmx_fft_direction dir
,
970 if( (fft
->real_fft
!= 1) || (fft
->ndim
!= 1) ||
971 ((dir
!= GMX_FFT_REAL_TO_COMPLEX
) && (dir
!= GMX_FFT_COMPLEX_TO_REAL
)) )
973 gmx_fatal(FARGS
,"FFT plan mismatch - bad plan or direction.");
977 /* I apply a correction scale to the automatic scaling that was done in the real-complex step */
978 float recipCorrection
= sqrtf( nx
);
980 /* ACML needs to do complex2complex intermediate transforms, which will not fit in the amount
981 * of memory allocated by the gromacs program, which assumes real. The real2complex transforms
982 * are also only in-place, so we manually do a memcpy() first */
983 if(dir
==GMX_FFT_REAL_TO_COMPLEX
)
985 memcpy( fft
->realScratch
, inPtr
, nx
*sizeof( real
) );
986 ACML_RCFFT1D( 1, nx
, fft
->realScratch
, fft
->comm
[0], &info
);
988 hermitianUnPacking( recipCorrection
, 1, nx
, fft
->realScratch
, 0, outPtr
);
992 memcpy( fft
->realScratch
, inPtr
, nx
*sizeof( t_complex
) );
993 hermitianPacking( recipCorrection
, 1, nx
, fft
->realScratch
, outPtr
, 0 );
995 ACML_CRFFT1D( 1, nx
, outPtr
, fft
->comm
[1], &info
);
1000 gmx_fatal(FARGS
,"Error executing AMD ACML FFT.");
1008 gmx_fft_2d(gmx_fft_t fft
,
1009 enum gmx_fft_direction dir
,
1013 int inpl
= (in_data
== out_data
);
1015 int mode
= (dir
== GMX_FFT_FORWARD
) ? -1 : 1;
1017 if( (fft
->real_fft
== 1) || (fft
->ndim
!= 2) ||
1018 ((dir
!= GMX_FFT_FORWARD
) && (dir
!= GMX_FFT_BACKWARD
)) )
1020 gmx_fatal(FARGS
,"FFT plan mismatch - bad plan or direction.");
1024 ACML_FFT2DX( mode
, 1.0f
, TRUE
, inpl
, fft
->nx
, fft
->ny
,
1025 in_data
, 1, fft
->nx
, out_data
, 1, fft
->nx
, fft
->comm
[0], &info
);
1029 gmx_fatal(FARGS
,"Error executing AMD ACML FFT.");
1038 gmx_fft_2d_real(gmx_fft_t fft
,
1039 enum gmx_fft_direction dir
,
1043 int inpl
= (inPtr
== outPtr
);
1049 int nyc
= (fft
->ny
/2 + 1);
1052 /* Depending on whether we are calculating the FFT in place or not, the rows of the
1053 * 2D FFT will be packed or not.
1057 /* Not packed; 1 or 2 extra reals padding at end of each row */
1065 if( (fft
->real_fft
!= 1) || (fft
->ndim
!= 2) ||
1066 ((dir
!= GMX_FFT_REAL_TO_COMPLEX
) && (dir
!= GMX_FFT_COMPLEX_TO_REAL
)) )
1068 gmx_fatal(FARGS
,"FFT plan mismatch - bad plan or direction.");
1072 /* I apply a correction scale to the automatic scaling that was done in the real-complex step */
1073 float recipCorrection
= sqrtf( ny
);
1075 if(dir
==GMX_FFT_REAL_TO_COMPLEX
)
1077 /* ACML needs to do complex2complex intermediate transforms, which will not fit in the amount
1078 * of memory allocated by the gromacs program, which assumes real. The real2complex transforms
1079 * are also only in-place, so we manually do a memcpy() first */
1082 memcpy( outPtr
, inPtr
, nx
*ny
*sizeof( real
) );
1085 /* real-to-complex in Y dimension, in-place
1086 * SCFFTM is not valid to call here. SCFFTM does not take any stride information, and assumes that
1087 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1088 * on even or odd lengths.
1090 for( i
= 0; i
< nx
; ++i
)
1093 ACML_RCFFT1D( 1, ny
, (real
*)outPtr
+i
*nyLen
, fft
->comm
[1], &info
);
1096 hermitianUnPacking( 1.0f
, nx
, ny
, outPtr
, nyLen
, fft
->realScratch
);
1098 /* complex-to-complex in X dimension */
1100 ACML_FFT1DMX( -1, recipCorrection
, FALSE
, nyc
, nx
, fft
->realScratch
, nyc
, 1,
1101 outPtr
, nyc
, 1, fft
->comm
[0], &info
);
1105 /* complex-to-complex in X dimension, in-place */
1106 ACML_FFT1DMX( 1, recipCorrection
, FALSE
, nyc
, nx
, inPtr
, nyc
, 1,
1107 fft
->realScratch
, nyc
, 1, fft
->comm
[0], &info
);
1109 hermitianPacking( 1.0f
, nx
, ny
, fft
->realScratch
, outPtr
, nyLen
);
1111 /* complex-to-real in Y dimension
1112 * CSFFTM is not valid to call here. SCFFTM does not take any stride information, and assumes that
1113 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1114 * on even or odd lengths.
1118 for( i
= 0; i
< nx
; ++i
)
1121 ACML_CRFFT1D( 1, ny
, (real
*)outPtr
+i
*nyLen
, fft
->comm
[2], &info
);
1128 gmx_fatal(FARGS
,"Error executing AMD ACML FFT.");
1137 gmx_fft_3d(gmx_fft_t fft
,
1138 enum gmx_fft_direction dir
,
1142 int mode
= (dir
== GMX_FFT_FORWARD
) ? -1 : 1;
1143 int inpl
= ( in_data
== out_data
);
1151 if( (fft
->real_fft
== 1) || (fft
->ndim
!= 3) ||
1152 ((dir
!= GMX_FFT_FORWARD
) && (dir
!= GMX_FFT_BACKWARD
)) )
1154 gmx_fatal(FARGS
,"FFT plan mismatch - bad plan or direction.");
1158 commSize
= (nx
*ny
*nz
+4*(nx
+ny
+nz
)+300)*sizeof( acmlComplex
);
1160 ACML_FFT3DX( mode
, 1.0f
, inpl
, nx
, ny
, nz
, in_data
, 1, nx
, nx
*ny
,
1161 out_data
, 1, nx
, nx
*ny
, fft
->comm
[0], commSize
, &info
);
1165 gmx_fatal(FARGS
,"Error executing AMD ACML FFT.");
1173 gmx_fft_3d_real(gmx_fft_t fft
,
1174 enum gmx_fft_direction dir
,
1178 int inpl
= (inPtr
== outPtr
);
1181 int nx
,ny
,nz
, nzLen
= 0;
1186 int nzc
= (nz
/2 + 1);
1188 /* Depending on whether we are calculating the FFT in place or not, the rows of the
1189 * 3D FFT will be packed or not.
1193 /* Not packed; 1 or 2 extra reals padding at end of each row */
1201 if( (fft
->real_fft
!= 1) || (fft
->ndim
!= 3) ||
1202 ((dir
!= GMX_FFT_REAL_TO_COMPLEX
) && (dir
!= GMX_FFT_COMPLEX_TO_REAL
)) )
1204 gmx_fatal(FARGS
,"FFT plan mismatch - bad plan or direction.");
1208 /* I apply a correction scale to the automatic scaling that was done in the real-complex step */
1209 float recipCorrection
= sqrtf( nz
);
1211 if(dir
==GMX_FFT_REAL_TO_COMPLEX
)
1213 /* ACML needs to do complex2complex intermediate transforms, which will not fit in the amount
1214 * of memory allocated by the gromacs program, which assumes real. The real2complex transforms
1215 * are also only in-place, so we manually do a memcpy() first */
1218 memcpy( outPtr
, inPtr
, nx
*ny
*nz
*sizeof( real
) );
1221 /* real-to-complex in Z dimension, in-place
1222 * SCFFTM is not valid to call here. SCFFTM does not take any stride information, and assumes that
1223 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1224 * on even or odd lengths.
1226 for( i
= 0; i
< nx
*ny
; ++i
)
1229 ACML_RCFFT1D( 1, nz
, (real
*)outPtr
+i
*nzLen
, fft
->comm
[2], &info
);
1232 hermitianUnPacking( 1.0f
, nx
*ny
, nz
, outPtr
, nzLen
, fft
->realScratch
);
1234 /* complex-to-complex in Y dimension, in-place */
1238 ACML_FFT1DMX( -1, 1.0f
, TRUE
, nzc
, ny
, (acmlComplex
*)fft
->realScratch
+i
*ny
*nzc
, nzc
, 1,
1239 (acmlComplex
*)fft
->realScratch
+i
*ny
*nzc
, nzc
, 1, fft
->comm
[1], &info
);
1242 /* complex-to-complex in X dimension, in-place */
1244 ACML_FFT1DMX( -1, recipCorrection
, FALSE
, ny
*nzc
, nx
, fft
->realScratch
, ny
*nzc
, 1, outPtr
, ny
*nzc
, 1, fft
->comm
[0], &info
);
1248 /* complex-to-complex in X dimension, from inPtr to work */
1249 ACML_FFT1DMX( 1, recipCorrection
, FALSE
, ny
*nzc
, nx
, inPtr
, ny
*nzc
, 1, fft
->realScratch
, ny
*nzc
, 1, fft
->comm
[0], &info
);
1251 /* complex-to-complex in Y dimension, in-place */
1252 for( i
=0; i
< nx
; i
++ )
1255 ACML_FFT1DMX( 1, 1.0f
, TRUE
, nzc
, ny
, (acmlComplex
*)fft
->realScratch
+i
*ny
*nzc
, nzc
, 1,
1256 (acmlComplex
*)fft
->realScratch
+i
*ny
*nzc
, nzc
, 1, fft
->comm
[1], &info
);
1259 hermitianPacking( 1.0f
, nx
*ny
, nz
, fft
->realScratch
, outPtr
, nzLen
);
1261 /* complex-to-real in Z dimension, in-place
1262 * CSFFTM is not valid to call here. CSFFTM does not take any stride information, and assumes that
1263 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1264 * on even or odd lengths.
1266 for( i
= 0; i
< nx
*ny
; ++i
)
1269 ACML_CRFFT1D( 1, nz
, (real
*)outPtr
+i
*nzLen
, fft
->comm
[3], &info
);
1276 gmx_fatal(FARGS
,"Error executing AMD ACML FFT.");
1284 gmx_fft_destroy(gmx_fft_t fft
)
1292 if(fft
->comm
[d
] != NULL
)
1298 if( fft
->realScratch
!= NULL
)
1299 free( fft
->realScratch
);
1308 #endif /* GMX_FFT_ACML */