Fixed include statements such that double precision version of genborn.c
[gromacs/rigid-bodies.git] / src / mdlib / gmx_fft_acml.c
blob160e4c0600fc1091a4c96f0fa72342522e917684
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * Gromacs 4.0 Copyright (c) 1991-2003
5 * David van der Spoel, Erik Lindahl, University of Groningen.
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
12 * To help us fund GROMACS development, we humbly ask that you cite
13 * the research papers on the package. Check out http://www.gromacs.org
15 * And Hey:
16 * Grim Ragnarok Overthrows Midgard Amidst Cursing Silence
18 #ifdef HAVE_CONFIG_H
19 #include <config.h>
20 #endif
22 #ifdef GMX_FFT_ACML
24 #include <errno.h>
25 #include <stdlib.h>
26 #include <memory.h>
27 #include <math.h>
28 #include <assert.h>
30 #include "acml.h"
33 #include "gmx_fft.h"
34 #include "gmx_fatal.h"
36 /* Since c has no support of templates, we get around the type restrictions
37 with macros. Our custom macro names obscure the float vs double interfaces
39 #ifdef GMX_DOUBLE
40 #define ACML_FFT1DX zfft1dx
41 #define ACML_FFT2DX zfft2dx
42 #define ACML_FFT3DX zfft3dy
43 #define ACML_FFT1DMX zfft1mx
45 #define ACML_RCFFT1D dzfft
46 #define ACML_CRFFT1D zdfft
47 #define ACML_RCFFT1DM dzfftm
48 #define ACML_CRFFT1DM zdfftm
50 #define acmlComplex doublecomplex
51 #else
53 #define ACML_FFT1DX cfft1dx
54 #define ACML_FFT2DX cfft2dx
55 #define ACML_FFT3DX cfft3dy
56 #define ACML_FFT1DMX cfft1mx
58 #define ACML_RCFFT1D scfft
59 #define ACML_CRFFT1D csfft
60 #define ACML_RCFFT1DM scfftm
61 #define ACML_CRFFT1DM csfftm
63 #define acmlComplex complex
64 #endif
66 void cPrintArray( t_complex* arr, int outer, int mid, int inner )
68 int i, j, k;
70 printf("\n");
71 for( i = 0; i < outer; ++i )
73 for( j = 0; j < mid; ++j )
75 for( k = 0; k < inner; ++k )
77 printf("%f,%f ", arr[i*inner*mid+j*inner+k].re, arr[i*inner*mid+j*inner+k].im );
79 printf("\n");
81 printf("\n");
84 return;
87 void rPrintArray( real* arr, int outer, int mid, int inner )
89 int i, j, k;
91 printf("\n");
92 for( i = 0; i < outer; ++i )
94 for( j = 0; j < mid; ++j )
96 for( k = 0; k < inner; ++k )
98 printf("%f ", arr[i*inner*mid+j*inner+k] );
100 printf("\n");
102 printf("\n");
105 return;
108 /* Contents of the AMD ACML FFT fft datatype.
110 * Note that this is one of several possible implementations of gmx_fft_t.
112 * The ACML _API_ supports 1D,2D, and 3D transforms, including real-to-complex.
113 * Unfortunately the actual library implementation does not support 2D&3D real
114 * transforms as of version 4.2 In addition, ACML outputs their hermitian data
115 * in such a form that it can't be fed straight back into their complex API's.
117 * To work around this we roll our own 2D and 3D real-to-complex transforms,
118 * using separate X/Y/Z handles defined to perform (ny*nz), (nx*nz), and
119 * (nx*ny) transforms at once when necessary. To perform strided multiple
120 * transforms out-of-place (i.e., without padding in the last dimension)
121 * on the fly we also need to separate the forward and backward
122 * handles for real-to-complex/complex-to-real data permutation.
124 * So, the handles are enumerated as follows:
126 * 1D FFT (real too): Index 0 is the handle for the entire FFT
127 * 2D complex FFT: Index 0 is the handle for the entire FFT
128 * 3D complex FFT: Index 0 is the handle for the entire FFT
129 * 2D, real FFT: 0=FFTx, 1=FFTrc, 2=FFTcr handle
130 * 3D, real FFT: 0=FFTx, 1=FFTy, 2=FFTrc, 3=FFTcr handle
132 * AMD people reading this: Learn from FFTW what a good interface looks like :-)
135 struct gmx_fft
137 /* Work arrays;
138 * c2c = index0
139 * r2c = index0, c2r = index1
141 void* comm[4];
142 void* realScratch;
144 int ndim; /**< Number of dimensions in FFT */
145 int nx; /**< Length of X transform */
146 int ny; /**< Length of Y transform */
147 int nz; /**< Length of Z transform */
148 int real_fft; /**< 1 if real FFT, otherwise 0 */
151 /* ACML's real FFT support leaves the data in a swizzled 'hermitian' format. The
152 problem with this is that you can't just feed this data back into ACML :( A
153 pre-processing step is required to transform the hermitian swizzled complex
154 values into unswizzled complex values
155 -This function assumes that the complex conjugates are not expanded; the
156 calling function should not assume that they exist.
157 -This function pads all real numbers with 0's for imaginary components.
159 void hermitianUnPacking( float scale, int row, int col, void* src, int srcPitch, void* dst )
161 int mid = col/2;
162 int loopCount = (col-1)/2;
163 int hermLength = mid + 1;
165 int nFFT;
167 /* Currently this function is not written to handle aliased input/output pointers */
168 assert( src != dst );
170 /* These two pointers could be aliased on top of each other; be careful */
171 real* realData = (real*)src;
172 t_complex* hermData = (t_complex*)dst;
174 /* We have to expand the data from real to complex, which will clobber data if you start
175 * from the beginning. For this reason, we start at the last array and work backwards,
176 * however, we skip the very first row (nFFT == 0) because we still have data collision
177 * on that row. We copy the first row into a temporary buffer.
179 for( nFFT = row-1; nFFT >= 0; --nFFT )
181 realData = (real*)src + (nFFT*srcPitch);
182 hermData = (t_complex*)dst + (nFFT*hermLength);
184 /* The first complex number is real valued */
185 t_complex tmp = { realData[0]*scale, 0 };
186 hermData[0] = tmp;
188 int i;
189 for(i = 1; i <= loopCount; ++i)
191 t_complex tmp = { realData[i]*scale, realData[col-i]*scale };
192 hermData[i] = tmp;
195 /* A little cleanup for even lengths */
196 if( loopCount != mid )
198 /* The n/2 number is real valued */
199 t_complex tmp = { realData[mid]*scale, 0 };
200 hermData[mid] = tmp;
204 return;
207 /* ACML's real FFT support requires the data in a swizzled 'hermitian' format.
208 This is a non-standard format, and requires us to manually swizzle the data :(
209 A pre-processing step is required to transform the complex values into the
210 swizzled hermitian format
212 void hermitianPacking( float scale, int row, int col, void* src, void* dst, int dstPitch )
214 int mid = col/2;
215 int loopCount = (col-1)/2;
216 int hermLength = mid + 1;
218 int nFFT;
220 real* realData;
221 t_complex* hermData;
223 /* Currently this function is not written to handle aliased input/output pointers */
224 assert( src != dst );
226 for( nFFT = 0; nFFT < row; ++nFFT )
228 realData = (real*)dst + (nFFT*dstPitch);
229 hermData = (t_complex*)src + (nFFT*hermLength);
231 /* The first complex number is real valued */
232 realData[0] = hermData[0].re * scale;
234 /* ACML's complex to real function is documented as only handling 'forward'
235 * transforms, which mean we have to manually conjugate the data before doing
236 * the backwards transform. */
237 int i;
238 for(i = 1; i <= loopCount; ++i)
240 realData[i] = hermData[i].re * scale;
241 realData[col-i] = -hermData[i].im * scale;
244 /* A little cleanup for even lengths */
245 if( loopCount != mid )
247 /* The n/2 number is real valued */
248 realData[mid] = hermData[mid].re * scale;
253 return;
256 /* Gromacs expects results that are returned to be in a packed form; No
257 complex conjugate's. This routine takes a 2D array of complex data,
258 and compresses it to fit into 'real' form by removing all the complex
259 conjugates and any known imaginary 0's. Data is expected in row-major
260 form, with ny describing the length of a row.
262 void compressConj2D( int nX, int nY, void* src, void* dst )
264 /* These two pointers are aliased on top of each other; be careful */
265 real* realData = (real*)dst;
266 t_complex* complexData;
268 int halfRows = nX/2 + 1;
269 int halfCols = nY/2 + 1;
270 int rowOdd = nX & 0x1;
271 int colOdd = nY & 0x1;
274 int nRow;
275 for( nRow = 0; nRow < nX; ++nRow )
277 complexData = (t_complex*)src + (nRow*halfCols);
279 int nCol;
280 for( nCol = 0; nCol < halfCols; ++nCol )
282 if(( nRow == 0) && (nCol == 0 ))
284 /* The complex number is real valued */
285 realData[0] = complexData[nCol].re;
286 realData += 1;
288 /* Last column is real if even column length */
289 else if( (nRow == 0) && ( !colOdd && ( nCol == (halfCols-1) ) ) )
291 /* The complex number is real valued */
292 realData[0] = complexData[nCol].re;
293 realData += 1;
295 /* The middle value of the first column is real if we have an even number of rows and
296 * the last column of middle row is real if we have an even number of rows and columns */
297 else if( !rowOdd && ( ( nRow == (halfRows-1) ) && ( ( nCol == 0 ) || ( !colOdd && ( nCol == (halfCols-1) ) ) ) ) )
299 /* The complex number is real valued */
300 realData[0] = complexData[nCol].re;
301 realData += 1;
303 else if( (nCol == 0) || ( !colOdd && ( nCol == (halfCols-1) ) ) )
305 /* Last half of real columns are conjugates */
306 if( nRow < halfRows )
308 /* Copy a complex number, which is two reals */
309 realData[0] = complexData[nCol].re;
310 realData[1] = complexData[nCol].im;
311 realData += 2;
314 else
316 /* Copy a complex number, which is two reals */
317 realData[0] = complexData[nCol].re;
318 realData[1] = complexData[nCol].im;
319 realData += 2;
324 return;
327 /* Gromacs expects results that are returned to be in a packed form; No
328 complex conjugate's. This routine takes a 2D array of real data,
329 and uncompresses it to fit into 'complex' form by creating all the complex
330 conjugates and any known imaginary 0's. Data is expected in row-major
331 form, with ny describing the length of a row.
333 void unCompressConj2D( int nX, int nY, void* src, void* dst )
335 /* These two pointers are aliased on top of each other; be careful */
336 real* realData = (real*)src;
337 t_complex* complexData = (t_complex*)dst;
339 int halfRows = nX/2 + 1;
340 int halfCols = nY/2 + 1;
341 int rowOdd = nX & 0x1;
342 int colOdd = nY & 0x1;
344 int nRow;
345 for( nRow = 0; nRow < nX; ++nRow )
347 int nCol;
348 for( nCol = 0; nCol < halfCols; ++nCol )
350 int iIndex = nRow*halfCols + nCol;
352 if(( nRow == 0) && (nCol == 0 ))
354 /* The complex number is real valued */
355 complexData[iIndex].re = realData[0];
356 complexData[iIndex].im = 0;
357 realData += 1;
359 /* Last column is real if even column length */
360 else if( (nRow == 0) && ( !colOdd && ( nCol == (halfCols-1) ) ) )
362 /* The complex number is real valued */
363 complexData[iIndex].re = realData[0];
364 complexData[iIndex].im = 0;
365 realData += 1;
367 /* The middle value of the first column is real if we have an even number of rows and
368 * the last column of middle row is real if we have an even number of rows and columns */
369 else if( !rowOdd && ( ( nRow == (halfRows-1) ) && ( ( nCol == 0 ) || ( !colOdd && ( nCol == (halfCols-1) ) ) ) ) )
371 /* The complex number is real valued */
372 complexData[iIndex].re = realData[0];
373 complexData[iIndex].im = 0;
374 realData += 1;
376 else if( (nCol == 0) || (!colOdd && ( nCol == (halfCols-1) ) ) )
378 /* Last half of real columns are conjugates */
379 if( nRow < halfRows )
381 /* Copy a complex number, which is two reals */
382 complexData[iIndex].re = realData[0];
383 complexData[iIndex].im = realData[1];
384 realData += 2;
386 else
388 int oIndex = (nX-nRow)*halfCols + nCol;
389 complexData[iIndex].re = complexData[oIndex].re;
390 complexData[iIndex].im = -complexData[oIndex].im;
393 else
395 /* Copy a complex number, which is two reals */
396 complexData[iIndex].re = realData[0];
397 complexData[iIndex].im = realData[1];
398 realData += 2;
403 return;
406 /* Support routine for the 1D array tranforms. ACML does not support a MODE
407 * flag on the real/complex interface. It assumes a 'forward' transform both
408 * directions, so that requires a manual conjugation of the imaginary comps. */
409 void negateConj( int len, void* src )
411 real* imag = (real*)src;
412 int half = len/2 + 1;
413 int i;
415 for( i = half; i < len; ++i)
417 imag[i] = -imag[i];
420 return;
424 gmx_fft_init_1d(gmx_fft_t * pfft,
425 int nx,
426 enum gmx_fft_flag flags)
428 gmx_fft_t fft;
429 int info = 0;
430 acmlComplex* comm = NULL;
431 int commSize = 0;
434 if(pfft==NULL)
436 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
437 return EINVAL;
439 *pfft = NULL;
441 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
443 return ENOMEM;
446 // Single precision requires 5*nx+100
447 // Double precision requires 3*nx+100
448 if( sizeof( acmlComplex ) == 16 )
449 commSize = (3*nx+100)*sizeof( acmlComplex );
450 else
451 commSize = (5*nx+100)*sizeof( acmlComplex );
453 // Allocate communication work array
454 if( (comm = (acmlComplex*)malloc( commSize ) ) == NULL )
456 return ENOMEM;
459 // Initialize communication work array
460 ACML_FFT1DX( 100, 1.0f, TRUE, nx, NULL, 1, NULL, 1, comm, &info );
462 if( info != 0 )
464 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
465 gmx_fft_destroy( fft );
466 return info;
469 fft->ndim = 1;
470 fft->nx = nx;
471 fft->real_fft = 0;
472 fft->comm[0] = comm;
474 *pfft = fft;
475 return 0;
479 gmx_fft_init_1d_real(gmx_fft_t * pfft,
480 int nx,
481 enum gmx_fft_flag flags)
483 gmx_fft_t fft;
484 int info = 0;
485 real* commRC = NULL;
486 real* commCR = NULL;
487 int commSize = 0;
489 if(pfft==NULL)
491 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
492 return EINVAL;
494 *pfft = NULL;
496 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
498 return ENOMEM;
502 commSize = (3*nx+100)*sizeof( float );
504 // Allocate communication work array, r2c
505 if( (commRC = (real*)malloc( commSize ) ) == NULL )
507 return ENOMEM;
510 // Allocate communication work array, c2r
511 if( (commCR = (real*)malloc( commSize ) ) == NULL )
513 return ENOMEM;
516 // Initialize communication work array
517 ACML_RCFFT1D( 100, nx, NULL, (real*)commRC, &info );
519 if( info != 0 )
521 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
522 gmx_fft_destroy( fft );
523 return info;
526 // Initialize communication work array
527 ACML_CRFFT1D( 100, nx, NULL, (real*)commCR, &info );
529 if( info != 0 )
531 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
532 gmx_fft_destroy( fft );
533 return info;
536 /* Allocate scratch work array that ACML uses to splat from hermitian complex format to
537 * full complex format */
538 if( (fft->realScratch = (acmlComplex*)malloc( nx*sizeof( acmlComplex ) ) ) == NULL )
540 return ENOMEM;
543 fft->ndim = 1;
544 fft->nx = nx;
545 fft->real_fft = 1;
546 fft->comm[0] = commRC;
547 fft->comm[1] = commCR;
549 *pfft = fft;
550 return 0;
554 gmx_fft_init_2d(gmx_fft_t * pfft,
555 int nx,
556 int ny,
557 enum gmx_fft_flag flags)
559 gmx_fft_t fft;
560 int info = 0;
561 acmlComplex* comm = NULL;
562 int commSize = 0;
565 if(pfft==NULL)
567 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
568 return EINVAL;
570 *pfft = NULL;
572 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
574 return ENOMEM;
577 // Single precision requires nx*ny+5*(nx+ny)
578 // Double precision requires nx*ny+3*(nx+ny)
579 if( sizeof( acmlComplex ) == 16 )
580 commSize = (nx*ny+3*(nx+ny)+200)*sizeof( acmlComplex );
581 else
582 commSize = (nx*ny+5*(nx+ny)+200)*sizeof( acmlComplex );
584 // Allocate communication work array
585 if( (comm = (acmlComplex*)malloc( commSize ) ) == NULL )
587 return ENOMEM;
590 // Initialize communication work array
591 ACML_FFT2DX( 100, 1.0f, TRUE, TRUE, nx, ny, NULL, 1, nx, NULL, 1, nx, (acmlComplex*)comm, &info );
593 if( info != 0 )
595 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
596 gmx_fft_destroy( fft );
597 return info;
600 fft->ndim = 2;
601 fft->nx = nx;
602 fft->ny = ny;
603 fft->real_fft = 0;
604 fft->comm[0] = comm;
606 *pfft = fft;
607 return 0;
611 gmx_fft_init_2d_real(gmx_fft_t * pfft,
612 int nx,
613 int ny,
614 enum gmx_fft_flag flags)
616 gmx_fft_t fft;
617 int info = 0;
618 acmlComplex* comm = NULL;
619 real* commRC = NULL;
620 real* commCR = NULL;
621 int commSize = 0;
622 int nyc = 0;
624 if(pfft==NULL)
626 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
627 return EINVAL;
629 *pfft = NULL;
631 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
633 return ENOMEM;
636 nyc = (ny/2 + 1);
638 /* Roll our own 2D real transform using multiple transforms in ACML,
639 * since the current ACML versions does not support our storage format,
640 * and all but the most recent don't even have 2D real FFTs.
643 // Single precision requires 5*nx+100
644 // Double precision requires 3*nx+100
645 if( sizeof( acmlComplex ) == 16 )
646 commSize = (3*nx+100)*sizeof( acmlComplex );
647 else
648 commSize = (5*nx+100)*sizeof( acmlComplex );
650 // Allocate communication work array
651 if( (comm = (acmlComplex*)malloc( commSize ) ) == NULL )
653 return ENOMEM;
656 // Initialize communication work array
657 ACML_FFT1DMX( 100, 1.0f, FALSE, nyc, nx, NULL, nyc, 1, NULL, nyc, 1, comm, &info );
659 if( info != 0 )
661 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
662 gmx_fft_destroy( fft );
663 return info;
666 commSize = (3*ny+100)*sizeof( real );
667 // Allocate communication work array
668 if( (commRC = (real*)malloc( commSize ) ) == NULL )
670 return ENOMEM;
673 // TODO: Is there no MODE or PLAN for multiple hermetian sequences?
674 // Initialize communication work array
675 ACML_RCFFT1D( 100, ny, NULL, commRC, &info );
677 if( info != 0 )
679 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
680 gmx_fft_destroy( fft );
681 return info;
684 commSize = (3*ny+100)*sizeof( real );
685 // Allocate communication work array
686 if( (commCR = (real*)malloc( commSize ) ) == NULL )
688 return ENOMEM;
691 // TODO: Is there no MODE or PLAN for multiple hermetian sequences?
692 // Initialize communication work array
693 ACML_CRFFT1D( 100, ny, NULL, commCR, &info );
695 if( info != 0 )
697 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
698 gmx_fft_destroy( fft );
699 return info;
702 /* Allocate scratch work array that ACML uses to splat from hermitian complex format to
703 * full complex format */
704 if( (fft->realScratch = (acmlComplex*)malloc( (nx*ny)*sizeof( acmlComplex ) ) ) == NULL )
706 return ENOMEM;
709 fft->ndim = 2;
710 fft->nx = nx;
711 fft->ny = ny;
712 fft->real_fft = 1;
713 fft->comm[0] = comm;
714 fft->comm[1] = commRC;
715 fft->comm[2] = commCR;
717 *pfft = fft;
718 return 0;
722 gmx_fft_init_3d(gmx_fft_t * pfft,
723 int nx,
724 int ny,
725 int nz,
726 enum gmx_fft_flag flags)
728 gmx_fft_t fft;
729 int info = 0;
730 acmlComplex* comm = NULL;
731 int commSize = 0;
734 if(pfft==NULL)
736 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
737 return EINVAL;
739 *pfft = NULL;
741 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
743 return ENOMEM;
746 commSize = (nx*ny*nz+4*(nx+ny+nz)+300)*sizeof( acmlComplex );
748 // Allocate communication work array
749 if( (comm = (acmlComplex*)malloc( commSize ) ) == NULL )
751 return ENOMEM;
754 ACML_FFT3DX( 100, 1.0f, TRUE, nx, ny, nz, NULL, 1, nx, nx*ny, NULL, 1, nx, nx*ny, comm, commSize, &info );
756 fft->ndim = 3;
757 fft->nx = nx;
758 fft->ny = ny;
759 fft->nz = nz;
760 fft->real_fft = 0;
761 fft->comm[0] = comm;
763 *pfft = fft;
764 return 0;
768 gmx_fft_init_3d_real(gmx_fft_t * pfft,
769 int nx,
770 int ny,
771 int nz,
772 enum gmx_fft_flag flags)
774 gmx_fft_t fft;
775 int info = 0;
776 acmlComplex* commX = NULL;
777 acmlComplex* commY = NULL;
778 real* commRC = NULL;
779 real* commCR = NULL;
780 int commSize = 0;
782 if(pfft==NULL)
784 gmx_fatal(FARGS,"Invalid opaque FFT datatype pointer.");
785 return EINVAL;
787 *pfft = NULL;
789 /* nzc = (nz/2 + 1); */
791 if( (fft = malloc(sizeof(struct gmx_fft))) == NULL)
793 return ENOMEM;
796 /* Roll our own 3D real transform using multiple transforms in ACML,
797 * since the current ACML versions does not support 2D
798 * or 3D real transforms.
801 /* In-place X FFT.
802 * ny*nz complex-to-complex transforms, length nx
803 * transform distance: 1
804 * element strides: ny*nz
807 /* Single precision requires 5*nx+100
808 Double precision requires 3*nx+100
810 if( sizeof( acmlComplex ) == 16 )
811 commSize = (3*nx+100)*sizeof( acmlComplex );
812 else
813 commSize = (5*nx+100)*sizeof( acmlComplex );
815 /* Allocate communication work array */
816 if( (commX = (acmlComplex*)malloc( commSize ) ) == NULL )
818 return ENOMEM;
821 /* Initialize communication work array */
822 ACML_FFT1DMX( 100, 1.0f, TRUE, ny*nz, nx, NULL, ny*nz, 1, NULL, ny*nz, 1, commX, &info );
824 if( info != 0 )
826 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
827 gmx_fft_destroy( fft );
828 return info;
831 /* In-place Y FFT.
832 * We cannot do all NX*NZ transforms at once, so define a handle to do
833 * NZ transforms, and then execute it NX times.
834 * nz complex-to-complex transforms, length ny
835 * transform distance: 1
836 * element strides: nz
838 /* Single precision requires 5*nx+100
839 Double precision requires 3*nx+100
841 if( sizeof( acmlComplex ) == 16 )
842 commSize = (3*ny+100)*sizeof( acmlComplex );
843 else
844 commSize = (5*ny+100)*sizeof( acmlComplex );
846 /* Allocate communication work array */
847 if( (commY = (acmlComplex*)malloc( commSize ) ) == NULL )
849 return ENOMEM;
852 /* Initialize communication work array */
853 /* We want to do multiple 1D FFT's in z-y plane, so we have to loop over x
854 * dimension recalculating z-y plane for each slice.
856 ACML_FFT1DMX( 100, 1.0f, TRUE, nz, ny, NULL, nz, 1, NULL, nz, 1, commY, &info );
858 if( info != 0 )
860 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
861 gmx_fft_destroy( fft );
862 return info;
865 /* In-place Z FFT:
866 * nx*ny real-to-complex transforms, length nz
867 * transform distance: nzc*2 -> nzc*2
868 * element strides: 1
871 commSize = (3*nz+100)*sizeof( real );
872 /* Allocate communication work array */
873 if( (commRC = (real*)malloc( commSize ) ) == NULL )
875 return ENOMEM;
878 /* TODO: Is there no MODE or PLAN for multiple hermetian sequences? */
879 // Initialize communication work array
880 ACML_RCFFT1D( 100, nz, NULL, commRC, &info );
883 if( info != 0 )
885 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
886 gmx_fft_destroy( fft );
887 return info;
890 /* Out-of-place complex-to-real (affects distance) Z FFT:
891 * nx*ny real-to-complex transforms, length nz
892 * transform distance: nzc*2 -> nz
893 * element STRIDES: 1
895 commSize = (3*nz+100)*sizeof( real );
896 /* Allocate communication work array */
897 if( (commCR = (real*)malloc( commSize ) ) == NULL )
899 return ENOMEM;
902 // Initialize communication work array
903 ACML_CRFFT1D( 100, nz, NULL, commCR, &info );
905 if( info != 0 )
907 gmx_fatal(FARGS,"Error initializing ACML FFT; status=%d", info);
908 gmx_fft_destroy( fft );
909 return info;
912 /* Allocate scratch work array that ACML uses to splat from hermitian complex format to
913 * full complex format */
914 if( (fft->realScratch = (acmlComplex*)malloc( (nx*ny*nz)*sizeof( acmlComplex ) ) ) == NULL )
916 return ENOMEM;
919 fft->ndim = 3;
920 fft->nx = nx;
921 fft->ny = ny;
922 fft->nz = nz;
923 fft->real_fft = 1;
924 fft->comm[0] = commX;
925 fft->comm[1] = commY;
926 fft->comm[2] = commRC;
927 fft->comm[3] = commCR;
929 *pfft = fft;
930 return 0;
934 gmx_fft_1d(gmx_fft_t fft,
935 enum gmx_fft_direction dir,
936 void * in_data,
937 void * out_data)
939 int inpl = (in_data == out_data);
940 int info = 0;
941 int mode = (dir == GMX_FFT_FORWARD) ? -1 : 1;
943 if( (fft->real_fft == 1) || (fft->ndim != 1) ||
944 ((dir != GMX_FFT_FORWARD) && (dir != GMX_FFT_BACKWARD)) )
946 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
947 return EINVAL;
950 ACML_FFT1DX( mode, 1.0f, inpl, fft->nx, in_data, 1, out_data, 1, fft->comm[0], &info );
952 if( info != 0 )
954 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
955 info = -1;
958 return info;
962 gmx_fft_1d_real(gmx_fft_t fft,
963 enum gmx_fft_direction dir,
964 void * inPtr,
965 void * outPtr)
967 int info = 0;
968 int nx = fft->nx;
970 if( (fft->real_fft != 1) || (fft->ndim != 1) ||
971 ((dir != GMX_FFT_REAL_TO_COMPLEX) && (dir != GMX_FFT_COMPLEX_TO_REAL)) )
973 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
974 return EINVAL;
977 /* I apply a correction scale to the automatic scaling that was done in the real-complex step */
978 float recipCorrection = sqrtf( nx );
980 /* ACML needs to do complex2complex intermediate transforms, which will not fit in the amount
981 * of memory allocated by the gromacs program, which assumes real. The real2complex transforms
982 * are also only in-place, so we manually do a memcpy() first */
983 if(dir==GMX_FFT_REAL_TO_COMPLEX)
985 memcpy( fft->realScratch, inPtr, nx*sizeof( real ) );
986 ACML_RCFFT1D( 1, nx, fft->realScratch, fft->comm[0], &info );
988 hermitianUnPacking( recipCorrection, 1, nx, fft->realScratch, 0, outPtr );
990 else
992 memcpy( fft->realScratch, inPtr, nx*sizeof( t_complex ) );
993 hermitianPacking( recipCorrection, 1, nx, fft->realScratch, outPtr, 0 );
995 ACML_CRFFT1D( 1, nx, outPtr, fft->comm[1], &info );
998 if( info != 0 )
1000 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
1001 info = -1;
1004 return info;
1008 gmx_fft_2d(gmx_fft_t fft,
1009 enum gmx_fft_direction dir,
1010 void * in_data,
1011 void * out_data)
1013 int inpl = (in_data == out_data);
1014 int info = 0;
1015 int mode = (dir == GMX_FFT_FORWARD) ? -1 : 1;
1017 if( (fft->real_fft == 1) || (fft->ndim != 2) ||
1018 ((dir != GMX_FFT_FORWARD) && (dir != GMX_FFT_BACKWARD)) )
1020 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
1021 return EINVAL;
1024 ACML_FFT2DX( mode, 1.0f, TRUE, inpl, fft->nx, fft->ny,
1025 in_data, 1, fft->nx, out_data, 1, fft->nx, fft->comm[0], &info );
1027 if( info != 0 )
1029 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
1030 info = -1;
1033 return info;
1038 gmx_fft_2d_real(gmx_fft_t fft,
1039 enum gmx_fft_direction dir,
1040 void * inPtr,
1041 void * outPtr )
1043 int inpl = (inPtr == outPtr);
1044 int info = 0;
1045 int i = 0;
1047 int nx = fft->nx;
1048 int ny = fft->ny;
1049 int nyc = (fft->ny/2 + 1);
1050 int nyLen = 0;
1052 /* Depending on whether we are calculating the FFT in place or not, the rows of the
1053 * 2D FFT will be packed or not.
1055 if( inpl )
1057 /* Not packed; 1 or 2 extra reals padding at end of each row */
1058 nyLen = nyc*2;
1060 else
1062 nyLen = ny;
1065 if( (fft->real_fft != 1) || (fft->ndim != 2) ||
1066 ((dir != GMX_FFT_REAL_TO_COMPLEX) && (dir != GMX_FFT_COMPLEX_TO_REAL)) )
1068 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
1069 return EINVAL;
1072 /* I apply a correction scale to the automatic scaling that was done in the real-complex step */
1073 float recipCorrection = sqrtf( ny );
1075 if(dir==GMX_FFT_REAL_TO_COMPLEX)
1077 /* ACML needs to do complex2complex intermediate transforms, which will not fit in the amount
1078 * of memory allocated by the gromacs program, which assumes real. The real2complex transforms
1079 * are also only in-place, so we manually do a memcpy() first */
1080 if( !inpl )
1082 memcpy( outPtr, inPtr, nx*ny*sizeof( real ) );
1085 /* real-to-complex in Y dimension, in-place
1086 * SCFFTM is not valid to call here. SCFFTM does not take any stride information, and assumes that
1087 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1088 * on even or odd lengths.
1090 for( i = 0; i < nx; ++i )
1092 if ( info == 0 )
1093 ACML_RCFFT1D( 1, ny, (real*)outPtr+i*nyLen, fft->comm[1], &info );
1096 hermitianUnPacking( 1.0f, nx, ny, outPtr, nyLen, fft->realScratch );
1098 /* complex-to-complex in X dimension */
1099 if ( info == 0 )
1100 ACML_FFT1DMX( -1, recipCorrection, FALSE, nyc, nx, fft->realScratch, nyc, 1,
1101 outPtr, nyc, 1, fft->comm[0], &info );
1103 else
1105 /* complex-to-complex in X dimension, in-place */
1106 ACML_FFT1DMX( 1, recipCorrection, FALSE, nyc, nx, inPtr, nyc, 1,
1107 fft->realScratch, nyc, 1, fft->comm[0], &info );
1109 hermitianPacking( 1.0f, nx, ny, fft->realScratch, outPtr, nyLen );
1111 /* complex-to-real in Y dimension
1112 * CSFFTM is not valid to call here. SCFFTM does not take any stride information, and assumes that
1113 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1114 * on even or odd lengths.
1116 if ( info == 0 )
1118 for( i = 0; i < nx; ++i )
1120 if ( info == 0 )
1121 ACML_CRFFT1D( 1, ny, (real*)outPtr+i*nyLen, fft->comm[2], &info );
1126 if( info != 0 )
1128 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
1129 info = -1;
1132 return info;
1137 gmx_fft_3d(gmx_fft_t fft,
1138 enum gmx_fft_direction dir,
1139 void * in_data,
1140 void * out_data)
1142 int mode = (dir == GMX_FFT_FORWARD) ? -1 : 1;
1143 int inpl = ( in_data == out_data );
1145 int commSize = 0;
1146 int nx = fft->nx;
1147 int ny = fft->ny;
1148 int nz = fft->nz;
1149 int info = 0;
1151 if( (fft->real_fft == 1) || (fft->ndim != 3) ||
1152 ((dir != GMX_FFT_FORWARD) && (dir != GMX_FFT_BACKWARD)) )
1154 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
1155 return EINVAL;
1158 commSize = (nx*ny*nz+4*(nx+ny+nz)+300)*sizeof( acmlComplex );
1160 ACML_FFT3DX( mode, 1.0f, inpl, nx, ny, nz, in_data, 1, nx, nx*ny,
1161 out_data, 1, nx, nx*ny, fft->comm[0], commSize, &info );
1163 if( info != 0 )
1165 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
1166 info = -1;
1169 return info;
1173 gmx_fft_3d_real(gmx_fft_t fft,
1174 enum gmx_fft_direction dir,
1175 void * inPtr,
1176 void * outPtr)
1178 int inpl = (inPtr == outPtr);
1179 int info = 0;
1180 int i;
1181 int nx,ny,nz, nzLen = 0;
1183 nx = fft->nx;
1184 ny = fft->ny;
1185 nz = fft->nz;
1186 int nzc = (nz/2 + 1);
1188 /* Depending on whether we are calculating the FFT in place or not, the rows of the
1189 * 3D FFT will be packed or not.
1191 if( inpl )
1193 /* Not packed; 1 or 2 extra reals padding at end of each row */
1194 nzLen = nzc*2;
1196 else
1198 nzLen = nz;
1201 if( (fft->real_fft != 1) || (fft->ndim != 3) ||
1202 ((dir != GMX_FFT_REAL_TO_COMPLEX) && (dir != GMX_FFT_COMPLEX_TO_REAL)) )
1204 gmx_fatal(FARGS,"FFT plan mismatch - bad plan or direction.");
1205 return EINVAL;
1208 /* I apply a correction scale to the automatic scaling that was done in the real-complex step */
1209 float recipCorrection = sqrtf( nz );
1211 if(dir==GMX_FFT_REAL_TO_COMPLEX)
1213 /* ACML needs to do complex2complex intermediate transforms, which will not fit in the amount
1214 * of memory allocated by the gromacs program, which assumes real. The real2complex transforms
1215 * are also only in-place, so we manually do a memcpy() first */
1216 if( !inpl )
1218 memcpy( outPtr, inPtr, nx*ny*nz*sizeof( real ) );
1221 /* real-to-complex in Z dimension, in-place
1222 * SCFFTM is not valid to call here. SCFFTM does not take any stride information, and assumes that
1223 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1224 * on even or odd lengths.
1226 for( i = 0; i < nx*ny; ++i )
1228 if ( info == 0 )
1229 ACML_RCFFT1D( 1, nz, (real*)outPtr+i*nzLen, fft->comm[2], &info );
1232 hermitianUnPacking( 1.0f, nx*ny, nz, outPtr, nzLen, fft->realScratch );
1234 /* complex-to-complex in Y dimension, in-place */
1235 for(i=0;i<nx;i++)
1237 if ( info == 0 )
1238 ACML_FFT1DMX( -1, 1.0f, TRUE, nzc, ny, (acmlComplex*)fft->realScratch+i*ny*nzc, nzc, 1,
1239 (acmlComplex*)fft->realScratch+i*ny*nzc, nzc, 1, fft->comm[1], &info );
1242 /* complex-to-complex in X dimension, in-place */
1243 if ( info == 0 )
1244 ACML_FFT1DMX( -1, recipCorrection, FALSE, ny*nzc, nx, fft->realScratch, ny*nzc, 1, outPtr, ny*nzc, 1, fft->comm[0], &info );
1246 else
1248 /* complex-to-complex in X dimension, from inPtr to work */
1249 ACML_FFT1DMX( 1, recipCorrection, FALSE, ny*nzc, nx, inPtr, ny*nzc, 1, fft->realScratch, ny*nzc, 1, fft->comm[0], &info );
1251 /* complex-to-complex in Y dimension, in-place */
1252 for( i=0; i < nx; i++ )
1254 if ( info == 0 )
1255 ACML_FFT1DMX( 1, 1.0f, TRUE, nzc, ny, (acmlComplex*)fft->realScratch+i*ny*nzc, nzc, 1,
1256 (acmlComplex*)fft->realScratch+i*ny*nzc, nzc, 1, fft->comm[1], &info );
1259 hermitianPacking( 1.0f, nx*ny, nz, fft->realScratch, outPtr, nzLen );
1261 /* complex-to-real in Z dimension, in-place
1262 * CSFFTM is not valid to call here. CSFFTM does not take any stride information, and assumes that
1263 * the rows are tightly packed. GROMACS pads rows with either 1 or 2 extra reals depending
1264 * on even or odd lengths.
1266 for( i = 0; i < nx*ny; ++i )
1268 if ( info == 0 )
1269 ACML_CRFFT1D( 1, nz, (real*)outPtr+i*nzLen, fft->comm[3], &info );
1274 if( info != 0 )
1276 gmx_fatal(FARGS,"Error executing AMD ACML FFT.");
1277 info = -1;
1280 return info;
1283 void
1284 gmx_fft_destroy(gmx_fft_t fft)
1286 int d;
1288 if(fft != NULL)
1290 for(d=0;d<4;d++)
1292 if(fft->comm[d] != NULL)
1294 free(fft->comm[d]);
1298 if( fft->realScratch != NULL)
1299 free( fft->realScratch );
1301 free( fft );
1305 #else
1307 gmx_fft_acml_empty;
1308 #endif /* GMX_FFT_ACML */