Merge pull request #113 from tesselode/fix-multi-targets
[wdl/wdl-ol.git] / WDL / fft.c
blob8b15d6ca35be23b480f46be171d2f7b4003e3383
1 /*
2 WDL - fft.cpp
3 Copyright (C) 2006 and later Cockos Incorporated
4 Copyright 1999 D. J. Bernstein
6 This software is provided 'as-is', without any express or implied
7 warranty. In no event will the authors be held liable for any damages
8 arising from the use of this software.
10 Permission is granted to anyone to use this software for any purpose,
11 including commercial applications, and to alter it and redistribute it
12 freely, subject to the following restrictions:
14 1. The origin of this software must not be misrepresented; you must not
15 claim that you wrote the original software. If you use this software
16 in a product, an acknowledgment in the product documentation would be
17 appreciated but is not required.
18 2. Altered source versions must be plainly marked as such, and must not be
19 misrepresented as being the original software.
20 3. This notice may not be removed or altered from any source distribution.
24 This file implements the WDL FFT library. These routines are based on the
25 DJBFFT library, which are Copyright 1999 D. J. Bernstein, djb@pobox.com
27 The DJB FFT web page is: http://cr.yp.to/djbfft.html
32 // this is based on djbfft
34 #include <math.h>
35 #include "fft.h"
38 #define FFT_MAXBITLEN 15
40 #ifdef _MSC_VER
41 #define inline __inline
42 #endif
44 #define PI 3.1415926535897932384626433832795
46 static WDL_FFT_COMPLEX d16[3];
47 static WDL_FFT_COMPLEX d32[7];
48 static WDL_FFT_COMPLEX d64[15];
49 static WDL_FFT_COMPLEX d128[31];
50 static WDL_FFT_COMPLEX d256[63];
51 static WDL_FFT_COMPLEX d512[127];
52 static WDL_FFT_COMPLEX d1024[127];
53 static WDL_FFT_COMPLEX d2048[255];
54 static WDL_FFT_COMPLEX d4096[511];
55 static WDL_FFT_COMPLEX d8192[1023];
56 static WDL_FFT_COMPLEX d16384[2047];
57 static WDL_FFT_COMPLEX d32768[4095];
60 #define sqrthalf (d16[1].re)
62 #define VOL *(volatile WDL_FFT_REAL *)&
64 #define TRANSFORM(a0,a1,a2,a3,wre,wim) { \
65 t6 = a2.re; \
66 t1 = a0.re - t6; \
67 t6 += a0.re; \
68 a0.re = t6; \
69 t3 = a3.im; \
70 t4 = a1.im - t3; \
71 t8 = t1 - t4; \
72 t1 += t4; \
73 t3 += a1.im; \
74 a1.im = t3; \
75 t5 = wre; \
76 t7 = t8 * t5; \
77 t4 = t1 * t5; \
78 t8 *= wim; \
79 t2 = a3.re; \
80 t3 = a1.re - t2; \
81 t2 += a1.re; \
82 a1.re = t2; \
83 t1 *= wim; \
84 t6 = a2.im; \
85 t2 = a0.im - t6; \
86 t6 += a0.im; \
87 a0.im = t6; \
88 t6 = t2 + t3; \
89 t2 -= t3; \
90 t3 = t6 * wim; \
91 t7 -= t3; \
92 a2.re = t7; \
93 t6 *= t5; \
94 t6 += t8; \
95 a2.im = t6; \
96 t5 *= t2; \
97 t5 -= t1; \
98 a3.im = t5; \
99 t2 *= wim; \
100 t4 += t2; \
101 a3.re = t4; \
104 #define TRANSFORMHALF(a0,a1,a2,a3) { \
105 t1 = a2.re; \
106 t5 = a0.re - t1; \
107 t1 += a0.re; \
108 a0.re = t1; \
109 t4 = a3.im; \
110 t8 = a1.im - t4; \
111 t1 = t5 - t8; \
112 t5 += t8; \
113 t4 += a1.im; \
114 a1.im = t4; \
115 t3 = a3.re; \
116 t7 = a1.re - t3; \
117 t3 += a1.re; \
118 a1.re = t3; \
119 t8 = a2.im; \
120 t6 = a0.im - t8; \
121 t2 = t6 + t7; \
122 t6 -= t7; \
123 t8 += a0.im; \
124 a0.im = t8; \
125 t4 = t6 + t5; \
126 t3 = sqrthalf; \
127 t4 *= t3; \
128 a3.re = t4; \
129 t6 -= t5; \
130 t6 *= t3; \
131 a3.im = t6; \
132 t7 = t1 - t2; \
133 t7 *= t3; \
134 a2.re = t7; \
135 t2 += t1; \
136 t2 *= t3; \
137 a2.im = t2; \
140 #define TRANSFORMZERO(a0,a1,a2,a3) { \
141 t5 = a2.re; \
142 t1 = a0.re - t5; \
143 t5 += a0.re; \
144 a0.re = t5; \
145 t8 = a3.im; \
146 t4 = a1.im - t8; \
147 t7 = a3.re; \
148 t6 = t1 - t4; \
149 a2.re = t6; \
150 t1 += t4; \
151 a3.re = t1; \
152 t8 += a1.im; \
153 a1.im = t8; \
154 t3 = a1.re - t7; \
155 t7 += a1.re; \
156 a1.re = t7; \
157 t6 = a2.im; \
158 t2 = a0.im - t6; \
159 t7 = t2 + t3; \
160 a2.im = t7; \
161 t2 -= t3; \
162 a3.im = t2; \
163 t6 += a0.im; \
164 a0.im = t6; \
167 #define UNTRANSFORM(a0,a1,a2,a3,wre,wim) { \
168 t6 = VOL wre; \
169 t1 = VOL a2.re; \
170 t1 *= t6; \
171 t8 = VOL wim; \
172 t3 = VOL a2.im; \
173 t3 *= t8; \
174 t2 = VOL a2.im; \
175 t4 = VOL a2.re; \
176 t5 = VOL a3.re; \
177 t5 *= t6; \
178 t7 = VOL a3.im; \
179 t1 += t3; \
180 t7 *= t8; \
181 t5 -= t7; \
182 t3 = t5 + t1; \
183 t5 -= t1; \
184 t2 *= t6; \
185 t6 *= a3.im; \
186 t4 *= t8; \
187 t2 -= t4; \
188 t8 *= a3.re; \
189 t6 += t8; \
190 t1 = a0.re - t3; \
191 t3 += a0.re; \
192 a0.re = t3; \
193 t7 = a1.im - t5; \
194 t5 += a1.im; \
195 a1.im = t5; \
196 t4 = t2 - t6; \
197 t6 += t2; \
198 t8 = a1.re - t4; \
199 t4 += a1.re; \
200 a1.re = t4; \
201 t2 = a0.im - t6; \
202 t6 += a0.im; \
203 a0.im = t6; \
204 a2.re = t1; \
205 a3.im = t7; \
206 a3.re = t8; \
207 a2.im = t2; \
211 #define UNTRANSFORMHALF(a0,a1,a2,a3) { \
212 t6 = sqrthalf; \
213 t1 = a2.re; \
214 t2 = a2.im - t1; \
215 t2 *= t6; \
216 t1 += a2.im; \
217 t1 *= t6; \
218 t4 = a3.im; \
219 t3 = a3.re - t4; \
220 t3 *= t6; \
221 t4 += a3.re; \
222 t4 *= t6; \
223 t8 = t3 - t1; \
224 t7 = t2 - t4; \
225 t1 += t3; \
226 t2 += t4; \
227 t4 = a1.im - t8; \
228 a3.im = t4; \
229 t8 += a1.im; \
230 a1.im = t8; \
231 t3 = a1.re - t7; \
232 a3.re = t3; \
233 t7 += a1.re; \
234 a1.re = t7; \
235 t5 = a0.re - t1; \
236 a2.re = t5; \
237 t1 += a0.re; \
238 a0.re = t1; \
239 t6 = a0.im - t2; \
240 a2.im = t6; \
241 t2 += a0.im; \
242 a0.im = t2; \
245 #define UNTRANSFORMZERO(a0,a1,a2,a3) { \
246 t2 = a3.im; \
247 t3 = a2.im - t2; \
248 t2 += a2.im; \
249 t1 = a2.re; \
250 t4 = a3.re - t1; \
251 t1 += a3.re; \
252 t5 = a0.re - t1; \
253 a2.re = t5; \
254 t6 = a0.im - t2; \
255 a2.im = t6; \
256 t7 = a1.re - t3; \
257 a3.re = t7; \
258 t8 = a1.im - t4; \
259 a3.im = t8; \
260 t1 += a0.re; \
261 a0.re = t1; \
262 t2 += a0.im; \
263 a0.im = t2; \
264 t3 += a1.re; \
265 a1.re = t3; \
266 t4 += a1.im; \
267 a1.im = t4; \
270 static void c2(register WDL_FFT_COMPLEX *a)
272 register WDL_FFT_REAL t1;
274 t1 = a[1].re;
275 a[1].re = a[0].re - t1;
276 a[0].re += t1;
278 t1 = a[1].im;
279 a[1].im = a[0].im - t1;
280 a[0].im += t1;
283 static inline void c4(register WDL_FFT_COMPLEX *a)
285 register WDL_FFT_REAL t1, t2, t3, t4, t5, t6, t7, t8;
287 t5 = a[2].re;
288 t1 = a[0].re - t5;
289 t7 = a[3].re;
290 t5 += a[0].re;
291 t3 = a[1].re - t7;
292 t7 += a[1].re;
293 t8 = t5 + t7;
294 a[0].re = t8;
295 t5 -= t7;
296 a[1].re = t5;
297 t6 = a[2].im;
298 t2 = a[0].im - t6;
299 t6 += a[0].im;
300 t5 = a[3].im;
301 a[2].im = t2 + t3;
302 t2 -= t3;
303 a[3].im = t2;
304 t4 = a[1].im - t5;
305 a[3].re = t1 + t4;
306 t1 -= t4;
307 a[2].re = t1;
308 t5 += a[1].im;
309 a[0].im = t6 + t5;
310 t6 -= t5;
311 a[1].im = t6;
314 static void c8(register WDL_FFT_COMPLEX *a)
316 register WDL_FFT_REAL t1, t2, t3, t4, t5, t6, t7, t8;
318 t7 = a[4].im;
319 t4 = a[0].im - t7;
320 t7 += a[0].im;
321 a[0].im = t7;
323 t8 = a[6].re;
324 t5 = a[2].re - t8;
325 t8 += a[2].re;
326 a[2].re = t8;
328 t7 = a[6].im;
329 a[6].im = t4 - t5;
330 t4 += t5;
331 a[4].im = t4;
333 t6 = a[2].im - t7;
334 t7 += a[2].im;
335 a[2].im = t7;
337 t8 = a[4].re;
338 t3 = a[0].re - t8;
339 t8 += a[0].re;
340 a[0].re = t8;
342 a[4].re = t3 - t6;
343 t3 += t6;
344 a[6].re = t3;
346 t7 = a[5].re;
347 t3 = a[1].re - t7;
348 t7 += a[1].re;
349 a[1].re = t7;
351 t8 = a[7].im;
352 t6 = a[3].im - t8;
353 t8 += a[3].im;
354 a[3].im = t8;
355 t1 = t3 - t6;
356 t3 += t6;
358 t7 = a[5].im;
359 t4 = a[1].im - t7;
360 t7 += a[1].im;
361 a[1].im = t7;
363 t8 = a[7].re;
364 t5 = a[3].re - t8;
365 t8 += a[3].re;
366 a[3].re = t8;
368 t2 = t4 - t5;
369 t4 += t5;
371 t6 = t1 - t4;
372 t8 = sqrthalf;
373 t6 *= t8;
374 a[5].re = a[4].re - t6;
375 t1 += t4;
376 t1 *= t8;
377 a[5].im = a[4].im - t1;
378 t6 += a[4].re;
379 a[4].re = t6;
380 t1 += a[4].im;
381 a[4].im = t1;
383 t5 = t2 - t3;
384 t5 *= t8;
385 a[7].im = a[6].im - t5;
386 t2 += t3;
387 t2 *= t8;
388 a[7].re = a[6].re - t2;
389 t2 += a[6].re;
390 a[6].re = t2;
391 t5 += a[6].im;
392 a[6].im = t5;
394 c4(a);
397 static void c16(register WDL_FFT_COMPLEX *a)
399 register WDL_FFT_REAL t1, t2, t3, t4, t5, t6, t7, t8;
401 TRANSFORMZERO(a[0],a[4],a[8],a[12]);
402 TRANSFORM(a[1],a[5],a[9],a[13],d16[0].re,d16[0].im);
403 TRANSFORMHALF(a[2],a[6],a[10],a[14]);
404 TRANSFORM(a[3],a[7],a[11],a[15],d16[0].im,d16[0].re);
405 c4(a + 8);
406 c4(a + 12);
408 c8(a);
411 /* a[0...8n-1], w[0...2n-2]; n >= 2 */
412 static void cpass(register WDL_FFT_COMPLEX *a,register const WDL_FFT_COMPLEX *w,register unsigned int n)
414 register WDL_FFT_REAL t1, t2, t3, t4, t5, t6, t7, t8;
415 register WDL_FFT_COMPLEX *a1;
416 register WDL_FFT_COMPLEX *a2;
417 register WDL_FFT_COMPLEX *a3;
419 a2 = a + 4 * n;
420 a1 = a + 2 * n;
421 a3 = a2 + 2 * n;
422 --n;
424 TRANSFORMZERO(a[0],a1[0],a2[0],a3[0]);
425 TRANSFORM(a[1],a1[1],a2[1],a3[1],w[0].re,w[0].im);
427 for (;;) {
428 TRANSFORM(a[2],a1[2],a2[2],a3[2],w[1].re,w[1].im);
429 TRANSFORM(a[3],a1[3],a2[3],a3[3],w[2].re,w[2].im);
430 if (!--n) break;
431 a += 2;
432 a1 += 2;
433 a2 += 2;
434 a3 += 2;
435 w += 2;
439 static void c32(register WDL_FFT_COMPLEX *a)
441 cpass(a,d32,4);
442 c8(a + 16);
443 c8(a + 24);
444 c16(a);
447 static void c64(register WDL_FFT_COMPLEX *a)
449 cpass(a,d64,8);
450 c16(a + 32);
451 c16(a + 48);
452 c32(a);
455 static void c128(register WDL_FFT_COMPLEX *a)
457 cpass(a,d128,16);
458 c32(a + 64);
459 c32(a + 96);
460 c64(a);
463 static void c256(register WDL_FFT_COMPLEX *a)
465 cpass(a,d256,32);
466 c64(a + 128);
467 c64(a + 192);
468 c128(a);
471 static void c512(register WDL_FFT_COMPLEX *a)
473 cpass(a,d512,64);
474 c128(a + 384);
475 c128(a + 256);
476 c256(a);
479 /* a[0...8n-1], w[0...n-2]; n even, n >= 4 */
480 static void cpassbig(register WDL_FFT_COMPLEX *a,register const WDL_FFT_COMPLEX *w,register unsigned int n)
482 register WDL_FFT_REAL t1, t2, t3, t4, t5, t6, t7, t8;
483 register WDL_FFT_COMPLEX *a1;
484 register WDL_FFT_COMPLEX *a2;
485 register WDL_FFT_COMPLEX *a3;
486 register unsigned int k;
488 a2 = a + 4 * n;
489 a1 = a + 2 * n;
490 a3 = a2 + 2 * n;
491 k = n - 2;
493 TRANSFORMZERO(a[0],a1[0],a2[0],a3[0]);
494 TRANSFORM(a[1],a1[1],a2[1],a3[1],w[0].re,w[0].im);
495 a += 2;
496 a1 += 2;
497 a2 += 2;
498 a3 += 2;
500 do {
501 TRANSFORM(a[0],a1[0],a2[0],a3[0],w[1].re,w[1].im);
502 TRANSFORM(a[1],a1[1],a2[1],a3[1],w[2].re,w[2].im);
503 a += 2;
504 a1 += 2;
505 a2 += 2;
506 a3 += 2;
507 w += 2;
508 } while (k -= 2);
510 TRANSFORMHALF(a[0],a1[0],a2[0],a3[0]);
511 TRANSFORM(a[1],a1[1],a2[1],a3[1],w[0].im,w[0].re);
512 a += 2;
513 a1 += 2;
514 a2 += 2;
515 a3 += 2;
517 k = n - 2;
518 do {
519 TRANSFORM(a[0],a1[0],a2[0],a3[0],w[-1].im,w[-1].re);
520 TRANSFORM(a[1],a1[1],a2[1],a3[1],w[-2].im,w[-2].re);
521 a += 2;
522 a1 += 2;
523 a2 += 2;
524 a3 += 2;
525 w -= 2;
526 } while (k -= 2);
530 static void c1024(register WDL_FFT_COMPLEX *a)
532 cpassbig(a,d1024,128);
533 c256(a + 768);
534 c256(a + 512);
535 c512(a);
538 static void c2048(register WDL_FFT_COMPLEX *a)
540 cpassbig(a,d2048,256);
541 c512(a + 1536);
542 c512(a + 1024);
543 c1024(a);
546 static void c4096(register WDL_FFT_COMPLEX *a)
548 cpassbig(a,d4096,512);
549 c1024(a + 3072);
550 c1024(a + 2048);
551 c2048(a);
554 static void c8192(register WDL_FFT_COMPLEX *a)
556 cpassbig(a,d8192,1024);
557 c2048(a + 6144);
558 c2048(a + 4096);
559 c4096(a);
562 static void c16384(register WDL_FFT_COMPLEX *a)
564 cpassbig(a,d16384,2048);
565 c4096(a + 8192 + 4096);
566 c4096(a + 8192);
567 c8192(a);
570 static void c32768(register WDL_FFT_COMPLEX *a)
572 cpassbig(a,d32768,4096);
573 c8192(a + 16384 + 8192);
574 c8192(a + 16384);
575 c16384(a);
579 /* n even, n > 0 */
580 void WDL_fft_complexmul(WDL_FFT_COMPLEX *a,WDL_FFT_COMPLEX *b,int n)
582 register WDL_FFT_REAL t1, t2, t3, t4, t5, t6, t7, t8;
583 if (n<2 || (n&1)) return;
585 do {
586 t1 = a[0].re * b[0].re;
587 t2 = a[0].im * b[0].im;
588 t3 = a[0].im * b[0].re;
589 t4 = a[0].re * b[0].im;
590 t5 = a[1].re * b[1].re;
591 t6 = a[1].im * b[1].im;
592 t7 = a[1].im * b[1].re;
593 t8 = a[1].re * b[1].im;
594 t1 -= t2;
595 t3 += t4;
596 t5 -= t6;
597 t7 += t8;
598 a[0].re = t1;
599 a[1].re = t5;
600 a[0].im = t3;
601 a[1].im = t7;
602 a += 2;
603 b += 2;
604 } while (n -= 2);
607 void WDL_fft_complexmul2(WDL_FFT_COMPLEX *c, WDL_FFT_COMPLEX *a, WDL_FFT_COMPLEX *b, int n)
609 register WDL_FFT_REAL t1, t2, t3, t4, t5, t6, t7, t8;
610 if (n<2 || (n&1)) return;
612 do {
613 t1 = a[0].re * b[0].re;
614 t2 = a[0].im * b[0].im;
615 t3 = a[0].im * b[0].re;
616 t4 = a[0].re * b[0].im;
617 t5 = a[1].re * b[1].re;
618 t6 = a[1].im * b[1].im;
619 t7 = a[1].im * b[1].re;
620 t8 = a[1].re * b[1].im;
621 t1 -= t2;
622 t3 += t4;
623 t5 -= t6;
624 t7 += t8;
625 c[0].re = t1;
626 c[1].re = t5;
627 c[0].im = t3;
628 c[1].im = t7;
629 a += 2;
630 b += 2;
631 c += 2;
632 } while (n -= 2);
634 void WDL_fft_complexmul3(WDL_FFT_COMPLEX *c, WDL_FFT_COMPLEX *a, WDL_FFT_COMPLEX *b, int n)
636 register WDL_FFT_REAL t1, t2, t3, t4, t5, t6, t7, t8;
637 if (n<2 || (n&1)) return;
639 do {
640 t1 = a[0].re * b[0].re;
641 t2 = a[0].im * b[0].im;
642 t3 = a[0].im * b[0].re;
643 t4 = a[0].re * b[0].im;
644 t5 = a[1].re * b[1].re;
645 t6 = a[1].im * b[1].im;
646 t7 = a[1].im * b[1].re;
647 t8 = a[1].re * b[1].im;
648 t1 -= t2;
649 t3 += t4;
650 t5 -= t6;
651 t7 += t8;
652 c[0].re += t1;
653 c[1].re += t5;
654 c[0].im += t3;
655 c[1].im += t7;
656 a += 2;
657 b += 2;
658 c += 2;
659 } while (n -= 2);
663 static inline void u4(register WDL_FFT_COMPLEX *a)
665 register WDL_FFT_REAL t1, t2, t3, t4, t5, t6, t7, t8;
667 t1 = VOL a[1].re;
668 t3 = a[0].re - t1;
669 t6 = VOL a[2].re;
670 t1 += a[0].re;
671 t8 = a[3].re - t6;
672 t6 += a[3].re;
673 a[0].re = t1 + t6;
674 t1 -= t6;
675 a[2].re = t1;
677 t2 = VOL a[1].im;
678 t4 = a[0].im - t2;
679 t2 += a[0].im;
680 t5 = VOL a[3].im;
681 a[1].im = t4 + t8;
682 t4 -= t8;
683 a[3].im = t4;
685 t7 = a[2].im - t5;
686 t5 += a[2].im;
687 a[1].re = t3 + t7;
688 t3 -= t7;
689 a[3].re = t3;
690 a[0].im = t2 + t5;
691 t2 -= t5;
692 a[2].im = t2;
695 static void u8(register WDL_FFT_COMPLEX *a)
697 register WDL_FFT_REAL t1, t2, t3, t4, t5, t6, t7, t8;
699 u4(a);
701 t1 = a[5].re;
702 a[5].re = a[4].re - t1;
703 t1 += a[4].re;
705 t3 = a[7].re;
706 a[7].re = a[6].re - t3;
707 t3 += a[6].re;
709 t8 = t3 - t1;
710 t1 += t3;
712 t6 = a[2].im - t8;
713 t8 += a[2].im;
714 a[2].im = t8;
716 t5 = a[0].re - t1;
717 a[4].re = t5;
718 t1 += a[0].re;
719 a[0].re = t1;
721 t2 = a[5].im;
722 a[5].im = a[4].im - t2;
723 t2 += a[4].im;
725 t4 = a[7].im;
726 a[7].im = a[6].im - t4;
727 t4 += a[6].im;
729 a[6].im = t6;
731 t7 = t2 - t4;
732 t2 += t4;
734 t3 = a[2].re - t7;
735 a[6].re = t3;
736 t7 += a[2].re;
737 a[2].re = t7;
739 t6 = a[0].im - t2;
740 a[4].im = t6;
741 t2 += a[0].im;
742 a[0].im = t2;
744 t6 = sqrthalf;
746 t1 = a[5].re;
747 t2 = a[5].im - t1;
748 t2 *= t6;
749 t1 += a[5].im;
750 t1 *= t6;
751 t4 = a[7].im;
752 t3 = a[7].re - t4;
753 t3 *= t6;
754 t4 += a[7].re;
755 t4 *= t6;
757 t8 = t3 - t1;
758 t1 += t3;
759 t7 = t2 - t4;
760 t2 += t4;
762 t4 = a[3].im - t8;
763 a[7].im = t4;
764 t5 = a[1].re - t1;
765 a[5].re = t5;
766 t3 = a[3].re - t7;
767 a[7].re = t3;
768 t6 = a[1].im - t2;
769 a[5].im = t6;
771 t8 += a[3].im;
772 a[3].im = t8;
773 t1 += a[1].re;
774 a[1].re = t1;
775 t7 += a[3].re;
776 a[3].re = t7;
777 t2 += a[1].im;
778 a[1].im = t2;
781 static void u16(register WDL_FFT_COMPLEX *a)
783 register WDL_FFT_REAL t1, t2, t3, t4, t5, t6, t7, t8;
785 u8(a);
786 u4(a + 8);
787 u4(a + 12);
789 UNTRANSFORMZERO(a[0],a[4],a[8],a[12]);
790 UNTRANSFORMHALF(a[2],a[6],a[10],a[14]);
791 UNTRANSFORM(a[1],a[5],a[9],a[13],d16[0].re,d16[0].im);
792 UNTRANSFORM(a[3],a[7],a[11],a[15],d16[0].im,d16[0].re);
795 /* a[0...8n-1], w[0...2n-2] */
796 static void upass(register WDL_FFT_COMPLEX *a,register const WDL_FFT_COMPLEX *w,register unsigned int n)
798 register WDL_FFT_REAL t1, t2, t3, t4, t5, t6, t7, t8;
799 register WDL_FFT_COMPLEX *a1;
800 register WDL_FFT_COMPLEX *a2;
801 register WDL_FFT_COMPLEX *a3;
803 a2 = a + 4 * n;
804 a1 = a + 2 * n;
805 a3 = a2 + 2 * n;
806 n -= 1;
808 UNTRANSFORMZERO(a[0],a1[0],a2[0],a3[0]);
809 UNTRANSFORM(a[1],a1[1],a2[1],a3[1],w[0].re,w[0].im);
811 for (;;) {
812 UNTRANSFORM(a[2],a1[2],a2[2],a3[2],w[1].re,w[1].im);
813 UNTRANSFORM(a[3],a1[3],a2[3],a3[3],w[2].re,w[2].im);
814 if (!--n) break;
815 a += 2;
816 a1 += 2;
817 a2 += 2;
818 a3 += 2;
819 w += 2;
823 static void u32(register WDL_FFT_COMPLEX *a)
825 u16(a);
826 u8(a + 16);
827 u8(a + 24);
828 upass(a,d32,4);
831 static void u64(register WDL_FFT_COMPLEX *a)
833 u32(a);
834 u16(a + 32);
835 u16(a + 48);
836 upass(a,d64,8);
839 static void u128(register WDL_FFT_COMPLEX *a)
841 u64(a);
842 u32(a + 64);
843 u32(a + 96);
844 upass(a,d128,16);
847 static void u256(register WDL_FFT_COMPLEX *a)
849 u128(a);
850 u64(a + 128);
851 u64(a + 192);
852 upass(a,d256,32);
855 static void u512(register WDL_FFT_COMPLEX *a)
857 u256(a);
858 u128(a + 256);
859 u128(a + 384);
860 upass(a,d512,64);
864 /* a[0...8n-1], w[0...n-2]; n even, n >= 4 */
865 static void upassbig(register WDL_FFT_COMPLEX *a,register const WDL_FFT_COMPLEX *w,register unsigned int n)
867 register WDL_FFT_REAL t1, t2, t3, t4, t5, t6, t7, t8;
868 register WDL_FFT_COMPLEX *a1;
869 register WDL_FFT_COMPLEX *a2;
870 register WDL_FFT_COMPLEX *a3;
871 register unsigned int k;
873 a2 = a + 4 * n;
874 a1 = a + 2 * n;
875 a3 = a2 + 2 * n;
876 k = n - 2;
878 UNTRANSFORMZERO(a[0],a1[0],a2[0],a3[0]);
879 UNTRANSFORM(a[1],a1[1],a2[1],a3[1],w[0].re,w[0].im);
880 a += 2;
881 a1 += 2;
882 a2 += 2;
883 a3 += 2;
885 do {
886 UNTRANSFORM(a[0],a1[0],a2[0],a3[0],w[1].re,w[1].im);
887 UNTRANSFORM(a[1],a1[1],a2[1],a3[1],w[2].re,w[2].im);
888 a += 2;
889 a1 += 2;
890 a2 += 2;
891 a3 += 2;
892 w += 2;
893 } while (k -= 2);
895 UNTRANSFORMHALF(a[0],a1[0],a2[0],a3[0]);
896 UNTRANSFORM(a[1],a1[1],a2[1],a3[1],w[0].im,w[0].re);
897 a += 2;
898 a1 += 2;
899 a2 += 2;
900 a3 += 2;
902 k = n - 2;
903 do {
904 UNTRANSFORM(a[0],a1[0],a2[0],a3[0],w[-1].im,w[-1].re);
905 UNTRANSFORM(a[1],a1[1],a2[1],a3[1],w[-2].im,w[-2].re);
906 a += 2;
907 a1 += 2;
908 a2 += 2;
909 a3 += 2;
910 w -= 2;
911 } while (k -= 2);
916 static void u1024(register WDL_FFT_COMPLEX *a)
918 u512(a);
919 u256(a + 512);
920 u256(a + 768);
921 upassbig(a,d1024,128);
924 static void u2048(register WDL_FFT_COMPLEX *a)
926 u1024(a);
927 u512(a + 1024);
928 u512(a + 1536);
929 upassbig(a,d2048,256);
933 static void u4096(register WDL_FFT_COMPLEX *a)
935 u2048(a);
936 u1024(a + 2048);
937 u1024(a + 3072);
938 upassbig(a,d4096,512);
941 static void u8192(register WDL_FFT_COMPLEX *a)
943 u4096(a);
944 u2048(a + 4096);
945 u2048(a + 6144);
946 upassbig(a,d8192,1024);
949 static void u16384(register WDL_FFT_COMPLEX *a)
951 u8192(a);
952 u4096(a + 8192);
953 u4096(a + 8192 + 4096);
954 upassbig(a,d16384,2048);
957 static void u32768(register WDL_FFT_COMPLEX *a)
959 u16384(a);
960 u8192(a + 16384);
961 u8192(a + 16384 + 8192 );
962 upassbig(a,d32768,4096);
966 static void __fft_gen(WDL_FFT_COMPLEX *buf, const WDL_FFT_COMPLEX *buf2, int sz, int isfull)
968 int x;
969 double div=PI*0.25/(sz+1);
971 if (isfull) div*=2.0;
973 for (x = 0; x < sz; x ++)
975 if (!(x & 1) || !buf2)
977 buf[x].re = (WDL_FFT_REAL) cos((x+1)*div);
978 buf[x].im = (WDL_FFT_REAL) sin((x+1)*div);
980 else
982 buf[x].re = buf2[x >> 1].re;
983 buf[x].im = buf2[x >> 1].im;
988 #ifndef WDL_FFT_NO_PERMUTE
990 static unsigned int fftfreq_c(unsigned int i,unsigned int n)
992 unsigned int m;
994 if (n <= 2) return i;
996 m = n >> 1;
997 if (i < m) return fftfreq_c(i,m) << 1;
999 i -= m;
1000 m >>= 1;
1001 if (i < m) return (fftfreq_c(i,m) << 2) + 1;
1002 i -= m;
1003 return ((fftfreq_c(i,m) << 2) - 1) & (n - 1);
1006 static int _idxperm[2<<FFT_MAXBITLEN];
1008 static void idx_perm_calc(int offs, int n)
1010 int i, j;
1011 _idxperm[offs] = 0;
1012 for (i = 1; i < n; ++i) {
1013 j = fftfreq_c(i, n);
1014 _idxperm[offs+n-j] = i;
1018 int WDL_fft_permute(int fftsize, int idx)
1020 return _idxperm[fftsize+idx-2];
1022 int *WDL_fft_permute_tab(int fftsize)
1024 return _idxperm + fftsize - 2;
1028 #endif
1030 void WDL_fft_init()
1032 static int ffttabinit;
1033 if (!ffttabinit)
1035 int i, offs;
1036 ffttabinit=1;
1038 #define fft_gen(x,y,z) __fft_gen(x,y,sizeof(x)/sizeof(x[0]),z)
1039 fft_gen(d16,0,1);
1040 fft_gen(d32,d16,1);
1041 fft_gen(d64,d32,1);
1042 fft_gen(d128,d64,1);
1043 fft_gen(d256,d128,1);
1044 fft_gen(d512,d256,1);
1045 fft_gen(d1024,d512,0);
1046 fft_gen(d2048,d1024,0);
1047 fft_gen(d4096,d2048,0);
1048 fft_gen(d8192,d4096,0);
1049 fft_gen(d16384,d8192,0);
1050 fft_gen(d32768,d16384,0);
1051 #undef fft_gen
1053 #ifndef WDL_FFT_NO_PERMUTE
1054 offs = 0;
1055 for (i = 2; i <= 32768; i *= 2)
1057 idx_perm_calc(offs, i);
1058 offs += i;
1060 #endif
1065 void WDL_fft(WDL_FFT_COMPLEX *buf, int len, int isInverse)
1067 switch (len)
1069 case 2: c2(buf); break;
1070 #define TMP(x) case x: if (!isInverse) c##x(buf); else u##x(buf); break;
1071 TMP(4)
1072 TMP(8)
1073 TMP(16)
1074 TMP(32)
1075 TMP(64)
1076 TMP(128)
1077 TMP(256)
1078 TMP(512)
1079 TMP(1024)
1080 TMP(2048)
1081 TMP(4096)
1082 TMP(8192)
1083 TMP(16384)
1084 TMP(32768)
1085 #undef TMP
1089 static inline void r2(register WDL_FFT_REAL *a)
1091 register WDL_FFT_REAL t1, t2;
1093 t1 = a[0] + a[1];
1094 t2 = a[0] - a[1];
1095 a[0] = t1 * 2;
1096 a[1] = t2 * 2;
1099 static inline void v2(register WDL_FFT_REAL *a)
1101 register WDL_FFT_REAL t1, t2;
1103 t1 = a[0] + a[1];
1104 t2 = a[0] - a[1];
1105 a[0] = t1;
1106 a[1] = t2;
1109 static void two_for_one(WDL_FFT_REAL* buf, const WDL_FFT_COMPLEX *d, int len, int isInverse)
1111 const unsigned int half = (unsigned)len >> 1, quart = half >> 1, eighth = quart >> 1;
1112 const int *permute = WDL_fft_permute_tab(half);
1113 unsigned int i, j;
1115 WDL_FFT_COMPLEX *p, *q, tw, sum, diff;
1116 WDL_FFT_REAL tw1, tw2;
1118 if (!isInverse)
1120 WDL_fft((WDL_FFT_COMPLEX*)buf, half, isInverse);
1121 r2(buf);
1123 else
1125 v2(buf);
1128 /* Source: http://www.katjaas.nl/realFFT/realFFT2.html */
1130 for (i = 1; i < quart; ++i)
1132 p = (WDL_FFT_COMPLEX*)buf + permute[i];
1133 q = (WDL_FFT_COMPLEX*)buf + permute[half - i];
1135 /* tw.re = cos(2*PI * i / len);
1136 tw.im = sin(2*PI * i / len); */
1138 if (i < eighth)
1140 j = i - 1;
1141 tw.re = d[j].re;
1142 tw.im = d[j].im;
1144 else if (i > eighth)
1146 j = quart - i - 1;
1147 tw.re = d[j].im;
1148 tw.im = d[j].re;
1150 else
1152 tw.re = tw.im = sqrthalf;
1155 if (!isInverse) tw.re = -tw.re;
1157 sum.re = p->re + q->re;
1158 sum.im = p->im + q->im;
1159 diff.re = p->re - q->re;
1160 diff.im = p->im - q->im;
1162 tw1 = tw.re * sum.im + tw.im * diff.re;
1163 tw2 = tw.im * sum.im - tw.re * diff.re;
1165 p->re = sum.re - tw1;
1166 p->im = diff.im - tw2;
1167 q->re = sum.re + tw1;
1168 q->im = -(diff.im + tw2);
1171 p = (WDL_FFT_COMPLEX*)buf + permute[i];
1172 p->re *= 2;
1173 p->im *= -2;
1175 if (isInverse) WDL_fft((WDL_FFT_COMPLEX*)buf, half, isInverse);
1178 void WDL_real_fft(WDL_FFT_REAL* buf, int len, int isInverse)
1180 switch (len)
1182 case 2: if (!isInverse) r2(buf); else v2(buf); break;
1183 case 4: case 8: two_for_one(buf, 0, len, isInverse); break;
1184 #define TMP(x) case x: two_for_one(buf, d##x, len, isInverse); break;
1185 TMP(16)
1186 TMP(32)
1187 TMP(64)
1188 TMP(128)
1189 TMP(256)
1190 TMP(512)
1191 TMP(1024)
1192 TMP(2048)
1193 TMP(4096)
1194 TMP(8192)
1195 TMP(16384)
1196 TMP(32768)
1197 #undef TMP