synced with 1.183
[mplayer/greg.git] / libfaad2 / cfft.c
blob6a1b005fc399c30b2d8d42113b3d91450f0bca78
1 /*
2 ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
3 ** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
4 **
5 ** This program is free software; you can redistribute it and/or modify
6 ** it under the terms of the GNU General Public License as published by
7 ** the Free Software Foundation; either version 2 of the License, or
8 ** (at your option) any later version.
9 **
10 ** This program is distributed in the hope that it will be useful,
11 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ** GNU General Public License for more details.
15 ** You should have received a copy of the GNU General Public License
16 ** along with this program; if not, write to the Free Software
17 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 ** Any non-GPL usage of this software or parts of this software is strictly
20 ** forbidden.
22 ** Commercial non-GPL licensing of this software is possible.
23 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
25 ** $Id: cfft.c,v 1.27 2004/06/30 12:45:55 menno Exp $
26 **/
29 * Algorithmically based on Fortran-77 FFTPACK
30 * by Paul N. Swarztrauber(Version 4, 1985).
32 * Does even sized fft only
35 /* isign is +1 for backward and -1 for forward transforms */
37 #include "common.h"
38 #include "structs.h"
40 #include <stdlib.h>
42 #include "cfft.h"
43 #include "cfft_tab.h"
46 /* static function declarations */
47 #ifdef USE_SSE
48 static void passf2pos_sse(const uint16_t l1, const complex_t *cc,
49 complex_t *ch, const complex_t *wa);
50 static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc,
51 complex_t *ch, const complex_t *wa);
52 static void passf4pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
53 const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);
54 #endif
55 static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,
56 complex_t *ch, const complex_t *wa);
57 static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,
58 complex_t *ch, const complex_t *wa);
59 static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc,
60 complex_t *ch, const complex_t *wa1, const complex_t *wa2, const int8_t isign);
61 static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
62 const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);
63 static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
64 const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);
65 static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
66 const complex_t *wa1, const complex_t *wa2, const complex_t *wa3,
67 const complex_t *wa4, const int8_t isign);
68 INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch,
69 const uint16_t *ifac, const complex_t *wa, const int8_t isign);
70 static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac);
73 /*----------------------------------------------------------------------
74 passf2, passf3, passf4, passf5. Complex FFT passes fwd and bwd.
75 ----------------------------------------------------------------------*/
77 #if 0 //def USE_SSE
78 static void passf2pos_sse(const uint16_t l1, const complex_t *cc,
79 complex_t *ch, const complex_t *wa)
81 uint16_t k, ah, ac;
83 for (k = 0; k < l1; k++)
85 ah = 2*k;
86 ac = 4*k;
88 RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]);
89 IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]);
91 RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
92 IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
96 static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc,
97 complex_t *ch, const complex_t *wa)
99 uint16_t i, k, ah, ac;
101 for (k = 0; k < l1; k++)
103 ah = k*ido;
104 ac = 2*k*ido;
106 for (i = 0; i < ido; i+=4)
108 __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14;
109 __m128 m15, m16, m17, m18, m19, m20, m21, m22, m23, m24;
110 __m128 w1, w2, w3, w4;
112 m1 = _mm_load_ps(&RE(cc[ac+i]));
113 m2 = _mm_load_ps(&RE(cc[ac+ido+i]));
114 m5 = _mm_load_ps(&RE(cc[ac+i+2]));
115 m6 = _mm_load_ps(&RE(cc[ac+ido+i+2]));
116 w1 = _mm_load_ps(&RE(wa[i]));
117 w3 = _mm_load_ps(&RE(wa[i+2]));
119 m3 = _mm_add_ps(m1, m2);
120 m15 = _mm_add_ps(m5, m6);
122 m4 = _mm_sub_ps(m1, m2);
123 m16 = _mm_sub_ps(m5, m6);
125 _mm_store_ps(&RE(ch[ah+i]), m3);
126 _mm_store_ps(&RE(ch[ah+i+2]), m15);
129 w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
130 w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));
132 m7 = _mm_mul_ps(m4, w1);
133 m17 = _mm_mul_ps(m16, w3);
134 m8 = _mm_mul_ps(m4, w2);
135 m18 = _mm_mul_ps(m16, w4);
137 m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
138 m19 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(2, 0, 2, 0));
139 m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
140 m20 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(3, 1, 3, 1));
142 m11 = _mm_add_ps(m9, m10);
143 m21 = _mm_add_ps(m19, m20);
144 m12 = _mm_sub_ps(m9, m10);
145 m22 = _mm_sub_ps(m19, m20);
147 m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
148 m23 = _mm_shuffle_ps(m21, m21, _MM_SHUFFLE(0, 0, 3, 2));
150 m14 = _mm_unpacklo_ps(m12, m13);
151 m24 = _mm_unpacklo_ps(m22, m23);
153 _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
154 _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), m24);
158 #endif
160 static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,
161 complex_t *ch, const complex_t *wa)
163 uint16_t i, k, ah, ac;
165 if (ido == 1)
167 for (k = 0; k < l1; k++)
169 ah = 2*k;
170 ac = 4*k;
172 RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]);
173 RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
174 IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]);
175 IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
177 } else {
178 for (k = 0; k < l1; k++)
180 ah = k*ido;
181 ac = 2*k*ido;
183 for (i = 0; i < ido; i++)
185 complex_t t2;
187 RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]);
188 RE(t2) = RE(cc[ac+i]) - RE(cc[ac+i+ido]);
190 IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);
191 IM(t2) = IM(cc[ac+i]) - IM(cc[ac+i+ido]);
193 #if 1
194 ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
195 IM(t2), RE(t2), RE(wa[i]), IM(wa[i]));
196 #else
197 ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
198 RE(t2), IM(t2), RE(wa[i]), IM(wa[i]));
199 #endif
205 static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,
206 complex_t *ch, const complex_t *wa)
208 uint16_t i, k, ah, ac;
210 if (ido == 1)
212 for (k = 0; k < l1; k++)
214 ah = 2*k;
215 ac = 4*k;
217 RE(ch[ah]) = RE(cc[ac]) + RE(cc[ac+1]);
218 RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
219 IM(ch[ah]) = IM(cc[ac]) + IM(cc[ac+1]);
220 IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
222 } else {
223 for (k = 0; k < l1; k++)
225 ah = k*ido;
226 ac = 2*k*ido;
228 for (i = 0; i < ido; i++)
230 complex_t t2;
232 RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]);
233 RE(t2) = RE(cc[ac+i]) - RE(cc[ac+i+ido]);
235 IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);
236 IM(t2) = IM(cc[ac+i]) - IM(cc[ac+i+ido]);
238 #if 1
239 ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
240 RE(t2), IM(t2), RE(wa[i]), IM(wa[i]));
241 #else
242 ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
243 IM(t2), RE(t2), RE(wa[i]), IM(wa[i]));
244 #endif
251 static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc,
252 complex_t *ch, const complex_t *wa1, const complex_t *wa2,
253 const int8_t isign)
255 static real_t taur = FRAC_CONST(-0.5);
256 static real_t taui = FRAC_CONST(0.866025403784439);
257 uint16_t i, k, ac, ah;
258 complex_t c2, c3, d2, d3, t2;
260 if (ido == 1)
262 if (isign == 1)
264 for (k = 0; k < l1; k++)
266 ac = 3*k+1;
267 ah = k;
269 RE(t2) = RE(cc[ac]) + RE(cc[ac+1]);
270 IM(t2) = IM(cc[ac]) + IM(cc[ac+1]);
271 RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),taur);
272 IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),taur);
274 RE(ch[ah]) = RE(cc[ac-1]) + RE(t2);
275 IM(ch[ah]) = IM(cc[ac-1]) + IM(t2);
277 RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+1])), taui);
278 IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+1])), taui);
280 RE(ch[ah+l1]) = RE(c2) - IM(c3);
281 IM(ch[ah+l1]) = IM(c2) + RE(c3);
282 RE(ch[ah+2*l1]) = RE(c2) + IM(c3);
283 IM(ch[ah+2*l1]) = IM(c2) - RE(c3);
285 } else {
286 for (k = 0; k < l1; k++)
288 ac = 3*k+1;
289 ah = k;
291 RE(t2) = RE(cc[ac]) + RE(cc[ac+1]);
292 IM(t2) = IM(cc[ac]) + IM(cc[ac+1]);
293 RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),taur);
294 IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),taur);
296 RE(ch[ah]) = RE(cc[ac-1]) + RE(t2);
297 IM(ch[ah]) = IM(cc[ac-1]) + IM(t2);
299 RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+1])), taui);
300 IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+1])), taui);
302 RE(ch[ah+l1]) = RE(c2) + IM(c3);
303 IM(ch[ah+l1]) = IM(c2) - RE(c3);
304 RE(ch[ah+2*l1]) = RE(c2) - IM(c3);
305 IM(ch[ah+2*l1]) = IM(c2) + RE(c3);
308 } else {
309 if (isign == 1)
311 for (k = 0; k < l1; k++)
313 for (i = 0; i < ido; i++)
315 ac = i + (3*k+1)*ido;
316 ah = i + k * ido;
318 RE(t2) = RE(cc[ac]) + RE(cc[ac+ido]);
319 RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),taur);
320 IM(t2) = IM(cc[ac]) + IM(cc[ac+ido]);
321 IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),taur);
323 RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2);
324 IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2);
326 RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+ido])), taui);
327 IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+ido])), taui);
329 RE(d2) = RE(c2) - IM(c3);
330 IM(d3) = IM(c2) - RE(c3);
331 RE(d3) = RE(c2) + IM(c3);
332 IM(d2) = IM(c2) + RE(c3);
334 #if 1
335 ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
336 IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
337 ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
338 IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
339 #else
340 ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
341 RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
342 ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
343 RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
344 #endif
347 } else {
348 for (k = 0; k < l1; k++)
350 for (i = 0; i < ido; i++)
352 ac = i + (3*k+1)*ido;
353 ah = i + k * ido;
355 RE(t2) = RE(cc[ac]) + RE(cc[ac+ido]);
356 RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),taur);
357 IM(t2) = IM(cc[ac]) + IM(cc[ac+ido]);
358 IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),taur);
360 RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2);
361 IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2);
363 RE(c3) = MUL_F((RE(cc[ac]) - RE(cc[ac+ido])), taui);
364 IM(c3) = MUL_F((IM(cc[ac]) - IM(cc[ac+ido])), taui);
366 RE(d2) = RE(c2) + IM(c3);
367 IM(d3) = IM(c2) + RE(c3);
368 RE(d3) = RE(c2) - IM(c3);
369 IM(d2) = IM(c2) - RE(c3);
371 #if 1
372 ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
373 RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
374 ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
375 RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
376 #else
377 ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
378 IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
379 ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
380 IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
381 #endif
388 #ifdef USE_SSE
389 ALIGN static const int32_t negate[4] = { 0x0, 0x0, 0x0, 0x80000000 };
391 __declspec(naked) static void passf4pos_sse(const uint16_t l1, const complex_t *cc,
392 complex_t *ch, const complex_t *wa1, const complex_t *wa2,
393 const complex_t *wa3)
395 __asm {
396 push ebx
397 mov ebx, esp
398 and esp, -16
399 push edi
400 push esi
401 sub esp, 8
402 movzx edi, WORD PTR [ebx+8]
404 movaps xmm1, XMMWORD PTR negate
406 test edi, edi
407 jle l1_is_zero
409 lea esi, DWORD PTR [edi+edi]
410 add esi, esi
411 sub esi, edi
412 add esi, esi
413 add esi, esi
414 add esi, esi
415 mov eax, DWORD PTR [ebx+16]
416 add esi, eax
417 lea ecx, DWORD PTR [edi+edi]
418 add ecx, ecx
419 add ecx, ecx
420 add ecx, ecx
421 add ecx, eax
422 lea edx, DWORD PTR [edi+edi]
423 add edx, edx
424 add edx, edx
425 add edx, eax
426 xor eax, eax
427 mov DWORD PTR [esp], ebp
428 mov ebp, DWORD PTR [ebx+12]
430 fftloop:
431 lea edi, DWORD PTR [eax+eax]
432 add edi, edi
433 movaps xmm2, XMMWORD PTR [ebp+edi*8]
434 movaps xmm0, XMMWORD PTR [ebp+edi*8+16]
435 movaps xmm7, XMMWORD PTR [ebp+edi*8+32]
436 movaps xmm5, XMMWORD PTR [ebp+edi*8+48]
437 movaps xmm6, xmm2
438 addps xmm6, xmm0
439 movaps xmm4, xmm1
440 xorps xmm4, xmm7
441 movaps xmm3, xmm1
442 xorps xmm3, xmm5
443 xorps xmm2, xmm1
444 xorps xmm0, xmm1
445 addps xmm7, xmm5
446 subps xmm2, xmm0
447 movaps xmm0, xmm6
448 shufps xmm0, xmm7, 68
449 subps xmm4, xmm3
450 shufps xmm6, xmm7, 238
451 movaps xmm5, xmm2
452 shufps xmm5, xmm4, 68
453 movaps xmm3, xmm0
454 addps xmm3, xmm6
455 shufps xmm2, xmm4, 187
456 subps xmm0, xmm6
457 movaps xmm4, xmm5
458 addps xmm4, xmm2
459 mov edi, DWORD PTR [ebx+16]
460 movaps XMMWORD PTR [edi+eax*8], xmm3
461 subps xmm5, xmm2
462 movaps XMMWORD PTR [edx+eax*8], xmm4
463 movaps XMMWORD PTR [ecx+eax*8], xmm0
464 movaps XMMWORD PTR [esi+eax*8], xmm5
465 add eax, 2
466 movzx eax, ax
467 movzx edi, WORD PTR [ebx+8]
468 cmp eax, edi
469 jl fftloop
471 mov ebp, DWORD PTR [esp]
473 l1_is_zero:
475 add esp, 8
476 pop esi
477 pop edi
478 mov esp, ebx
479 pop ebx
483 #endif
485 #if 0
486 static void passf4pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc,
487 complex_t *ch, const complex_t *wa1, const complex_t *wa2,
488 const complex_t *wa3)
490 uint16_t i, k, ac, ah;
492 for (k = 0; k < l1; k++)
494 ac = 4*k*ido;
495 ah = k*ido;
497 for (i = 0; i < ido; i+=2)
499 __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15, m16;
500 __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, m17, m18, m19, m20, m21, m22, m23;
501 __m128 w1, w2, w3, w4, w5, w6, m24, m25, m26, m27, m28, m29, m30;
502 __m128 neg1 = _mm_set_ps(-1.0, 1.0, -1.0, 1.0);
504 m1 = _mm_load_ps(&RE(cc[ac+i]));
505 m2 = _mm_load_ps(&RE(cc[ac+i+2*ido]));
506 m3 = _mm_add_ps(m1, m2);
507 m4 = _mm_sub_ps(m1, m2);
509 n1 = _mm_load_ps(&RE(cc[ac+i+ido]));
510 n2 = _mm_load_ps(&RE(cc[ac+i+3*ido]));
511 n3 = _mm_add_ps(n1, n2);
513 n4 = _mm_mul_ps(neg1, n1);
514 n5 = _mm_mul_ps(neg1, n2);
515 n6 = _mm_sub_ps(n4, n5);
517 m5 = _mm_add_ps(m3, n3);
519 n7 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2, 3, 0, 1));
520 n8 = _mm_add_ps(m4, n7);
522 m6 = _mm_sub_ps(m3, n3);
523 n9 = _mm_sub_ps(m4, n7);
525 _mm_store_ps(&RE(ch[ah+i]), m5);
527 #if 0
528 static INLINE void ComplexMult(real_t *y1, real_t *y2,
529 real_t x1, real_t x2, real_t c1, real_t c2)
531 *y1 = MUL_F(x1, c1) + MUL_F(x2, c2);
532 *y2 = MUL_F(x2, c1) - MUL_F(x1, c2);
535 m7.0 = RE(c2)*RE(wa1[i])
536 m7.1 = IM(c2)*IM(wa1[i])
537 m7.2 = RE(c6)*RE(wa1[i+1])
538 m7.3 = IM(c6)*IM(wa1[i+1])
540 m8.0 = RE(c2)*IM(wa1[i])
541 m8.1 = IM(c2)*RE(wa1[i])
542 m8.2 = RE(c6)*IM(wa1[i+1])
543 m8.3 = IM(c6)*RE(wa1[i+1])
545 RE(0) = m7.0 - m7.1
546 IM(0) = m8.0 + m8.1
547 RE(1) = m7.2 - m7.3
548 IM(1) = m8.2 + m8.3
550 ////
551 RE(0) = RE(c2)*RE(wa1[i]) - IM(c2)*IM(wa1[i])
552 IM(0) = RE(c2)*IM(wa1[i]) + IM(c2)*RE(wa1[i])
553 RE(1) = RE(c6)*RE(wa1[i+1]) - IM(c6)*IM(wa1[i+1])
554 IM(1) = RE(c6)*IM(wa1[i+1]) + IM(c6)*RE(wa1[i+1])
555 #endif
557 w1 = _mm_load_ps(&RE(wa1[i]));
558 w3 = _mm_load_ps(&RE(wa2[i]));
559 w5 = _mm_load_ps(&RE(wa3[i]));
561 w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
562 w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));
563 w6 = _mm_shuffle_ps(w5, w5, _MM_SHUFFLE(2, 3, 0, 1));
565 m7 = _mm_mul_ps(n8, w1);
566 m15 = _mm_mul_ps(m6, w3);
567 m23 = _mm_mul_ps(n9, w5);
568 m8 = _mm_mul_ps(n8, w2);
569 m16 = _mm_mul_ps(m6, w4);
570 m24 = _mm_mul_ps(n9, w6);
572 m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
573 m17 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(2, 0, 2, 0));
574 m25 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(2, 0, 2, 0));
575 m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
576 m18 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(3, 1, 3, 1));
577 m26 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(3, 1, 3, 1));
579 m11 = _mm_add_ps(m9, m10);
580 m19 = _mm_add_ps(m17, m18);
581 m27 = _mm_add_ps(m25, m26);
582 m12 = _mm_sub_ps(m9, m10);
583 m20 = _mm_sub_ps(m17, m18);
584 m28 = _mm_sub_ps(m25, m26);
586 m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
587 m21 = _mm_shuffle_ps(m19, m19, _MM_SHUFFLE(0, 0, 3, 2));
588 m29 = _mm_shuffle_ps(m27, m27, _MM_SHUFFLE(0, 0, 3, 2));
589 m14 = _mm_unpacklo_ps(m12, m13);
590 m22 = _mm_unpacklo_ps(m20, m21);
591 m30 = _mm_unpacklo_ps(m28, m29);
593 _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
594 _mm_store_ps(&RE(ch[ah+i+2*l1*ido]), m22);
595 _mm_store_ps(&RE(ch[ah+i+3*l1*ido]), m30);
599 #endif
601 static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,
602 complex_t *ch, const complex_t *wa1, const complex_t *wa2,
603 const complex_t *wa3)
605 uint16_t i, k, ac, ah;
607 if (ido == 1)
609 for (k = 0; k < l1; k++)
611 complex_t t1, t2, t3, t4;
613 ac = 4*k;
614 ah = k;
616 RE(t2) = RE(cc[ac]) + RE(cc[ac+2]);
617 RE(t1) = RE(cc[ac]) - RE(cc[ac+2]);
618 IM(t2) = IM(cc[ac]) + IM(cc[ac+2]);
619 IM(t1) = IM(cc[ac]) - IM(cc[ac+2]);
620 RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);
621 IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);
622 IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);
623 RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);
625 RE(ch[ah]) = RE(t2) + RE(t3);
626 RE(ch[ah+2*l1]) = RE(t2) - RE(t3);
628 IM(ch[ah]) = IM(t2) + IM(t3);
629 IM(ch[ah+2*l1]) = IM(t2) - IM(t3);
631 RE(ch[ah+l1]) = RE(t1) + RE(t4);
632 RE(ch[ah+3*l1]) = RE(t1) - RE(t4);
634 IM(ch[ah+l1]) = IM(t1) + IM(t4);
635 IM(ch[ah+3*l1]) = IM(t1) - IM(t4);
637 } else {
638 for (k = 0; k < l1; k++)
640 ac = 4*k*ido;
641 ah = k*ido;
643 for (i = 0; i < ido; i++)
645 complex_t c2, c3, c4, t1, t2, t3, t4;
647 RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]);
648 RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]);
649 IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]);
650 IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]);
651 RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);
652 IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);
653 IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);
654 RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);
656 RE(c2) = RE(t1) + RE(t4);
657 RE(c4) = RE(t1) - RE(t4);
659 IM(c2) = IM(t1) + IM(t4);
660 IM(c4) = IM(t1) - IM(t4);
662 RE(ch[ah+i]) = RE(t2) + RE(t3);
663 RE(c3) = RE(t2) - RE(t3);
665 IM(ch[ah+i]) = IM(t2) + IM(t3);
666 IM(c3) = IM(t2) - IM(t3);
668 #if 1
669 ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
670 IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i]));
671 ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]),
672 IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i]));
673 ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]),
674 IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i]));
675 #else
676 ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
677 RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i]));
678 ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]),
679 RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i]));
680 ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]),
681 RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i]));
682 #endif
688 static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,
689 complex_t *ch, const complex_t *wa1, const complex_t *wa2,
690 const complex_t *wa3)
692 uint16_t i, k, ac, ah;
694 if (ido == 1)
696 for (k = 0; k < l1; k++)
698 complex_t t1, t2, t3, t4;
700 ac = 4*k;
701 ah = k;
703 RE(t2) = RE(cc[ac]) + RE(cc[ac+2]);
704 RE(t1) = RE(cc[ac]) - RE(cc[ac+2]);
705 IM(t2) = IM(cc[ac]) + IM(cc[ac+2]);
706 IM(t1) = IM(cc[ac]) - IM(cc[ac+2]);
707 RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);
708 IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);
709 IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);
710 RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);
712 RE(ch[ah]) = RE(t2) + RE(t3);
713 RE(ch[ah+2*l1]) = RE(t2) - RE(t3);
715 IM(ch[ah]) = IM(t2) + IM(t3);
716 IM(ch[ah+2*l1]) = IM(t2) - IM(t3);
718 RE(ch[ah+l1]) = RE(t1) - RE(t4);
719 RE(ch[ah+3*l1]) = RE(t1) + RE(t4);
721 IM(ch[ah+l1]) = IM(t1) - IM(t4);
722 IM(ch[ah+3*l1]) = IM(t1) + IM(t4);
724 } else {
725 for (k = 0; k < l1; k++)
727 ac = 4*k*ido;
728 ah = k*ido;
730 for (i = 0; i < ido; i++)
732 complex_t c2, c3, c4, t1, t2, t3, t4;
734 RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]);
735 RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]);
736 IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]);
737 IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]);
738 RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);
739 IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);
740 IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);
741 RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);
743 RE(c2) = RE(t1) - RE(t4);
744 RE(c4) = RE(t1) + RE(t4);
746 IM(c2) = IM(t1) - IM(t4);
747 IM(c4) = IM(t1) + IM(t4);
749 RE(ch[ah+i]) = RE(t2) + RE(t3);
750 RE(c3) = RE(t2) - RE(t3);
752 IM(ch[ah+i]) = IM(t2) + IM(t3);
753 IM(c3) = IM(t2) - IM(t3);
755 #if 1
756 ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
757 RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i]));
758 ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]),
759 RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i]));
760 ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]),
761 RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i]));
762 #else
763 ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
764 IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i]));
765 ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]),
766 IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i]));
767 ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]),
768 IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i]));
769 #endif
775 static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc,
776 complex_t *ch, const complex_t *wa1, const complex_t *wa2, const complex_t *wa3,
777 const complex_t *wa4, const int8_t isign)
779 static real_t tr11 = FRAC_CONST(0.309016994374947);
780 static real_t ti11 = FRAC_CONST(0.951056516295154);
781 static real_t tr12 = FRAC_CONST(-0.809016994374947);
782 static real_t ti12 = FRAC_CONST(0.587785252292473);
783 uint16_t i, k, ac, ah;
784 complex_t c2, c3, c4, c5, d3, d4, d5, d2, t2, t3, t4, t5;
786 if (ido == 1)
788 if (isign == 1)
790 for (k = 0; k < l1; k++)
792 ac = 5*k + 1;
793 ah = k;
795 RE(t2) = RE(cc[ac]) + RE(cc[ac+3]);
796 IM(t2) = IM(cc[ac]) + IM(cc[ac+3]);
797 RE(t3) = RE(cc[ac+1]) + RE(cc[ac+2]);
798 IM(t3) = IM(cc[ac+1]) + IM(cc[ac+2]);
799 RE(t4) = RE(cc[ac+1]) - RE(cc[ac+2]);
800 IM(t4) = IM(cc[ac+1]) - IM(cc[ac+2]);
801 RE(t5) = RE(cc[ac]) - RE(cc[ac+3]);
802 IM(t5) = IM(cc[ac]) - IM(cc[ac+3]);
804 RE(ch[ah]) = RE(cc[ac-1]) + RE(t2) + RE(t3);
805 IM(ch[ah]) = IM(cc[ac-1]) + IM(t2) + IM(t3);
807 RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
808 IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
809 RE(c3) = RE(cc[ac-1]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
810 IM(c3) = IM(cc[ac-1]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
812 ComplexMult(&RE(c5), &RE(c4),
813 ti11, ti12, RE(t5), RE(t4));
814 ComplexMult(&IM(c5), &IM(c4),
815 ti11, ti12, IM(t5), IM(t4));
817 RE(ch[ah+l1]) = RE(c2) - IM(c5);
818 IM(ch[ah+l1]) = IM(c2) + RE(c5);
819 RE(ch[ah+2*l1]) = RE(c3) - IM(c4);
820 IM(ch[ah+2*l1]) = IM(c3) + RE(c4);
821 RE(ch[ah+3*l1]) = RE(c3) + IM(c4);
822 IM(ch[ah+3*l1]) = IM(c3) - RE(c4);
823 RE(ch[ah+4*l1]) = RE(c2) + IM(c5);
824 IM(ch[ah+4*l1]) = IM(c2) - RE(c5);
826 } else {
827 for (k = 0; k < l1; k++)
829 ac = 5*k + 1;
830 ah = k;
832 RE(t2) = RE(cc[ac]) + RE(cc[ac+3]);
833 IM(t2) = IM(cc[ac]) + IM(cc[ac+3]);
834 RE(t3) = RE(cc[ac+1]) + RE(cc[ac+2]);
835 IM(t3) = IM(cc[ac+1]) + IM(cc[ac+2]);
836 RE(t4) = RE(cc[ac+1]) - RE(cc[ac+2]);
837 IM(t4) = IM(cc[ac+1]) - IM(cc[ac+2]);
838 RE(t5) = RE(cc[ac]) - RE(cc[ac+3]);
839 IM(t5) = IM(cc[ac]) - IM(cc[ac+3]);
841 RE(ch[ah]) = RE(cc[ac-1]) + RE(t2) + RE(t3);
842 IM(ch[ah]) = IM(cc[ac-1]) + IM(t2) + IM(t3);
844 RE(c2) = RE(cc[ac-1]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
845 IM(c2) = IM(cc[ac-1]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
846 RE(c3) = RE(cc[ac-1]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
847 IM(c3) = IM(cc[ac-1]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
849 ComplexMult(&RE(c4), &RE(c5),
850 ti12, ti11, RE(t5), RE(t4));
851 ComplexMult(&IM(c4), &IM(c5),
852 ti12, ti12, IM(t5), IM(t4));
854 RE(ch[ah+l1]) = RE(c2) + IM(c5);
855 IM(ch[ah+l1]) = IM(c2) - RE(c5);
856 RE(ch[ah+2*l1]) = RE(c3) + IM(c4);
857 IM(ch[ah+2*l1]) = IM(c3) - RE(c4);
858 RE(ch[ah+3*l1]) = RE(c3) - IM(c4);
859 IM(ch[ah+3*l1]) = IM(c3) + RE(c4);
860 RE(ch[ah+4*l1]) = RE(c2) - IM(c5);
861 IM(ch[ah+4*l1]) = IM(c2) + RE(c5);
864 } else {
865 if (isign == 1)
867 for (k = 0; k < l1; k++)
869 for (i = 0; i < ido; i++)
871 ac = i + (k*5 + 1) * ido;
872 ah = i + k * ido;
874 RE(t2) = RE(cc[ac]) + RE(cc[ac+3*ido]);
875 IM(t2) = IM(cc[ac]) + IM(cc[ac+3*ido]);
876 RE(t3) = RE(cc[ac+ido]) + RE(cc[ac+2*ido]);
877 IM(t3) = IM(cc[ac+ido]) + IM(cc[ac+2*ido]);
878 RE(t4) = RE(cc[ac+ido]) - RE(cc[ac+2*ido]);
879 IM(t4) = IM(cc[ac+ido]) - IM(cc[ac+2*ido]);
880 RE(t5) = RE(cc[ac]) - RE(cc[ac+3*ido]);
881 IM(t5) = IM(cc[ac]) - IM(cc[ac+3*ido]);
883 RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2) + RE(t3);
884 IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2) + IM(t3);
886 RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
887 IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
888 RE(c3) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
889 IM(c3) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
891 ComplexMult(&RE(c5), &RE(c4),
892 ti11, ti12, RE(t5), RE(t4));
893 ComplexMult(&IM(c5), &IM(c4),
894 ti11, ti12, IM(t5), IM(t4));
896 IM(d2) = IM(c2) + RE(c5);
897 IM(d3) = IM(c3) + RE(c4);
898 RE(d4) = RE(c3) + IM(c4);
899 RE(d5) = RE(c2) + IM(c5);
900 RE(d2) = RE(c2) - IM(c5);
901 IM(d5) = IM(c2) - RE(c5);
902 RE(d3) = RE(c3) - IM(c4);
903 IM(d4) = IM(c3) - RE(c4);
905 #if 1
906 ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
907 IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
908 ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
909 IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
910 ComplexMult(&IM(ch[ah+3*l1*ido]), &RE(ch[ah+3*l1*ido]),
911 IM(d4), RE(d4), RE(wa3[i]), IM(wa3[i]));
912 ComplexMult(&IM(ch[ah+4*l1*ido]), &RE(ch[ah+4*l1*ido]),
913 IM(d5), RE(d5), RE(wa4[i]), IM(wa4[i]));
914 #else
915 ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
916 RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
917 ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
918 RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
919 ComplexMult(&RE(ch[ah+3*l1*ido]), &IM(ch[ah+3*l1*ido]),
920 RE(d4), IM(d4), RE(wa3[i]), IM(wa3[i]));
921 ComplexMult(&RE(ch[ah+4*l1*ido]), &IM(ch[ah+4*l1*ido]),
922 RE(d5), IM(d5), RE(wa4[i]), IM(wa4[i]));
923 #endif
926 } else {
927 for (k = 0; k < l1; k++)
929 for (i = 0; i < ido; i++)
931 ac = i + (k*5 + 1) * ido;
932 ah = i + k * ido;
934 RE(t2) = RE(cc[ac]) + RE(cc[ac+3*ido]);
935 IM(t2) = IM(cc[ac]) + IM(cc[ac+3*ido]);
936 RE(t3) = RE(cc[ac+ido]) + RE(cc[ac+2*ido]);
937 IM(t3) = IM(cc[ac+ido]) + IM(cc[ac+2*ido]);
938 RE(t4) = RE(cc[ac+ido]) - RE(cc[ac+2*ido]);
939 IM(t4) = IM(cc[ac+ido]) - IM(cc[ac+2*ido]);
940 RE(t5) = RE(cc[ac]) - RE(cc[ac+3*ido]);
941 IM(t5) = IM(cc[ac]) - IM(cc[ac+3*ido]);
943 RE(ch[ah]) = RE(cc[ac-ido]) + RE(t2) + RE(t3);
944 IM(ch[ah]) = IM(cc[ac-ido]) + IM(t2) + IM(t3);
946 RE(c2) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr11) + MUL_F(RE(t3),tr12);
947 IM(c2) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr11) + MUL_F(IM(t3),tr12);
948 RE(c3) = RE(cc[ac-ido]) + MUL_F(RE(t2),tr12) + MUL_F(RE(t3),tr11);
949 IM(c3) = IM(cc[ac-ido]) + MUL_F(IM(t2),tr12) + MUL_F(IM(t3),tr11);
951 ComplexMult(&RE(c4), &RE(c5),
952 ti12, ti11, RE(t5), RE(t4));
953 ComplexMult(&IM(c4), &IM(c5),
954 ti12, ti12, IM(t5), IM(t4));
956 IM(d2) = IM(c2) - RE(c5);
957 IM(d3) = IM(c3) - RE(c4);
958 RE(d4) = RE(c3) - IM(c4);
959 RE(d5) = RE(c2) - IM(c5);
960 RE(d2) = RE(c2) + IM(c5);
961 IM(d5) = IM(c2) + RE(c5);
962 RE(d3) = RE(c3) + IM(c4);
963 IM(d4) = IM(c3) + RE(c4);
965 #if 1
966 ComplexMult(&RE(ch[ah+l1*ido]), &IM(ch[ah+l1*ido]),
967 RE(d2), IM(d2), RE(wa1[i]), IM(wa1[i]));
968 ComplexMult(&RE(ch[ah+2*l1*ido]), &IM(ch[ah+2*l1*ido]),
969 RE(d3), IM(d3), RE(wa2[i]), IM(wa2[i]));
970 ComplexMult(&RE(ch[ah+3*l1*ido]), &IM(ch[ah+3*l1*ido]),
971 RE(d4), IM(d4), RE(wa3[i]), IM(wa3[i]));
972 ComplexMult(&RE(ch[ah+4*l1*ido]), &IM(ch[ah+4*l1*ido]),
973 RE(d5), IM(d5), RE(wa4[i]), IM(wa4[i]));
974 #else
975 ComplexMult(&IM(ch[ah+l1*ido]), &RE(ch[ah+l1*ido]),
976 IM(d2), RE(d2), RE(wa1[i]), IM(wa1[i]));
977 ComplexMult(&IM(ch[ah+2*l1*ido]), &RE(ch[ah+2*l1*ido]),
978 IM(d3), RE(d3), RE(wa2[i]), IM(wa2[i]));
979 ComplexMult(&IM(ch[ah+3*l1*ido]), &RE(ch[ah+3*l1*ido]),
980 IM(d4), RE(d4), RE(wa3[i]), IM(wa3[i]));
981 ComplexMult(&IM(ch[ah+4*l1*ido]), &RE(ch[ah+4*l1*ido]),
982 IM(d5), RE(d5), RE(wa4[i]), IM(wa4[i]));
983 #endif
991 /*----------------------------------------------------------------------
992 cfftf1, cfftf, cfftb, cffti1, cffti. Complex FFTs.
993 ----------------------------------------------------------------------*/
995 #ifdef USE_SSE
997 #define CONV(A,B,C) ( (A<<2) | ((B & 0x1)<<1) | ((C==1)&0x1) )
999 static INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch,
1000 const uint16_t *ifac, const complex_t *wa,
1001 const int8_t isign)
1003 uint16_t i;
1004 uint16_t k1, l1, l2;
1005 uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
1007 nf = ifac[1];
1008 na = 0;
1009 l1 = 1;
1010 iw = 0;
1012 for (k1 = 2; k1 <= nf+1; k1++)
1014 ip = ifac[k1];
1015 l2 = ip*l1;
1016 ido = n / l2;
1017 idl1 = ido*l1;
1019 ix2 = iw + ido;
1020 ix3 = ix2 + ido;
1021 ix4 = ix3 + ido;
1023 switch (CONV(ip,na,ido))
1025 case CONV(4,0,0):
1026 //passf4pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
1027 passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
1028 break;
1029 case CONV(4,0,1):
1030 passf4pos_sse((const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
1031 break;
1032 case CONV(4,1,0):
1033 passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
1034 //passf4pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
1035 break;
1036 case CONV(4,1,1):
1037 passf4pos_sse((const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
1038 break;
1039 case CONV(2,0,0):
1040 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
1041 //passf2pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
1042 break;
1043 case CONV(2,0,1):
1044 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
1045 //passf2pos_sse((const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
1046 break;
1047 case CONV(2,1,0):
1048 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
1049 //passf2pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
1050 break;
1051 case CONV(2,1,1):
1052 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
1053 //passf2pos_sse((const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
1054 break;
1055 case CONV(3,0,0):
1056 case CONV(3,0,1):
1057 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
1058 break;
1059 case CONV(3,1,0):
1060 case CONV(3,1,1):
1061 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
1062 break;
1063 case CONV(5,0,0):
1064 case CONV(5,0,1):
1065 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1066 break;
1067 case CONV(5,1,0):
1068 case CONV(5,1,1):
1069 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1070 break;
1073 na = 1 - na;
1075 l1 = l2;
1076 iw += (ip-1) * ido;
1079 if (na == 0)
1080 return;
1082 for (i = 0; i < n; i++)
1084 RE(c[i]) = RE(ch[i]);
1085 IM(c[i]) = IM(ch[i]);
1088 #endif
1090 static INLINE void cfftf1pos(uint16_t n, complex_t *c, complex_t *ch,
1091 const uint16_t *ifac, const complex_t *wa,
1092 const int8_t isign)
1094 uint16_t i;
1095 uint16_t k1, l1, l2;
1096 uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
1098 nf = ifac[1];
1099 na = 0;
1100 l1 = 1;
1101 iw = 0;
1103 for (k1 = 2; k1 <= nf+1; k1++)
1105 ip = ifac[k1];
1106 l2 = ip*l1;
1107 ido = n / l2;
1108 idl1 = ido*l1;
1110 switch (ip)
1112 case 4:
1113 ix2 = iw + ido;
1114 ix3 = ix2 + ido;
1116 if (na == 0)
1117 passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
1118 else
1119 passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
1121 na = 1 - na;
1122 break;
1123 case 2:
1124 if (na == 0)
1125 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
1126 else
1127 passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
1129 na = 1 - na;
1130 break;
1131 case 3:
1132 ix2 = iw + ido;
1134 if (na == 0)
1135 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
1136 else
1137 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
1139 na = 1 - na;
1140 break;
1141 case 5:
1142 ix2 = iw + ido;
1143 ix3 = ix2 + ido;
1144 ix4 = ix3 + ido;
1146 if (na == 0)
1147 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1148 else
1149 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1151 na = 1 - na;
1152 break;
1155 l1 = l2;
1156 iw += (ip-1) * ido;
1159 if (na == 0)
1160 return;
1162 for (i = 0; i < n; i++)
1164 RE(c[i]) = RE(ch[i]);
1165 IM(c[i]) = IM(ch[i]);
1169 static INLINE void cfftf1neg(uint16_t n, complex_t *c, complex_t *ch,
1170 const uint16_t *ifac, const complex_t *wa,
1171 const int8_t isign)
1173 uint16_t i;
1174 uint16_t k1, l1, l2;
1175 uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
1177 nf = ifac[1];
1178 na = 0;
1179 l1 = 1;
1180 iw = 0;
1182 for (k1 = 2; k1 <= nf+1; k1++)
1184 ip = ifac[k1];
1185 l2 = ip*l1;
1186 ido = n / l2;
1187 idl1 = ido*l1;
1189 switch (ip)
1191 case 4:
1192 ix2 = iw + ido;
1193 ix3 = ix2 + ido;
1195 if (na == 0)
1196 passf4neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
1197 else
1198 passf4neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
1200 na = 1 - na;
1201 break;
1202 case 2:
1203 if (na == 0)
1204 passf2neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
1205 else
1206 passf2neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
1208 na = 1 - na;
1209 break;
1210 case 3:
1211 ix2 = iw + ido;
1213 if (na == 0)
1214 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
1215 else
1216 passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
1218 na = 1 - na;
1219 break;
1220 case 5:
1221 ix2 = iw + ido;
1222 ix3 = ix2 + ido;
1223 ix4 = ix3 + ido;
1225 if (na == 0)
1226 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1227 else
1228 passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1230 na = 1 - na;
1231 break;
1234 l1 = l2;
1235 iw += (ip-1) * ido;
1238 if (na == 0)
1239 return;
1241 for (i = 0; i < n; i++)
1243 RE(c[i]) = RE(ch[i]);
1244 IM(c[i]) = IM(ch[i]);
1248 void cfftf(cfft_info *cfft, complex_t *c)
1250 cfftf1neg(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, -1);
1253 void cfftb(cfft_info *cfft, complex_t *c)
1255 cfftf1pos(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1);
1258 #ifdef USE_SSE
1259 void cfftb_sse(cfft_info *cfft, complex_t *c)
1261 cfftf1pos_sse(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1);
1263 #endif
1265 static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac)
1267 static uint16_t ntryh[4] = {3, 4, 2, 5};
1268 #ifndef FIXED_POINT
1269 real_t arg, argh, argld, fi;
1270 uint16_t ido, ipm;
1271 uint16_t i1, k1, l1, l2;
1272 uint16_t ld, ii, ip;
1273 #endif
1274 uint16_t ntry = 0, i, j;
1275 uint16_t ib;
1276 uint16_t nf, nl, nq, nr;
1278 nl = n;
1279 nf = 0;
1280 j = 0;
1282 startloop:
1283 j++;
1285 if (j <= 4)
1286 ntry = ntryh[j-1];
1287 else
1288 ntry += 2;
1292 nq = nl / ntry;
1293 nr = nl - ntry*nq;
1295 if (nr != 0)
1296 goto startloop;
1298 nf++;
1299 ifac[nf+1] = ntry;
1300 nl = nq;
1302 if (ntry == 2 && nf != 1)
1304 for (i = 2; i <= nf; i++)
1306 ib = nf - i + 2;
1307 ifac[ib+1] = ifac[ib];
1309 ifac[2] = 2;
1311 } while (nl != 1);
1313 ifac[0] = n;
1314 ifac[1] = nf;
1316 #ifndef FIXED_POINT
1317 argh = (real_t)2.0*(real_t)M_PI / (real_t)n;
1318 i = 0;
1319 l1 = 1;
1321 for (k1 = 1; k1 <= nf; k1++)
1323 ip = ifac[k1+1];
1324 ld = 0;
1325 l2 = l1*ip;
1326 ido = n / l2;
1327 ipm = ip - 1;
1329 for (j = 0; j < ipm; j++)
1331 i1 = i;
1332 RE(wa[i]) = 1.0;
1333 IM(wa[i]) = 0.0;
1334 ld += l1;
1335 fi = 0;
1336 argld = ld*argh;
1338 for (ii = 0; ii < ido; ii++)
1340 i++;
1341 fi++;
1342 arg = fi * argld;
1343 RE(wa[i]) = (real_t)cos(arg);
1344 #if 1
1345 IM(wa[i]) = (real_t)sin(arg);
1346 #else
1347 IM(wa[i]) = (real_t)-sin(arg);
1348 #endif
1351 if (ip > 5)
1353 RE(wa[i1]) = RE(wa[i]);
1354 IM(wa[i1]) = IM(wa[i]);
1357 l1 = l2;
1359 #endif
1362 cfft_info *cffti(uint16_t n)
1364 cfft_info *cfft = (cfft_info*)faad_malloc(sizeof(cfft_info));
1366 cfft->n = n;
1367 cfft->work = (complex_t*)faad_malloc(n*sizeof(complex_t));
1369 #ifndef FIXED_POINT
1370 cfft->tab = (complex_t*)faad_malloc(n*sizeof(complex_t));
1372 cffti1(n, cfft->tab, cfft->ifac);
1373 #else
1374 cffti1(n, NULL, cfft->ifac);
1376 switch (n)
1378 case 64: cfft->tab = (complex_t*)cfft_tab_64; break;
1379 case 512: cfft->tab = (complex_t*)cfft_tab_512; break;
1380 #ifdef LD_DEC
1381 case 256: cfft->tab = (complex_t*)cfft_tab_256; break;
1382 #endif
1384 #ifdef ALLOW_SMALL_FRAMELENGTH
1385 case 60: cfft->tab = (complex_t*)cfft_tab_60; break;
1386 case 480: cfft->tab = (complex_t*)cfft_tab_480; break;
1387 #ifdef LD_DEC
1388 case 240: cfft->tab = (complex_t*)cfft_tab_240; break;
1389 #endif
1390 #endif
1392 #endif
1394 return cfft;
1397 void cfftu(cfft_info *cfft)
1399 if (cfft->work) faad_free(cfft->work);
1400 #ifndef FIXED_POINT
1401 if (cfft->tab) faad_free(cfft->tab);
1402 #endif
1404 if (cfft) faad_free(cfft);