2 ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
3 ** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
5 ** This program is free software; you can redistribute it and/or modify
6 ** it under the terms of the GNU General Public License as published by
7 ** the Free Software Foundation; either version 2 of the License, or
8 ** (at your option) any later version.
10 ** This program is distributed in the hope that it will be useful,
11 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ** GNU General Public License for more details.
15 ** You should have received a copy of the GNU General Public License
16 ** along with this program; if not, write to the Free Software
17 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 ** Any non-GPL usage of this software or parts of this software is strictly
22 ** Commercial non-GPL licensing of this software is possible.
23 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
25 ** $Id: cfft.c,v 1.27 2004/06/30 12:45:55 menno Exp $
29 * Algorithmically based on Fortran-77 FFTPACK
30 * by Paul N. Swarztrauber(Version 4, 1985).
32 * Does even sized fft only
35 /* isign is +1 for backward and -1 for forward transforms */
46 /* static function declarations */
48 static void passf2pos_sse(const uint16_t l1
, const complex_t
*cc
,
49 complex_t
*ch
, const complex_t
*wa
);
50 static void passf2pos_sse_ido(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
,
51 complex_t
*ch
, const complex_t
*wa
);
52 static void passf4pos_sse_ido(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
, complex_t
*ch
,
53 const complex_t
*wa1
, const complex_t
*wa2
, const complex_t
*wa3
);
55 static void passf2pos(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
,
56 complex_t
*ch
, const complex_t
*wa
);
57 static void passf2neg(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
,
58 complex_t
*ch
, const complex_t
*wa
);
59 static void passf3(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
,
60 complex_t
*ch
, const complex_t
*wa1
, const complex_t
*wa2
, const int8_t isign
);
61 static void passf4pos(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
, complex_t
*ch
,
62 const complex_t
*wa1
, const complex_t
*wa2
, const complex_t
*wa3
);
63 static void passf4neg(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
, complex_t
*ch
,
64 const complex_t
*wa1
, const complex_t
*wa2
, const complex_t
*wa3
);
65 static void passf5(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
, complex_t
*ch
,
66 const complex_t
*wa1
, const complex_t
*wa2
, const complex_t
*wa3
,
67 const complex_t
*wa4
, const int8_t isign
);
68 INLINE
void cfftf1(uint16_t n
, complex_t
*c
, complex_t
*ch
,
69 const uint16_t *ifac
, const complex_t
*wa
, const int8_t isign
);
70 static void cffti1(uint16_t n
, complex_t
*wa
, uint16_t *ifac
);
73 /*----------------------------------------------------------------------
74 passf2, passf3, passf4, passf5. Complex FFT passes fwd and bwd.
75 ----------------------------------------------------------------------*/
78 static void passf2pos_sse(const uint16_t l1
, const complex_t
*cc
,
79 complex_t
*ch
, const complex_t
*wa
)
83 for (k
= 0; k
< l1
; k
++)
88 RE(ch
[ah
]) = RE(cc
[ac
]) + RE(cc
[ac
+1]);
89 IM(ch
[ah
]) = IM(cc
[ac
]) + IM(cc
[ac
+1]);
91 RE(ch
[ah
+l1
]) = RE(cc
[ac
]) - RE(cc
[ac
+1]);
92 IM(ch
[ah
+l1
]) = IM(cc
[ac
]) - IM(cc
[ac
+1]);
96 static void passf2pos_sse_ido(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
,
97 complex_t
*ch
, const complex_t
*wa
)
99 uint16_t i
, k
, ah
, ac
;
101 for (k
= 0; k
< l1
; k
++)
106 for (i
= 0; i
< ido
; i
+=4)
108 __m128 m1
, m2
, m3
, m4
, m5
, m6
, m7
, m8
, m9
, m10
, m11
, m12
, m13
, m14
;
109 __m128 m15
, m16
, m17
, m18
, m19
, m20
, m21
, m22
, m23
, m24
;
110 __m128 w1
, w2
, w3
, w4
;
112 m1
= _mm_load_ps(&RE(cc
[ac
+i
]));
113 m2
= _mm_load_ps(&RE(cc
[ac
+ido
+i
]));
114 m5
= _mm_load_ps(&RE(cc
[ac
+i
+2]));
115 m6
= _mm_load_ps(&RE(cc
[ac
+ido
+i
+2]));
116 w1
= _mm_load_ps(&RE(wa
[i
]));
117 w3
= _mm_load_ps(&RE(wa
[i
+2]));
119 m3
= _mm_add_ps(m1
, m2
);
120 m15
= _mm_add_ps(m5
, m6
);
122 m4
= _mm_sub_ps(m1
, m2
);
123 m16
= _mm_sub_ps(m5
, m6
);
125 _mm_store_ps(&RE(ch
[ah
+i
]), m3
);
126 _mm_store_ps(&RE(ch
[ah
+i
+2]), m15
);
129 w2
= _mm_shuffle_ps(w1
, w1
, _MM_SHUFFLE(2, 3, 0, 1));
130 w4
= _mm_shuffle_ps(w3
, w3
, _MM_SHUFFLE(2, 3, 0, 1));
132 m7
= _mm_mul_ps(m4
, w1
);
133 m17
= _mm_mul_ps(m16
, w3
);
134 m8
= _mm_mul_ps(m4
, w2
);
135 m18
= _mm_mul_ps(m16
, w4
);
137 m9
= _mm_shuffle_ps(m7
, m8
, _MM_SHUFFLE(2, 0, 2, 0));
138 m19
= _mm_shuffle_ps(m17
, m18
, _MM_SHUFFLE(2, 0, 2, 0));
139 m10
= _mm_shuffle_ps(m7
, m8
, _MM_SHUFFLE(3, 1, 3, 1));
140 m20
= _mm_shuffle_ps(m17
, m18
, _MM_SHUFFLE(3, 1, 3, 1));
142 m11
= _mm_add_ps(m9
, m10
);
143 m21
= _mm_add_ps(m19
, m20
);
144 m12
= _mm_sub_ps(m9
, m10
);
145 m22
= _mm_sub_ps(m19
, m20
);
147 m13
= _mm_shuffle_ps(m11
, m11
, _MM_SHUFFLE(0, 0, 3, 2));
148 m23
= _mm_shuffle_ps(m21
, m21
, _MM_SHUFFLE(0, 0, 3, 2));
150 m14
= _mm_unpacklo_ps(m12
, m13
);
151 m24
= _mm_unpacklo_ps(m22
, m23
);
153 _mm_store_ps(&RE(ch
[ah
+i
+l1
*ido
]), m14
);
154 _mm_store_ps(&RE(ch
[ah
+i
+2+l1
*ido
]), m24
);
160 static void passf2pos(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
,
161 complex_t
*ch
, const complex_t
*wa
)
163 uint16_t i
, k
, ah
, ac
;
167 for (k
= 0; k
< l1
; k
++)
172 RE(ch
[ah
]) = RE(cc
[ac
]) + RE(cc
[ac
+1]);
173 RE(ch
[ah
+l1
]) = RE(cc
[ac
]) - RE(cc
[ac
+1]);
174 IM(ch
[ah
]) = IM(cc
[ac
]) + IM(cc
[ac
+1]);
175 IM(ch
[ah
+l1
]) = IM(cc
[ac
]) - IM(cc
[ac
+1]);
178 for (k
= 0; k
< l1
; k
++)
183 for (i
= 0; i
< ido
; i
++)
187 RE(ch
[ah
+i
]) = RE(cc
[ac
+i
]) + RE(cc
[ac
+i
+ido
]);
188 RE(t2
) = RE(cc
[ac
+i
]) - RE(cc
[ac
+i
+ido
]);
190 IM(ch
[ah
+i
]) = IM(cc
[ac
+i
]) + IM(cc
[ac
+i
+ido
]);
191 IM(t2
) = IM(cc
[ac
+i
]) - IM(cc
[ac
+i
+ido
]);
194 ComplexMult(&IM(ch
[ah
+i
+l1
*ido
]), &RE(ch
[ah
+i
+l1
*ido
]),
195 IM(t2
), RE(t2
), RE(wa
[i
]), IM(wa
[i
]));
197 ComplexMult(&RE(ch
[ah
+i
+l1
*ido
]), &IM(ch
[ah
+i
+l1
*ido
]),
198 RE(t2
), IM(t2
), RE(wa
[i
]), IM(wa
[i
]));
205 static void passf2neg(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
,
206 complex_t
*ch
, const complex_t
*wa
)
208 uint16_t i
, k
, ah
, ac
;
212 for (k
= 0; k
< l1
; k
++)
217 RE(ch
[ah
]) = RE(cc
[ac
]) + RE(cc
[ac
+1]);
218 RE(ch
[ah
+l1
]) = RE(cc
[ac
]) - RE(cc
[ac
+1]);
219 IM(ch
[ah
]) = IM(cc
[ac
]) + IM(cc
[ac
+1]);
220 IM(ch
[ah
+l1
]) = IM(cc
[ac
]) - IM(cc
[ac
+1]);
223 for (k
= 0; k
< l1
; k
++)
228 for (i
= 0; i
< ido
; i
++)
232 RE(ch
[ah
+i
]) = RE(cc
[ac
+i
]) + RE(cc
[ac
+i
+ido
]);
233 RE(t2
) = RE(cc
[ac
+i
]) - RE(cc
[ac
+i
+ido
]);
235 IM(ch
[ah
+i
]) = IM(cc
[ac
+i
]) + IM(cc
[ac
+i
+ido
]);
236 IM(t2
) = IM(cc
[ac
+i
]) - IM(cc
[ac
+i
+ido
]);
239 ComplexMult(&RE(ch
[ah
+i
+l1
*ido
]), &IM(ch
[ah
+i
+l1
*ido
]),
240 RE(t2
), IM(t2
), RE(wa
[i
]), IM(wa
[i
]));
242 ComplexMult(&IM(ch
[ah
+i
+l1
*ido
]), &RE(ch
[ah
+i
+l1
*ido
]),
243 IM(t2
), RE(t2
), RE(wa
[i
]), IM(wa
[i
]));
251 static void passf3(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
,
252 complex_t
*ch
, const complex_t
*wa1
, const complex_t
*wa2
,
255 static real_t taur
= FRAC_CONST(-0.5);
256 static real_t taui
= FRAC_CONST(0.866025403784439);
257 uint16_t i
, k
, ac
, ah
;
258 complex_t c2
, c3
, d2
, d3
, t2
;
264 for (k
= 0; k
< l1
; k
++)
269 RE(t2
) = RE(cc
[ac
]) + RE(cc
[ac
+1]);
270 IM(t2
) = IM(cc
[ac
]) + IM(cc
[ac
+1]);
271 RE(c2
) = RE(cc
[ac
-1]) + MUL_F(RE(t2
),taur
);
272 IM(c2
) = IM(cc
[ac
-1]) + MUL_F(IM(t2
),taur
);
274 RE(ch
[ah
]) = RE(cc
[ac
-1]) + RE(t2
);
275 IM(ch
[ah
]) = IM(cc
[ac
-1]) + IM(t2
);
277 RE(c3
) = MUL_F((RE(cc
[ac
]) - RE(cc
[ac
+1])), taui
);
278 IM(c3
) = MUL_F((IM(cc
[ac
]) - IM(cc
[ac
+1])), taui
);
280 RE(ch
[ah
+l1
]) = RE(c2
) - IM(c3
);
281 IM(ch
[ah
+l1
]) = IM(c2
) + RE(c3
);
282 RE(ch
[ah
+2*l1
]) = RE(c2
) + IM(c3
);
283 IM(ch
[ah
+2*l1
]) = IM(c2
) - RE(c3
);
286 for (k
= 0; k
< l1
; k
++)
291 RE(t2
) = RE(cc
[ac
]) + RE(cc
[ac
+1]);
292 IM(t2
) = IM(cc
[ac
]) + IM(cc
[ac
+1]);
293 RE(c2
) = RE(cc
[ac
-1]) + MUL_F(RE(t2
),taur
);
294 IM(c2
) = IM(cc
[ac
-1]) + MUL_F(IM(t2
),taur
);
296 RE(ch
[ah
]) = RE(cc
[ac
-1]) + RE(t2
);
297 IM(ch
[ah
]) = IM(cc
[ac
-1]) + IM(t2
);
299 RE(c3
) = MUL_F((RE(cc
[ac
]) - RE(cc
[ac
+1])), taui
);
300 IM(c3
) = MUL_F((IM(cc
[ac
]) - IM(cc
[ac
+1])), taui
);
302 RE(ch
[ah
+l1
]) = RE(c2
) + IM(c3
);
303 IM(ch
[ah
+l1
]) = IM(c2
) - RE(c3
);
304 RE(ch
[ah
+2*l1
]) = RE(c2
) - IM(c3
);
305 IM(ch
[ah
+2*l1
]) = IM(c2
) + RE(c3
);
311 for (k
= 0; k
< l1
; k
++)
313 for (i
= 0; i
< ido
; i
++)
315 ac
= i
+ (3*k
+1)*ido
;
318 RE(t2
) = RE(cc
[ac
]) + RE(cc
[ac
+ido
]);
319 RE(c2
) = RE(cc
[ac
-ido
]) + MUL_F(RE(t2
),taur
);
320 IM(t2
) = IM(cc
[ac
]) + IM(cc
[ac
+ido
]);
321 IM(c2
) = IM(cc
[ac
-ido
]) + MUL_F(IM(t2
),taur
);
323 RE(ch
[ah
]) = RE(cc
[ac
-ido
]) + RE(t2
);
324 IM(ch
[ah
]) = IM(cc
[ac
-ido
]) + IM(t2
);
326 RE(c3
) = MUL_F((RE(cc
[ac
]) - RE(cc
[ac
+ido
])), taui
);
327 IM(c3
) = MUL_F((IM(cc
[ac
]) - IM(cc
[ac
+ido
])), taui
);
329 RE(d2
) = RE(c2
) - IM(c3
);
330 IM(d3
) = IM(c2
) - RE(c3
);
331 RE(d3
) = RE(c2
) + IM(c3
);
332 IM(d2
) = IM(c2
) + RE(c3
);
335 ComplexMult(&IM(ch
[ah
+l1
*ido
]), &RE(ch
[ah
+l1
*ido
]),
336 IM(d2
), RE(d2
), RE(wa1
[i
]), IM(wa1
[i
]));
337 ComplexMult(&IM(ch
[ah
+2*l1
*ido
]), &RE(ch
[ah
+2*l1
*ido
]),
338 IM(d3
), RE(d3
), RE(wa2
[i
]), IM(wa2
[i
]));
340 ComplexMult(&RE(ch
[ah
+l1
*ido
]), &IM(ch
[ah
+l1
*ido
]),
341 RE(d2
), IM(d2
), RE(wa1
[i
]), IM(wa1
[i
]));
342 ComplexMult(&RE(ch
[ah
+2*l1
*ido
]), &IM(ch
[ah
+2*l1
*ido
]),
343 RE(d3
), IM(d3
), RE(wa2
[i
]), IM(wa2
[i
]));
348 for (k
= 0; k
< l1
; k
++)
350 for (i
= 0; i
< ido
; i
++)
352 ac
= i
+ (3*k
+1)*ido
;
355 RE(t2
) = RE(cc
[ac
]) + RE(cc
[ac
+ido
]);
356 RE(c2
) = RE(cc
[ac
-ido
]) + MUL_F(RE(t2
),taur
);
357 IM(t2
) = IM(cc
[ac
]) + IM(cc
[ac
+ido
]);
358 IM(c2
) = IM(cc
[ac
-ido
]) + MUL_F(IM(t2
),taur
);
360 RE(ch
[ah
]) = RE(cc
[ac
-ido
]) + RE(t2
);
361 IM(ch
[ah
]) = IM(cc
[ac
-ido
]) + IM(t2
);
363 RE(c3
) = MUL_F((RE(cc
[ac
]) - RE(cc
[ac
+ido
])), taui
);
364 IM(c3
) = MUL_F((IM(cc
[ac
]) - IM(cc
[ac
+ido
])), taui
);
366 RE(d2
) = RE(c2
) + IM(c3
);
367 IM(d3
) = IM(c2
) + RE(c3
);
368 RE(d3
) = RE(c2
) - IM(c3
);
369 IM(d2
) = IM(c2
) - RE(c3
);
372 ComplexMult(&RE(ch
[ah
+l1
*ido
]), &IM(ch
[ah
+l1
*ido
]),
373 RE(d2
), IM(d2
), RE(wa1
[i
]), IM(wa1
[i
]));
374 ComplexMult(&RE(ch
[ah
+2*l1
*ido
]), &IM(ch
[ah
+2*l1
*ido
]),
375 RE(d3
), IM(d3
), RE(wa2
[i
]), IM(wa2
[i
]));
377 ComplexMult(&IM(ch
[ah
+l1
*ido
]), &RE(ch
[ah
+l1
*ido
]),
378 IM(d2
), RE(d2
), RE(wa1
[i
]), IM(wa1
[i
]));
379 ComplexMult(&IM(ch
[ah
+2*l1
*ido
]), &RE(ch
[ah
+2*l1
*ido
]),
380 IM(d3
), RE(d3
), RE(wa2
[i
]), IM(wa2
[i
]));
389 ALIGN
static const int32_t negate
[4] = { 0x0, 0x0, 0x0, 0x80000000 };
391 __declspec(naked
) static void passf4pos_sse(const uint16_t l1
, const complex_t
*cc
,
392 complex_t
*ch
, const complex_t
*wa1
, const complex_t
*wa2
,
393 const complex_t
*wa3
)
402 movzx edi
, WORD PTR
[ebx
+8]
404 movaps xmm1
, XMMWORD PTR negate
409 lea esi
, DWORD PTR
[edi
+edi
]
415 mov eax
, DWORD PTR
[ebx
+16]
417 lea ecx
, DWORD PTR
[edi
+edi
]
422 lea edx
, DWORD PTR
[edi
+edi
]
427 mov DWORD PTR
[esp
], ebp
428 mov ebp
, DWORD PTR
[ebx
+12]
431 lea edi
, DWORD PTR
[eax
+eax
]
433 movaps xmm2
, XMMWORD PTR
[ebp
+edi
*8]
434 movaps xmm0
, XMMWORD PTR
[ebp
+edi
*8+16]
435 movaps xmm7
, XMMWORD PTR
[ebp
+edi
*8+32]
436 movaps xmm5
, XMMWORD PTR
[ebp
+edi
*8+48]
448 shufps xmm0
, xmm7
, 68
450 shufps xmm6
, xmm7
, 238
452 shufps xmm5
, xmm4
, 68
455 shufps xmm2
, xmm4
, 187
459 mov edi
, DWORD PTR
[ebx
+16]
460 movaps XMMWORD PTR
[edi
+eax
*8], xmm3
462 movaps XMMWORD PTR
[edx
+eax
*8], xmm4
463 movaps XMMWORD PTR
[ecx
+eax
*8], xmm0
464 movaps XMMWORD PTR
[esi
+eax
*8], xmm5
467 movzx edi
, WORD PTR
[ebx
+8]
471 mov ebp
, DWORD PTR
[esp
]
486 static void passf4pos_sse_ido(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
,
487 complex_t
*ch
, const complex_t
*wa1
, const complex_t
*wa2
,
488 const complex_t
*wa3
)
490 uint16_t i
, k
, ac
, ah
;
492 for (k
= 0; k
< l1
; k
++)
497 for (i
= 0; i
< ido
; i
+=2)
499 __m128 m1
, m2
, m3
, m4
, m5
, m6
, m7
, m8
, m9
, m10
, m11
, m12
, m13
, m14
, m15
, m16
;
500 __m128 n1
, n2
, n3
, n4
, n5
, n6
, n7
, n8
, n9
, m17
, m18
, m19
, m20
, m21
, m22
, m23
;
501 __m128 w1
, w2
, w3
, w4
, w5
, w6
, m24
, m25
, m26
, m27
, m28
, m29
, m30
;
502 __m128 neg1
= _mm_set_ps(-1.0, 1.0, -1.0, 1.0);
504 m1
= _mm_load_ps(&RE(cc
[ac
+i
]));
505 m2
= _mm_load_ps(&RE(cc
[ac
+i
+2*ido
]));
506 m3
= _mm_add_ps(m1
, m2
);
507 m4
= _mm_sub_ps(m1
, m2
);
509 n1
= _mm_load_ps(&RE(cc
[ac
+i
+ido
]));
510 n2
= _mm_load_ps(&RE(cc
[ac
+i
+3*ido
]));
511 n3
= _mm_add_ps(n1
, n2
);
513 n4
= _mm_mul_ps(neg1
, n1
);
514 n5
= _mm_mul_ps(neg1
, n2
);
515 n6
= _mm_sub_ps(n4
, n5
);
517 m5
= _mm_add_ps(m3
, n3
);
519 n7
= _mm_shuffle_ps(n6
, n6
, _MM_SHUFFLE(2, 3, 0, 1));
520 n8
= _mm_add_ps(m4
, n7
);
522 m6
= _mm_sub_ps(m3
, n3
);
523 n9
= _mm_sub_ps(m4
, n7
);
525 _mm_store_ps(&RE(ch
[ah
+i
]), m5
);
528 static INLINE
void ComplexMult(real_t
*y1
, real_t
*y2
,
529 real_t x1
, real_t x2
, real_t c1
, real_t c2
)
531 *y1
= MUL_F(x1
, c1
) + MUL_F(x2
, c2
);
532 *y2
= MUL_F(x2
, c1
) - MUL_F(x1
, c2
);
535 m7
.0
= RE(c2
)*RE(wa1
[i
])
536 m7
.1
= IM(c2
)*IM(wa1
[i
])
537 m7
.2
= RE(c6
)*RE(wa1
[i
+1])
538 m7
.3
= IM(c6
)*IM(wa1
[i
+1])
540 m8
.0
= RE(c2
)*IM(wa1
[i
])
541 m8
.1
= IM(c2
)*RE(wa1
[i
])
542 m8
.2
= RE(c6
)*IM(wa1
[i
+1])
543 m8
.3
= IM(c6
)*RE(wa1
[i
+1])
551 RE(0) = RE(c2
)*RE(wa1
[i
]) - IM(c2
)*IM(wa1
[i
])
552 IM(0) = RE(c2
)*IM(wa1
[i
]) + IM(c2
)*RE(wa1
[i
])
553 RE(1) = RE(c6
)*RE(wa1
[i
+1]) - IM(c6
)*IM(wa1
[i
+1])
554 IM(1) = RE(c6
)*IM(wa1
[i
+1]) + IM(c6
)*RE(wa1
[i
+1])
557 w1
= _mm_load_ps(&RE(wa1
[i
]));
558 w3
= _mm_load_ps(&RE(wa2
[i
]));
559 w5
= _mm_load_ps(&RE(wa3
[i
]));
561 w2
= _mm_shuffle_ps(w1
, w1
, _MM_SHUFFLE(2, 3, 0, 1));
562 w4
= _mm_shuffle_ps(w3
, w3
, _MM_SHUFFLE(2, 3, 0, 1));
563 w6
= _mm_shuffle_ps(w5
, w5
, _MM_SHUFFLE(2, 3, 0, 1));
565 m7
= _mm_mul_ps(n8
, w1
);
566 m15
= _mm_mul_ps(m6
, w3
);
567 m23
= _mm_mul_ps(n9
, w5
);
568 m8
= _mm_mul_ps(n8
, w2
);
569 m16
= _mm_mul_ps(m6
, w4
);
570 m24
= _mm_mul_ps(n9
, w6
);
572 m9
= _mm_shuffle_ps(m7
, m8
, _MM_SHUFFLE(2, 0, 2, 0));
573 m17
= _mm_shuffle_ps(m15
, m16
, _MM_SHUFFLE(2, 0, 2, 0));
574 m25
= _mm_shuffle_ps(m23
, m24
, _MM_SHUFFLE(2, 0, 2, 0));
575 m10
= _mm_shuffle_ps(m7
, m8
, _MM_SHUFFLE(3, 1, 3, 1));
576 m18
= _mm_shuffle_ps(m15
, m16
, _MM_SHUFFLE(3, 1, 3, 1));
577 m26
= _mm_shuffle_ps(m23
, m24
, _MM_SHUFFLE(3, 1, 3, 1));
579 m11
= _mm_add_ps(m9
, m10
);
580 m19
= _mm_add_ps(m17
, m18
);
581 m27
= _mm_add_ps(m25
, m26
);
582 m12
= _mm_sub_ps(m9
, m10
);
583 m20
= _mm_sub_ps(m17
, m18
);
584 m28
= _mm_sub_ps(m25
, m26
);
586 m13
= _mm_shuffle_ps(m11
, m11
, _MM_SHUFFLE(0, 0, 3, 2));
587 m21
= _mm_shuffle_ps(m19
, m19
, _MM_SHUFFLE(0, 0, 3, 2));
588 m29
= _mm_shuffle_ps(m27
, m27
, _MM_SHUFFLE(0, 0, 3, 2));
589 m14
= _mm_unpacklo_ps(m12
, m13
);
590 m22
= _mm_unpacklo_ps(m20
, m21
);
591 m30
= _mm_unpacklo_ps(m28
, m29
);
593 _mm_store_ps(&RE(ch
[ah
+i
+l1
*ido
]), m14
);
594 _mm_store_ps(&RE(ch
[ah
+i
+2*l1
*ido
]), m22
);
595 _mm_store_ps(&RE(ch
[ah
+i
+3*l1
*ido
]), m30
);
601 static void passf4pos(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
,
602 complex_t
*ch
, const complex_t
*wa1
, const complex_t
*wa2
,
603 const complex_t
*wa3
)
605 uint16_t i
, k
, ac
, ah
;
609 for (k
= 0; k
< l1
; k
++)
611 complex_t t1
, t2
, t3
, t4
;
616 RE(t2
) = RE(cc
[ac
]) + RE(cc
[ac
+2]);
617 RE(t1
) = RE(cc
[ac
]) - RE(cc
[ac
+2]);
618 IM(t2
) = IM(cc
[ac
]) + IM(cc
[ac
+2]);
619 IM(t1
) = IM(cc
[ac
]) - IM(cc
[ac
+2]);
620 RE(t3
) = RE(cc
[ac
+1]) + RE(cc
[ac
+3]);
621 IM(t4
) = RE(cc
[ac
+1]) - RE(cc
[ac
+3]);
622 IM(t3
) = IM(cc
[ac
+3]) + IM(cc
[ac
+1]);
623 RE(t4
) = IM(cc
[ac
+3]) - IM(cc
[ac
+1]);
625 RE(ch
[ah
]) = RE(t2
) + RE(t3
);
626 RE(ch
[ah
+2*l1
]) = RE(t2
) - RE(t3
);
628 IM(ch
[ah
]) = IM(t2
) + IM(t3
);
629 IM(ch
[ah
+2*l1
]) = IM(t2
) - IM(t3
);
631 RE(ch
[ah
+l1
]) = RE(t1
) + RE(t4
);
632 RE(ch
[ah
+3*l1
]) = RE(t1
) - RE(t4
);
634 IM(ch
[ah
+l1
]) = IM(t1
) + IM(t4
);
635 IM(ch
[ah
+3*l1
]) = IM(t1
) - IM(t4
);
638 for (k
= 0; k
< l1
; k
++)
643 for (i
= 0; i
< ido
; i
++)
645 complex_t c2
, c3
, c4
, t1
, t2
, t3
, t4
;
647 RE(t2
) = RE(cc
[ac
+i
]) + RE(cc
[ac
+i
+2*ido
]);
648 RE(t1
) = RE(cc
[ac
+i
]) - RE(cc
[ac
+i
+2*ido
]);
649 IM(t2
) = IM(cc
[ac
+i
]) + IM(cc
[ac
+i
+2*ido
]);
650 IM(t1
) = IM(cc
[ac
+i
]) - IM(cc
[ac
+i
+2*ido
]);
651 RE(t3
) = RE(cc
[ac
+i
+ido
]) + RE(cc
[ac
+i
+3*ido
]);
652 IM(t4
) = RE(cc
[ac
+i
+ido
]) - RE(cc
[ac
+i
+3*ido
]);
653 IM(t3
) = IM(cc
[ac
+i
+3*ido
]) + IM(cc
[ac
+i
+ido
]);
654 RE(t4
) = IM(cc
[ac
+i
+3*ido
]) - IM(cc
[ac
+i
+ido
]);
656 RE(c2
) = RE(t1
) + RE(t4
);
657 RE(c4
) = RE(t1
) - RE(t4
);
659 IM(c2
) = IM(t1
) + IM(t4
);
660 IM(c4
) = IM(t1
) - IM(t4
);
662 RE(ch
[ah
+i
]) = RE(t2
) + RE(t3
);
663 RE(c3
) = RE(t2
) - RE(t3
);
665 IM(ch
[ah
+i
]) = IM(t2
) + IM(t3
);
666 IM(c3
) = IM(t2
) - IM(t3
);
669 ComplexMult(&IM(ch
[ah
+i
+l1
*ido
]), &RE(ch
[ah
+i
+l1
*ido
]),
670 IM(c2
), RE(c2
), RE(wa1
[i
]), IM(wa1
[i
]));
671 ComplexMult(&IM(ch
[ah
+i
+2*l1
*ido
]), &RE(ch
[ah
+i
+2*l1
*ido
]),
672 IM(c3
), RE(c3
), RE(wa2
[i
]), IM(wa2
[i
]));
673 ComplexMult(&IM(ch
[ah
+i
+3*l1
*ido
]), &RE(ch
[ah
+i
+3*l1
*ido
]),
674 IM(c4
), RE(c4
), RE(wa3
[i
]), IM(wa3
[i
]));
676 ComplexMult(&RE(ch
[ah
+i
+l1
*ido
]), &IM(ch
[ah
+i
+l1
*ido
]),
677 RE(c2
), IM(c2
), RE(wa1
[i
]), IM(wa1
[i
]));
678 ComplexMult(&RE(ch
[ah
+i
+2*l1
*ido
]), &IM(ch
[ah
+i
+2*l1
*ido
]),
679 RE(c3
), IM(c3
), RE(wa2
[i
]), IM(wa2
[i
]));
680 ComplexMult(&RE(ch
[ah
+i
+3*l1
*ido
]), &IM(ch
[ah
+i
+3*l1
*ido
]),
681 RE(c4
), IM(c4
), RE(wa3
[i
]), IM(wa3
[i
]));
688 static void passf4neg(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
,
689 complex_t
*ch
, const complex_t
*wa1
, const complex_t
*wa2
,
690 const complex_t
*wa3
)
692 uint16_t i
, k
, ac
, ah
;
696 for (k
= 0; k
< l1
; k
++)
698 complex_t t1
, t2
, t3
, t4
;
703 RE(t2
) = RE(cc
[ac
]) + RE(cc
[ac
+2]);
704 RE(t1
) = RE(cc
[ac
]) - RE(cc
[ac
+2]);
705 IM(t2
) = IM(cc
[ac
]) + IM(cc
[ac
+2]);
706 IM(t1
) = IM(cc
[ac
]) - IM(cc
[ac
+2]);
707 RE(t3
) = RE(cc
[ac
+1]) + RE(cc
[ac
+3]);
708 IM(t4
) = RE(cc
[ac
+1]) - RE(cc
[ac
+3]);
709 IM(t3
) = IM(cc
[ac
+3]) + IM(cc
[ac
+1]);
710 RE(t4
) = IM(cc
[ac
+3]) - IM(cc
[ac
+1]);
712 RE(ch
[ah
]) = RE(t2
) + RE(t3
);
713 RE(ch
[ah
+2*l1
]) = RE(t2
) - RE(t3
);
715 IM(ch
[ah
]) = IM(t2
) + IM(t3
);
716 IM(ch
[ah
+2*l1
]) = IM(t2
) - IM(t3
);
718 RE(ch
[ah
+l1
]) = RE(t1
) - RE(t4
);
719 RE(ch
[ah
+3*l1
]) = RE(t1
) + RE(t4
);
721 IM(ch
[ah
+l1
]) = IM(t1
) - IM(t4
);
722 IM(ch
[ah
+3*l1
]) = IM(t1
) + IM(t4
);
725 for (k
= 0; k
< l1
; k
++)
730 for (i
= 0; i
< ido
; i
++)
732 complex_t c2
, c3
, c4
, t1
, t2
, t3
, t4
;
734 RE(t2
) = RE(cc
[ac
+i
]) + RE(cc
[ac
+i
+2*ido
]);
735 RE(t1
) = RE(cc
[ac
+i
]) - RE(cc
[ac
+i
+2*ido
]);
736 IM(t2
) = IM(cc
[ac
+i
]) + IM(cc
[ac
+i
+2*ido
]);
737 IM(t1
) = IM(cc
[ac
+i
]) - IM(cc
[ac
+i
+2*ido
]);
738 RE(t3
) = RE(cc
[ac
+i
+ido
]) + RE(cc
[ac
+i
+3*ido
]);
739 IM(t4
) = RE(cc
[ac
+i
+ido
]) - RE(cc
[ac
+i
+3*ido
]);
740 IM(t3
) = IM(cc
[ac
+i
+3*ido
]) + IM(cc
[ac
+i
+ido
]);
741 RE(t4
) = IM(cc
[ac
+i
+3*ido
]) - IM(cc
[ac
+i
+ido
]);
743 RE(c2
) = RE(t1
) - RE(t4
);
744 RE(c4
) = RE(t1
) + RE(t4
);
746 IM(c2
) = IM(t1
) - IM(t4
);
747 IM(c4
) = IM(t1
) + IM(t4
);
749 RE(ch
[ah
+i
]) = RE(t2
) + RE(t3
);
750 RE(c3
) = RE(t2
) - RE(t3
);
752 IM(ch
[ah
+i
]) = IM(t2
) + IM(t3
);
753 IM(c3
) = IM(t2
) - IM(t3
);
756 ComplexMult(&RE(ch
[ah
+i
+l1
*ido
]), &IM(ch
[ah
+i
+l1
*ido
]),
757 RE(c2
), IM(c2
), RE(wa1
[i
]), IM(wa1
[i
]));
758 ComplexMult(&RE(ch
[ah
+i
+2*l1
*ido
]), &IM(ch
[ah
+i
+2*l1
*ido
]),
759 RE(c3
), IM(c3
), RE(wa2
[i
]), IM(wa2
[i
]));
760 ComplexMult(&RE(ch
[ah
+i
+3*l1
*ido
]), &IM(ch
[ah
+i
+3*l1
*ido
]),
761 RE(c4
), IM(c4
), RE(wa3
[i
]), IM(wa3
[i
]));
763 ComplexMult(&IM(ch
[ah
+i
+l1
*ido
]), &RE(ch
[ah
+i
+l1
*ido
]),
764 IM(c2
), RE(c2
), RE(wa1
[i
]), IM(wa1
[i
]));
765 ComplexMult(&IM(ch
[ah
+i
+2*l1
*ido
]), &RE(ch
[ah
+i
+2*l1
*ido
]),
766 IM(c3
), RE(c3
), RE(wa2
[i
]), IM(wa2
[i
]));
767 ComplexMult(&IM(ch
[ah
+i
+3*l1
*ido
]), &RE(ch
[ah
+i
+3*l1
*ido
]),
768 IM(c4
), RE(c4
), RE(wa3
[i
]), IM(wa3
[i
]));
775 static void passf5(const uint16_t ido
, const uint16_t l1
, const complex_t
*cc
,
776 complex_t
*ch
, const complex_t
*wa1
, const complex_t
*wa2
, const complex_t
*wa3
,
777 const complex_t
*wa4
, const int8_t isign
)
779 static real_t tr11
= FRAC_CONST(0.309016994374947);
780 static real_t ti11
= FRAC_CONST(0.951056516295154);
781 static real_t tr12
= FRAC_CONST(-0.809016994374947);
782 static real_t ti12
= FRAC_CONST(0.587785252292473);
783 uint16_t i
, k
, ac
, ah
;
784 complex_t c2
, c3
, c4
, c5
, d3
, d4
, d5
, d2
, t2
, t3
, t4
, t5
;
790 for (k
= 0; k
< l1
; k
++)
795 RE(t2
) = RE(cc
[ac
]) + RE(cc
[ac
+3]);
796 IM(t2
) = IM(cc
[ac
]) + IM(cc
[ac
+3]);
797 RE(t3
) = RE(cc
[ac
+1]) + RE(cc
[ac
+2]);
798 IM(t3
) = IM(cc
[ac
+1]) + IM(cc
[ac
+2]);
799 RE(t4
) = RE(cc
[ac
+1]) - RE(cc
[ac
+2]);
800 IM(t4
) = IM(cc
[ac
+1]) - IM(cc
[ac
+2]);
801 RE(t5
) = RE(cc
[ac
]) - RE(cc
[ac
+3]);
802 IM(t5
) = IM(cc
[ac
]) - IM(cc
[ac
+3]);
804 RE(ch
[ah
]) = RE(cc
[ac
-1]) + RE(t2
) + RE(t3
);
805 IM(ch
[ah
]) = IM(cc
[ac
-1]) + IM(t2
) + IM(t3
);
807 RE(c2
) = RE(cc
[ac
-1]) + MUL_F(RE(t2
),tr11
) + MUL_F(RE(t3
),tr12
);
808 IM(c2
) = IM(cc
[ac
-1]) + MUL_F(IM(t2
),tr11
) + MUL_F(IM(t3
),tr12
);
809 RE(c3
) = RE(cc
[ac
-1]) + MUL_F(RE(t2
),tr12
) + MUL_F(RE(t3
),tr11
);
810 IM(c3
) = IM(cc
[ac
-1]) + MUL_F(IM(t2
),tr12
) + MUL_F(IM(t3
),tr11
);
812 ComplexMult(&RE(c5
), &RE(c4
),
813 ti11
, ti12
, RE(t5
), RE(t4
));
814 ComplexMult(&IM(c5
), &IM(c4
),
815 ti11
, ti12
, IM(t5
), IM(t4
));
817 RE(ch
[ah
+l1
]) = RE(c2
) - IM(c5
);
818 IM(ch
[ah
+l1
]) = IM(c2
) + RE(c5
);
819 RE(ch
[ah
+2*l1
]) = RE(c3
) - IM(c4
);
820 IM(ch
[ah
+2*l1
]) = IM(c3
) + RE(c4
);
821 RE(ch
[ah
+3*l1
]) = RE(c3
) + IM(c4
);
822 IM(ch
[ah
+3*l1
]) = IM(c3
) - RE(c4
);
823 RE(ch
[ah
+4*l1
]) = RE(c2
) + IM(c5
);
824 IM(ch
[ah
+4*l1
]) = IM(c2
) - RE(c5
);
827 for (k
= 0; k
< l1
; k
++)
832 RE(t2
) = RE(cc
[ac
]) + RE(cc
[ac
+3]);
833 IM(t2
) = IM(cc
[ac
]) + IM(cc
[ac
+3]);
834 RE(t3
) = RE(cc
[ac
+1]) + RE(cc
[ac
+2]);
835 IM(t3
) = IM(cc
[ac
+1]) + IM(cc
[ac
+2]);
836 RE(t4
) = RE(cc
[ac
+1]) - RE(cc
[ac
+2]);
837 IM(t4
) = IM(cc
[ac
+1]) - IM(cc
[ac
+2]);
838 RE(t5
) = RE(cc
[ac
]) - RE(cc
[ac
+3]);
839 IM(t5
) = IM(cc
[ac
]) - IM(cc
[ac
+3]);
841 RE(ch
[ah
]) = RE(cc
[ac
-1]) + RE(t2
) + RE(t3
);
842 IM(ch
[ah
]) = IM(cc
[ac
-1]) + IM(t2
) + IM(t3
);
844 RE(c2
) = RE(cc
[ac
-1]) + MUL_F(RE(t2
),tr11
) + MUL_F(RE(t3
),tr12
);
845 IM(c2
) = IM(cc
[ac
-1]) + MUL_F(IM(t2
),tr11
) + MUL_F(IM(t3
),tr12
);
846 RE(c3
) = RE(cc
[ac
-1]) + MUL_F(RE(t2
),tr12
) + MUL_F(RE(t3
),tr11
);
847 IM(c3
) = IM(cc
[ac
-1]) + MUL_F(IM(t2
),tr12
) + MUL_F(IM(t3
),tr11
);
849 ComplexMult(&RE(c4
), &RE(c5
),
850 ti12
, ti11
, RE(t5
), RE(t4
));
851 ComplexMult(&IM(c4
), &IM(c5
),
852 ti12
, ti12
, IM(t5
), IM(t4
));
854 RE(ch
[ah
+l1
]) = RE(c2
) + IM(c5
);
855 IM(ch
[ah
+l1
]) = IM(c2
) - RE(c5
);
856 RE(ch
[ah
+2*l1
]) = RE(c3
) + IM(c4
);
857 IM(ch
[ah
+2*l1
]) = IM(c3
) - RE(c4
);
858 RE(ch
[ah
+3*l1
]) = RE(c3
) - IM(c4
);
859 IM(ch
[ah
+3*l1
]) = IM(c3
) + RE(c4
);
860 RE(ch
[ah
+4*l1
]) = RE(c2
) - IM(c5
);
861 IM(ch
[ah
+4*l1
]) = IM(c2
) + RE(c5
);
867 for (k
= 0; k
< l1
; k
++)
869 for (i
= 0; i
< ido
; i
++)
871 ac
= i
+ (k
*5 + 1) * ido
;
874 RE(t2
) = RE(cc
[ac
]) + RE(cc
[ac
+3*ido
]);
875 IM(t2
) = IM(cc
[ac
]) + IM(cc
[ac
+3*ido
]);
876 RE(t3
) = RE(cc
[ac
+ido
]) + RE(cc
[ac
+2*ido
]);
877 IM(t3
) = IM(cc
[ac
+ido
]) + IM(cc
[ac
+2*ido
]);
878 RE(t4
) = RE(cc
[ac
+ido
]) - RE(cc
[ac
+2*ido
]);
879 IM(t4
) = IM(cc
[ac
+ido
]) - IM(cc
[ac
+2*ido
]);
880 RE(t5
) = RE(cc
[ac
]) - RE(cc
[ac
+3*ido
]);
881 IM(t5
) = IM(cc
[ac
]) - IM(cc
[ac
+3*ido
]);
883 RE(ch
[ah
]) = RE(cc
[ac
-ido
]) + RE(t2
) + RE(t3
);
884 IM(ch
[ah
]) = IM(cc
[ac
-ido
]) + IM(t2
) + IM(t3
);
886 RE(c2
) = RE(cc
[ac
-ido
]) + MUL_F(RE(t2
),tr11
) + MUL_F(RE(t3
),tr12
);
887 IM(c2
) = IM(cc
[ac
-ido
]) + MUL_F(IM(t2
),tr11
) + MUL_F(IM(t3
),tr12
);
888 RE(c3
) = RE(cc
[ac
-ido
]) + MUL_F(RE(t2
),tr12
) + MUL_F(RE(t3
),tr11
);
889 IM(c3
) = IM(cc
[ac
-ido
]) + MUL_F(IM(t2
),tr12
) + MUL_F(IM(t3
),tr11
);
891 ComplexMult(&RE(c5
), &RE(c4
),
892 ti11
, ti12
, RE(t5
), RE(t4
));
893 ComplexMult(&IM(c5
), &IM(c4
),
894 ti11
, ti12
, IM(t5
), IM(t4
));
896 IM(d2
) = IM(c2
) + RE(c5
);
897 IM(d3
) = IM(c3
) + RE(c4
);
898 RE(d4
) = RE(c3
) + IM(c4
);
899 RE(d5
) = RE(c2
) + IM(c5
);
900 RE(d2
) = RE(c2
) - IM(c5
);
901 IM(d5
) = IM(c2
) - RE(c5
);
902 RE(d3
) = RE(c3
) - IM(c4
);
903 IM(d4
) = IM(c3
) - RE(c4
);
906 ComplexMult(&IM(ch
[ah
+l1
*ido
]), &RE(ch
[ah
+l1
*ido
]),
907 IM(d2
), RE(d2
), RE(wa1
[i
]), IM(wa1
[i
]));
908 ComplexMult(&IM(ch
[ah
+2*l1
*ido
]), &RE(ch
[ah
+2*l1
*ido
]),
909 IM(d3
), RE(d3
), RE(wa2
[i
]), IM(wa2
[i
]));
910 ComplexMult(&IM(ch
[ah
+3*l1
*ido
]), &RE(ch
[ah
+3*l1
*ido
]),
911 IM(d4
), RE(d4
), RE(wa3
[i
]), IM(wa3
[i
]));
912 ComplexMult(&IM(ch
[ah
+4*l1
*ido
]), &RE(ch
[ah
+4*l1
*ido
]),
913 IM(d5
), RE(d5
), RE(wa4
[i
]), IM(wa4
[i
]));
915 ComplexMult(&RE(ch
[ah
+l1
*ido
]), &IM(ch
[ah
+l1
*ido
]),
916 RE(d2
), IM(d2
), RE(wa1
[i
]), IM(wa1
[i
]));
917 ComplexMult(&RE(ch
[ah
+2*l1
*ido
]), &IM(ch
[ah
+2*l1
*ido
]),
918 RE(d3
), IM(d3
), RE(wa2
[i
]), IM(wa2
[i
]));
919 ComplexMult(&RE(ch
[ah
+3*l1
*ido
]), &IM(ch
[ah
+3*l1
*ido
]),
920 RE(d4
), IM(d4
), RE(wa3
[i
]), IM(wa3
[i
]));
921 ComplexMult(&RE(ch
[ah
+4*l1
*ido
]), &IM(ch
[ah
+4*l1
*ido
]),
922 RE(d5
), IM(d5
), RE(wa4
[i
]), IM(wa4
[i
]));
927 for (k
= 0; k
< l1
; k
++)
929 for (i
= 0; i
< ido
; i
++)
931 ac
= i
+ (k
*5 + 1) * ido
;
934 RE(t2
) = RE(cc
[ac
]) + RE(cc
[ac
+3*ido
]);
935 IM(t2
) = IM(cc
[ac
]) + IM(cc
[ac
+3*ido
]);
936 RE(t3
) = RE(cc
[ac
+ido
]) + RE(cc
[ac
+2*ido
]);
937 IM(t3
) = IM(cc
[ac
+ido
]) + IM(cc
[ac
+2*ido
]);
938 RE(t4
) = RE(cc
[ac
+ido
]) - RE(cc
[ac
+2*ido
]);
939 IM(t4
) = IM(cc
[ac
+ido
]) - IM(cc
[ac
+2*ido
]);
940 RE(t5
) = RE(cc
[ac
]) - RE(cc
[ac
+3*ido
]);
941 IM(t5
) = IM(cc
[ac
]) - IM(cc
[ac
+3*ido
]);
943 RE(ch
[ah
]) = RE(cc
[ac
-ido
]) + RE(t2
) + RE(t3
);
944 IM(ch
[ah
]) = IM(cc
[ac
-ido
]) + IM(t2
) + IM(t3
);
946 RE(c2
) = RE(cc
[ac
-ido
]) + MUL_F(RE(t2
),tr11
) + MUL_F(RE(t3
),tr12
);
947 IM(c2
) = IM(cc
[ac
-ido
]) + MUL_F(IM(t2
),tr11
) + MUL_F(IM(t3
),tr12
);
948 RE(c3
) = RE(cc
[ac
-ido
]) + MUL_F(RE(t2
),tr12
) + MUL_F(RE(t3
),tr11
);
949 IM(c3
) = IM(cc
[ac
-ido
]) + MUL_F(IM(t2
),tr12
) + MUL_F(IM(t3
),tr11
);
951 ComplexMult(&RE(c4
), &RE(c5
),
952 ti12
, ti11
, RE(t5
), RE(t4
));
953 ComplexMult(&IM(c4
), &IM(c5
),
954 ti12
, ti12
, IM(t5
), IM(t4
));
956 IM(d2
) = IM(c2
) - RE(c5
);
957 IM(d3
) = IM(c3
) - RE(c4
);
958 RE(d4
) = RE(c3
) - IM(c4
);
959 RE(d5
) = RE(c2
) - IM(c5
);
960 RE(d2
) = RE(c2
) + IM(c5
);
961 IM(d5
) = IM(c2
) + RE(c5
);
962 RE(d3
) = RE(c3
) + IM(c4
);
963 IM(d4
) = IM(c3
) + RE(c4
);
966 ComplexMult(&RE(ch
[ah
+l1
*ido
]), &IM(ch
[ah
+l1
*ido
]),
967 RE(d2
), IM(d2
), RE(wa1
[i
]), IM(wa1
[i
]));
968 ComplexMult(&RE(ch
[ah
+2*l1
*ido
]), &IM(ch
[ah
+2*l1
*ido
]),
969 RE(d3
), IM(d3
), RE(wa2
[i
]), IM(wa2
[i
]));
970 ComplexMult(&RE(ch
[ah
+3*l1
*ido
]), &IM(ch
[ah
+3*l1
*ido
]),
971 RE(d4
), IM(d4
), RE(wa3
[i
]), IM(wa3
[i
]));
972 ComplexMult(&RE(ch
[ah
+4*l1
*ido
]), &IM(ch
[ah
+4*l1
*ido
]),
973 RE(d5
), IM(d5
), RE(wa4
[i
]), IM(wa4
[i
]));
975 ComplexMult(&IM(ch
[ah
+l1
*ido
]), &RE(ch
[ah
+l1
*ido
]),
976 IM(d2
), RE(d2
), RE(wa1
[i
]), IM(wa1
[i
]));
977 ComplexMult(&IM(ch
[ah
+2*l1
*ido
]), &RE(ch
[ah
+2*l1
*ido
]),
978 IM(d3
), RE(d3
), RE(wa2
[i
]), IM(wa2
[i
]));
979 ComplexMult(&IM(ch
[ah
+3*l1
*ido
]), &RE(ch
[ah
+3*l1
*ido
]),
980 IM(d4
), RE(d4
), RE(wa3
[i
]), IM(wa3
[i
]));
981 ComplexMult(&IM(ch
[ah
+4*l1
*ido
]), &RE(ch
[ah
+4*l1
*ido
]),
982 IM(d5
), RE(d5
), RE(wa4
[i
]), IM(wa4
[i
]));
991 /*----------------------------------------------------------------------
992 cfftf1, cfftf, cfftb, cffti1, cffti. Complex FFTs.
993 ----------------------------------------------------------------------*/
997 #define CONV(A,B,C) ( (A<<2) | ((B & 0x1)<<1) | ((C==1)&0x1) )
999 static INLINE
void cfftf1pos_sse(uint16_t n
, complex_t
*c
, complex_t
*ch
,
1000 const uint16_t *ifac
, const complex_t
*wa
,
1004 uint16_t k1
, l1
, l2
;
1005 uint16_t na
, nf
, ip
, iw
, ix2
, ix3
, ix4
, ido
, idl1
;
1012 for (k1
= 2; k1
<= nf
+1; k1
++)
1023 switch (CONV(ip
,na
,ido
))
1026 //passf4pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
1027 passf4pos((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)c
, ch
, &wa
[iw
], &wa
[ix2
], &wa
[ix3
]);
1030 passf4pos_sse((const uint16_t)l1
, (const complex_t
*)c
, ch
, &wa
[iw
], &wa
[ix2
], &wa
[ix3
]);
1033 passf4pos((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)ch
, c
, &wa
[iw
], &wa
[ix2
], &wa
[ix3
]);
1034 //passf4pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
1037 passf4pos_sse((const uint16_t)l1
, (const complex_t
*)ch
, c
, &wa
[iw
], &wa
[ix2
], &wa
[ix3
]);
1040 passf2pos((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)c
, ch
, &wa
[iw
]);
1041 //passf2pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
1044 passf2pos((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)c
, ch
, &wa
[iw
]);
1045 //passf2pos_sse((const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
1048 passf2pos((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)ch
, c
, &wa
[iw
]);
1049 //passf2pos_sse_ido((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
1052 passf2pos((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)ch
, c
, &wa
[iw
]);
1053 //passf2pos_sse((const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
1057 passf3((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)c
, ch
, &wa
[iw
], &wa
[ix2
], isign
);
1061 passf3((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)ch
, c
, &wa
[iw
], &wa
[ix2
], isign
);
1065 passf5((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)c
, ch
, &wa
[iw
], &wa
[ix2
], &wa
[ix3
], &wa
[ix4
], isign
);
1069 passf5((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)ch
, c
, &wa
[iw
], &wa
[ix2
], &wa
[ix3
], &wa
[ix4
], isign
);
1082 for (i
= 0; i
< n
; i
++)
1084 RE(c
[i
]) = RE(ch
[i
]);
1085 IM(c
[i
]) = IM(ch
[i
]);
1090 static INLINE
void cfftf1pos(uint16_t n
, complex_t
*c
, complex_t
*ch
,
1091 const uint16_t *ifac
, const complex_t
*wa
,
1095 uint16_t k1
, l1
, l2
;
1096 uint16_t na
, nf
, ip
, iw
, ix2
, ix3
, ix4
, ido
, idl1
;
1103 for (k1
= 2; k1
<= nf
+1; k1
++)
1117 passf4pos((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)c
, ch
, &wa
[iw
], &wa
[ix2
], &wa
[ix3
]);
1119 passf4pos((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)ch
, c
, &wa
[iw
], &wa
[ix2
], &wa
[ix3
]);
1125 passf2pos((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)c
, ch
, &wa
[iw
]);
1127 passf2pos((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)ch
, c
, &wa
[iw
]);
1135 passf3((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)c
, ch
, &wa
[iw
], &wa
[ix2
], isign
);
1137 passf3((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)ch
, c
, &wa
[iw
], &wa
[ix2
], isign
);
1147 passf5((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)c
, ch
, &wa
[iw
], &wa
[ix2
], &wa
[ix3
], &wa
[ix4
], isign
);
1149 passf5((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)ch
, c
, &wa
[iw
], &wa
[ix2
], &wa
[ix3
], &wa
[ix4
], isign
);
1162 for (i
= 0; i
< n
; i
++)
1164 RE(c
[i
]) = RE(ch
[i
]);
1165 IM(c
[i
]) = IM(ch
[i
]);
1169 static INLINE
void cfftf1neg(uint16_t n
, complex_t
*c
, complex_t
*ch
,
1170 const uint16_t *ifac
, const complex_t
*wa
,
1174 uint16_t k1
, l1
, l2
;
1175 uint16_t na
, nf
, ip
, iw
, ix2
, ix3
, ix4
, ido
, idl1
;
1182 for (k1
= 2; k1
<= nf
+1; k1
++)
1196 passf4neg((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)c
, ch
, &wa
[iw
], &wa
[ix2
], &wa
[ix3
]);
1198 passf4neg((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)ch
, c
, &wa
[iw
], &wa
[ix2
], &wa
[ix3
]);
1204 passf2neg((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)c
, ch
, &wa
[iw
]);
1206 passf2neg((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)ch
, c
, &wa
[iw
]);
1214 passf3((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)c
, ch
, &wa
[iw
], &wa
[ix2
], isign
);
1216 passf3((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)ch
, c
, &wa
[iw
], &wa
[ix2
], isign
);
1226 passf5((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)c
, ch
, &wa
[iw
], &wa
[ix2
], &wa
[ix3
], &wa
[ix4
], isign
);
1228 passf5((const uint16_t)ido
, (const uint16_t)l1
, (const complex_t
*)ch
, c
, &wa
[iw
], &wa
[ix2
], &wa
[ix3
], &wa
[ix4
], isign
);
1241 for (i
= 0; i
< n
; i
++)
1243 RE(c
[i
]) = RE(ch
[i
]);
1244 IM(c
[i
]) = IM(ch
[i
]);
1248 void cfftf(cfft_info
*cfft
, complex_t
*c
)
1250 cfftf1neg(cfft
->n
, c
, cfft
->work
, (const uint16_t*)cfft
->ifac
, (const complex_t
*)cfft
->tab
, -1);
1253 void cfftb(cfft_info
*cfft
, complex_t
*c
)
1255 cfftf1pos(cfft
->n
, c
, cfft
->work
, (const uint16_t*)cfft
->ifac
, (const complex_t
*)cfft
->tab
, +1);
1259 void cfftb_sse(cfft_info
*cfft
, complex_t
*c
)
1261 cfftf1pos_sse(cfft
->n
, c
, cfft
->work
, (const uint16_t*)cfft
->ifac
, (const complex_t
*)cfft
->tab
, +1);
1265 static void cffti1(uint16_t n
, complex_t
*wa
, uint16_t *ifac
)
1267 static uint16_t ntryh
[4] = {3, 4, 2, 5};
1269 real_t arg
, argh
, argld
, fi
;
1271 uint16_t i1
, k1
, l1
, l2
;
1272 uint16_t ld
, ii
, ip
;
1274 uint16_t ntry
= 0, i
, j
;
1276 uint16_t nf
, nl
, nq
, nr
;
1302 if (ntry
== 2 && nf
!= 1)
1304 for (i
= 2; i
<= nf
; i
++)
1307 ifac
[ib
+1] = ifac
[ib
];
1317 argh
= (real_t
)2.0*(real_t
)M_PI
/ (real_t
)n
;
1321 for (k1
= 1; k1
<= nf
; k1
++)
1329 for (j
= 0; j
< ipm
; j
++)
1338 for (ii
= 0; ii
< ido
; ii
++)
1343 RE(wa
[i
]) = (real_t
)cos(arg
);
1345 IM(wa
[i
]) = (real_t
)sin(arg
);
1347 IM(wa
[i
]) = (real_t
)-sin(arg
);
1353 RE(wa
[i1
]) = RE(wa
[i
]);
1354 IM(wa
[i1
]) = IM(wa
[i
]);
1362 cfft_info
*cffti(uint16_t n
)
1364 cfft_info
*cfft
= (cfft_info
*)faad_malloc(sizeof(cfft_info
));
1367 cfft
->work
= (complex_t
*)faad_malloc(n
*sizeof(complex_t
));
1370 cfft
->tab
= (complex_t
*)faad_malloc(n
*sizeof(complex_t
));
1372 cffti1(n
, cfft
->tab
, cfft
->ifac
);
1374 cffti1(n
, NULL
, cfft
->ifac
);
1378 case 64: cfft
->tab
= (complex_t
*)cfft_tab_64
; break;
1379 case 512: cfft
->tab
= (complex_t
*)cfft_tab_512
; break;
1381 case 256: cfft
->tab
= (complex_t
*)cfft_tab_256
; break;
1384 #ifdef ALLOW_SMALL_FRAMELENGTH
1385 case 60: cfft
->tab
= (complex_t
*)cfft_tab_60
; break;
1386 case 480: cfft
->tab
= (complex_t
*)cfft_tab_480
; break;
1388 case 240: cfft
->tab
= (complex_t
*)cfft_tab_240
; break;
1397 void cfftu(cfft_info
*cfft
)
1399 if (cfft
->work
) faad_free(cfft
->work
);
1401 if (cfft
->tab
) faad_free(cfft
->tab
);
1404 if (cfft
) faad_free(cfft
);