adding the code documentation guide lines
[mplayer/glamo.git] / liba52 / imdct.c
blobd5a3eb6b6a44def0a20498f2ea344b26dcdb9595
1 /*
2 * imdct.c
3 * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
6 * This file is part of a52dec, a free ATSC A-52 stream decoder.
7 * See http://liba52.sourceforge.net/ for updates.
9 * a52dec is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * a52dec is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
24 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
25 * michael did port them from libac3 (untested, perhaps totally broken)
26 * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
29 #include "config.h"
31 #include <math.h>
32 #include <stdio.h>
33 #ifndef M_PI
34 #define M_PI 3.1415926535897932384626433832795029
35 #endif
36 #include <inttypes.h>
38 #include "a52.h"
39 #include "a52_internal.h"
40 #include "mm_accel.h"
41 #include "mangle.h"
43 #ifdef RUNTIME_CPUDETECT
44 #undef HAVE_3DNOWEX
45 #endif
47 #define USE_AC3_C
/* Dispatch pointers to the active 256- and 512-sample IMDCT implementations
 * (C / SSE / 3DNow! / AltiVec variants exist in this file).
 * NOTE(review): the assignment happens in an init routine not visible in this
 * chunk — confirm against imdct_init(). */
49 void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias);
50 void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias);
/* Complex value in rectangular form; sample_t comes from a52.h. */
52 typedef struct complex_s {
53     sample_t real;
54     sample_t imag;
55 } complex_t;
/* Forward declaration: 128-point FFT used by the USE_AC3_C path of
 * imdct_do_512() below. */
57 static void fft_128p(complex_t *a);
/* Input permutation for fft_128p(): reordering applied during the pre-IFFT
 * complex multiply when USE_AC3_C is defined (see imdct_do_512). */
59 static const int pm128[128] attribute_used __attribute__((aligned(16))) =
61 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
62 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
63 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
64 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
65 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
66 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
67 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
68 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
69 };
71 /* 128 point bit-reverse LUT */
/* Used by the generic (non-USE_AC3_C) reorder loop and referenced by name
 * from the SSE shuffle asm in imdct_do_512_sse(). */
72 static uint8_t attribute_used bit_reverse_512[] = {
73 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
74 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
75 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
76 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
77 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
78 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
79 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
80 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
81 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
82 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
83 0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
84 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
85 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
86 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
87 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
88 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
/* 64-entry bit-reverse LUT; presumably used by the 256-sample transform
 * (imdct_do_256 is not visible in this chunk) — TODO confirm. */
90 static uint8_t bit_reverse_256[] = {
91 0x00, 0x20, 0x10, 0x30, 0x08, 0x28, 0x18, 0x38,
92 0x04, 0x24, 0x14, 0x34, 0x0c, 0x2c, 0x1c, 0x3c,
93 0x02, 0x22, 0x12, 0x32, 0x0a, 0x2a, 0x1a, 0x3a,
94 0x06, 0x26, 0x16, 0x36, 0x0e, 0x2e, 0x1e, 0x3e,
95 0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39,
96 0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d,
97 0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b,
98 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
100 #ifdef ARCH_X86
101 // NOTE: SSE needs 16byte alignment or it will segfault
/* buf: shared 128-point complex scratch for all IMDCT variants.
 * The sse* tables are consumed by the inline asm in imdct_do_512_sse();
 * NOTE(review): their contents appear to be filled by an init routine not
 * visible in this chunk — confirm before relying on the values. */
103 static complex_t __attribute__((aligned(16))) buf[128];
104 static float __attribute__((aligned(16))) sseSinCos1c[256];
105 static float __attribute__((aligned(16))) sseSinCos1d[256];
106 static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
107 //static float __attribute__((aligned(16))) sseW0[4];
/* Per-stage SSE twiddle tables; sseW[m] is indexed by FFT stage m in the
 * "4-7. iterations" asm loop of imdct_do_512_sse(). */
108 static float __attribute__((aligned(16))) sseW1[8];
109 static float __attribute__((aligned(16))) sseW2[16];
110 static float __attribute__((aligned(16))) sseW3[32];
111 static float __attribute__((aligned(16))) sseW4[64];
112 static float __attribute__((aligned(16))) sseW5[128];
113 static float __attribute__((aligned(16))) sseW6[256];
114 static float __attribute__((aligned(16))) *sseW[7]=
115 {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
116 static float __attribute__((aligned(16))) sseWindow[512];
117 #else
118 static complex_t __attribute__((aligned(16))) buf[128];
119 #endif
121 /* Twiddle factor LUT */
/* w[m] holds the 2^m twiddles for FFT stage m; values presumably computed at
 * init (not visible here) — TODO confirm. */
122 static complex_t __attribute__((aligned(16))) w_1[1];
123 static complex_t __attribute__((aligned(16))) w_2[2];
124 static complex_t __attribute__((aligned(16))) w_4[4];
125 static complex_t __attribute__((aligned(16))) w_8[8];
126 static complex_t __attribute__((aligned(16))) w_16[16];
127 static complex_t __attribute__((aligned(16))) w_32[32];
128 static complex_t __attribute__((aligned(16))) w_64[64];
129 static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
131 /* Twiddle factors for IMDCT */
/* xcos1/xsin1 (128 pts) drive the 512-sample transform's pre/post multiplies;
 * xcos2/xsin2 (64 pts) presumably serve the 256-sample transform — confirm. */
132 static sample_t __attribute__((aligned(16))) xcos1[128];
133 static sample_t __attribute__((aligned(16))) xsin1[128];
134 static sample_t __attribute__((aligned(16))) xcos2[64];
135 static sample_t __attribute__((aligned(16))) xsin2[64];
137 /* Windowing function for Modified DCT - Thank you acroread */
/* 256-entry half window: read forwards over the first 256 output samples in
 * imdct_do_512(), then walked backwards (via *--window_ptr) to window the
 * trailing edge into the delay line. Values rise monotonically 0..1. */
138 sample_t imdct_window[] = {
139 0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
140 0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
141 0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
142 0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
143 0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
144 0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
145 0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
146 0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
147 0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
148 0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
149 0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
150 0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
151 0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
152 0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
153 0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
154 0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
155 0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
156 0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
157 0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
158 0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
159 0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
160 0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
161 0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
162 0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
163 0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
164 0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
165 0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
166 0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
167 0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
168 0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
169 0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
170 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 };
173 static inline void swap_cmplx(complex_t *a, complex_t *b)
175 complex_t tmp;
177 tmp = *a;
178 *a = *b;
179 *b = tmp;
184 static inline complex_t cmplx_mult(complex_t a, complex_t b)
186 complex_t ret;
188 ret.real = a.real * b.real - a.imag * b.imag;
189 ret.imag = a.real * b.imag + a.imag * b.real;
191 return ret;
/* Plain-C 512-sample IMDCT.
 * data[]  : in/out — 256 frequency coefficients on entry, 256 windowed time
 *           samples on exit.
 * delay[] : 256-sample overlap carried from the previous block; refilled with
 *           this block's trailing window edge before returning.
 * bias    : constant added to every output sample.
 * Uses the module-level buf[], xcos1/xsin1, w[] and imdct_window tables.
 * NOTE(review): this chunk was scraped from a web viewer and brace-only
 * lines were lost; token content below is otherwise verbatim. */
194 void
195 imdct_do_512(sample_t data[],sample_t delay[], sample_t bias)
197 int i;
198 #ifndef USE_AC3_C
199 int k;
200 int p,q;
201 int m;
202 int two_m;
203 int two_m_plus_one;
205 sample_t tmp_b_i;
206 sample_t tmp_b_r;
207 #endif
208 sample_t tmp_a_i;
209 sample_t tmp_a_r;
211 sample_t *data_ptr;
212 sample_t *delay_ptr;
213 sample_t *window_ptr;
215 /* 512 IMDCT with source and dest data in 'data' */
217 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
218 for( i=0; i < 128; i++) {
219 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
220 #ifdef USE_AC3_C
221 int j= pm128[i];
222 #else
223 int j= bit_reverse_512[i];
224 #endif
225 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
226 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
229 /* FFT Merge */
230 /* unoptimized variant
231 for (m=1; m < 7; m++) {
232 if(m)
233 two_m = (1 << m);
234 else
235 two_m = 1;
237 two_m_plus_one = (1 << (m+1));
239 for(i = 0; i < 128; i += two_m_plus_one) {
240 for(k = 0; k < two_m; k++) {
241 p = k + i;
242 q = p + two_m;
243 tmp_a_r = buf[p].real;
244 tmp_a_i = buf[p].imag;
245 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
246 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
247 buf[p].real = tmp_a_r + tmp_b_r;
248 buf[p].imag = tmp_a_i + tmp_b_i;
249 buf[q].real = tmp_a_r - tmp_b_r;
250 buf[q].imag = tmp_a_i - tmp_b_i;
255 #ifdef USE_AC3_C
256 fft_128p (&buf[0]);
257 #else
259 /* 1. iteration */
260 for(i = 0; i < 128; i += 2) {
261 tmp_a_r = buf[i].real;
262 tmp_a_i = buf[i].imag;
263 tmp_b_r = buf[i+1].real;
264 tmp_b_i = buf[i+1].imag;
265 buf[i].real = tmp_a_r + tmp_b_r;
266 buf[i].imag = tmp_a_i + tmp_b_i;
267 buf[i+1].real = tmp_a_r - tmp_b_r;
268 buf[i+1].imag = tmp_a_i - tmp_b_i;
271 /* 2. iteration */
272 // Note w[1]={{1,0}, {0,-1}}
273 for(i = 0; i < 128; i += 4) {
274 tmp_a_r = buf[i].real;
275 tmp_a_i = buf[i].imag;
276 tmp_b_r = buf[i+2].real;
277 tmp_b_i = buf[i+2].imag;
278 buf[i].real = tmp_a_r + tmp_b_r;
279 buf[i].imag = tmp_a_i + tmp_b_i;
280 buf[i+2].real = tmp_a_r - tmp_b_r;
281 buf[i+2].imag = tmp_a_i - tmp_b_i;
282 tmp_a_r = buf[i+1].real;
283 tmp_a_i = buf[i+1].imag;
284 tmp_b_r = buf[i+3].imag;
285 tmp_b_i = buf[i+3].real;
286 buf[i+1].real = tmp_a_r + tmp_b_r;
287 buf[i+1].imag = tmp_a_i - tmp_b_i;
288 buf[i+3].real = tmp_a_r - tmp_b_r;
289 buf[i+3].imag = tmp_a_i + tmp_b_i;
292 /* 3. iteration */
293 for(i = 0; i < 128; i += 8) {
294 tmp_a_r = buf[i].real;
295 tmp_a_i = buf[i].imag;
296 tmp_b_r = buf[i+4].real;
297 tmp_b_i = buf[i+4].imag;
298 buf[i].real = tmp_a_r + tmp_b_r;
299 buf[i].imag = tmp_a_i + tmp_b_i;
300 buf[i+4].real = tmp_a_r - tmp_b_r;
301 buf[i+4].imag = tmp_a_i - tmp_b_i;
302 tmp_a_r = buf[1+i].real;
303 tmp_a_i = buf[1+i].imag;
304 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
305 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
306 buf[1+i].real = tmp_a_r + tmp_b_r;
307 buf[1+i].imag = tmp_a_i + tmp_b_i;
308 buf[i+5].real = tmp_a_r - tmp_b_r;
309 buf[i+5].imag = tmp_a_i - tmp_b_i;
310 tmp_a_r = buf[i+2].real;
311 tmp_a_i = buf[i+2].imag;
312 tmp_b_r = buf[i+6].imag;
313 tmp_b_i = - buf[i+6].real;
314 buf[i+2].real = tmp_a_r + tmp_b_r;
315 buf[i+2].imag = tmp_a_i + tmp_b_i;
316 buf[i+6].real = tmp_a_r - tmp_b_r;
317 buf[i+6].imag = tmp_a_i - tmp_b_i;
318 tmp_a_r = buf[i+3].real;
319 tmp_a_i = buf[i+3].imag;
320 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
321 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
322 buf[i+3].real = tmp_a_r + tmp_b_r;
323 buf[i+3].imag = tmp_a_i + tmp_b_i;
324 buf[i+7].real = tmp_a_r - tmp_b_r;
325 buf[i+7].imag = tmp_a_i - tmp_b_i;
328 /* 4-7. iterations */
329 for (m=3; m < 7; m++) {
330 two_m = (1 << m);
332 two_m_plus_one = two_m<<1;
334 for(i = 0; i < 128; i += two_m_plus_one) {
335 for(k = 0; k < two_m; k++) {
336 int p = k + i;
337 int q = p + two_m;
338 tmp_a_r = buf[p].real;
339 tmp_a_i = buf[p].imag;
340 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
341 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
342 buf[p].real = tmp_a_r + tmp_b_r;
343 buf[p].imag = tmp_a_i + tmp_b_i;
344 buf[q].real = tmp_a_r - tmp_b_r;
345 buf[q].imag = tmp_a_i - tmp_b_i;
349 #endif
350 /* Post IFFT complex multiply plus IFFT complex conjugate*/
/* Conjugation is folded in by negating the imaginary part before the
 * twiddle multiply. */
351 for( i=0; i < 128; i++) {
352 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
353 tmp_a_r = buf[i].real;
354 tmp_a_i = -1.0 * buf[i].imag;
355 buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]);
356 buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]);
359 data_ptr = data;
360 delay_ptr = delay;
361 window_ptr = imdct_window;
363 /* Window and convert to real valued signal */
/* First 256 outputs: rising half-window, previous block's delay added in. */
364 for(i=0; i< 64; i++) {
365 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
366 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
369 for(i=0; i< 64; i++) {
370 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
371 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
374 /* The trailing edge of the window goes into the delay line */
/* window_ptr now points one past the table end; it is walked backwards so
 * the delay line gets the mirrored (falling) half-window. */
375 delay_ptr = delay;
377 for(i=0; i< 64; i++) {
378 *delay_ptr++ = -buf[64+i].real * *--window_ptr;
379 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
382 for(i=0; i<64; i++) {
383 *delay_ptr++ = buf[i].imag * *--window_ptr;
384 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
388 #ifdef HAVE_ALTIVEC
390 #ifndef SYS_DARWIN
391 #include <altivec.h>
392 #endif
394 // used to build registers permutation vectors (vcprm)
395 // the 's' are for words in the _s_econd vector
396 #define WORD_0 0x00,0x01,0x02,0x03
397 #define WORD_1 0x04,0x05,0x06,0x07
398 #define WORD_2 0x08,0x09,0x0a,0x0b
399 #define WORD_3 0x0c,0x0d,0x0e,0x0f
400 #define WORD_s0 0x10,0x11,0x12,0x13
401 #define WORD_s1 0x14,0x15,0x16,0x17
402 #define WORD_s2 0x18,0x19,0x1a,0x1b
403 #define WORD_s3 0x1c,0x1d,0x1e,0x1f
/* Darwin's compiler uses parenthesized vector literals; everywhere else the
 * braced GCC syntax is required — hence the two definitions. */
405 #ifdef SYS_DARWIN
406 #define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
407 #else
408 #define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
409 #endif
411 // vcprmle is used to keep the same index as in the SSE version.
412 // it's the same as vcprm, with the index inversed
413 // ('le' is Little Endian)
414 #define vcprmle(a,b,c,d) vcprm(d,c,b,a)
416 // used to build inverse/identity vectors (vcii)
417 // n is _n_egative, p is _p_ositive
418 #define FLOAT_n -1.
419 #define FLOAT_p 1.
421 #ifdef SYS_DARWIN
422 #define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
423 #else
424 #define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
425 #endif
/* FOUROF(a): splat a scalar into a 4-element vector literal, again with the
 * Darwin vs. GCC literal-syntax split. */
427 #ifdef SYS_DARWIN
428 #define FOUROF(a) (a)
429 #else
430 #define FOUROF(a) {a,a,a,a}
431 #endif
/* AltiVec-accelerated 512-sample IMDCT; same contract as imdct_do_512():
 * data[] in/out, delay[] 256-sample overlap, bias added to each output.
 * Each FFT stage keeps the scalar reference code in an "#if 0" block and the
 * vectorized equivalent in the "#else" branch; the scalar tmp_* variables are
 * still live because stage 3 mixes scalar twiddle math with vector loads.
 * NOTE(review): brace-only lines were lost in this scraped copy; token
 * content is otherwise verbatim. */
434 void
435 imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
437 int i;
438 int k;
439 int p,q;
440 int m;
441 int two_m;
442 int two_m_plus_one;
444 sample_t tmp_b_i;
445 sample_t tmp_b_r;
446 sample_t tmp_a_i;
447 sample_t tmp_a_r;
449 sample_t *data_ptr;
450 sample_t *delay_ptr;
451 sample_t *window_ptr;
453 /* 512 IMDCT with source and dest data in 'data' */
455 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
456 for( i=0; i < 128; i++) {
457 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
458 int j= bit_reverse_512[i];
459 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
460 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
463 /* 1. iteration */
464 for(i = 0; i < 128; i += 2) {
465 #if 0
466 tmp_a_r = buf[i].real;
467 tmp_a_i = buf[i].imag;
468 tmp_b_r = buf[i+1].real;
469 tmp_b_i = buf[i+1].imag;
470 buf[i].real = tmp_a_r + tmp_b_r;
471 buf[i].imag = tmp_a_i + tmp_b_i;
472 buf[i+1].real = tmp_a_r - tmp_b_r;
473 buf[i+1].imag = tmp_a_i - tmp_b_i;
474 #else
/* One vector holds both complex points (i<<3 = byte offset of buf[i]);
 * the permute swaps the two points so one madd does sum and difference. */
475 vector float temp, bufv;
477 bufv = vec_ld(i << 3, (float*)buf);
478 temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
479 bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
480 vec_st(bufv, i << 3, (float*)buf);
481 #endif
484 /* 2. iteration */
485 // Note w[1]={{1,0}, {0,-1}}
486 for(i = 0; i < 128; i += 4) {
487 #if 0
488 tmp_a_r = buf[i].real;
489 tmp_a_i = buf[i].imag;
490 tmp_b_r = buf[i+2].real;
491 tmp_b_i = buf[i+2].imag;
492 buf[i].real = tmp_a_r + tmp_b_r;
493 buf[i].imag = tmp_a_i + tmp_b_i;
494 buf[i+2].real = tmp_a_r - tmp_b_r;
495 buf[i+2].imag = tmp_a_i - tmp_b_i;
496 tmp_a_r = buf[i+1].real;
497 tmp_a_i = buf[i+1].imag;
498 /* WARNING: im <-> re here ! */
499 tmp_b_r = buf[i+3].imag;
500 tmp_b_i = buf[i+3].real;
501 buf[i+1].real = tmp_a_r + tmp_b_r;
502 buf[i+1].imag = tmp_a_i - tmp_b_i;
503 buf[i+3].real = tmp_a_r - tmp_b_r;
504 buf[i+3].imag = tmp_a_i + tmp_b_i;
505 #else
506 vector float buf01, buf23, temp1, temp2;
508 buf01 = vec_ld((i + 0) << 3, (float*)buf);
509 buf23 = vec_ld((i + 2) << 3, (float*)buf);
510 buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));
512 temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
513 temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);
515 vec_st(temp1, (i + 0) << 3, (float*)buf);
516 vec_st(temp2, (i + 2) << 3, (float*)buf);
517 #endif
520 /* 3. iteration */
521 for(i = 0; i < 128; i += 8) {
522 #if 0
523 tmp_a_r = buf[i].real;
524 tmp_a_i = buf[i].imag;
525 tmp_b_r = buf[i+4].real;
526 tmp_b_i = buf[i+4].imag;
527 buf[i].real = tmp_a_r + tmp_b_r;
528 buf[i].imag = tmp_a_i + tmp_b_i;
529 buf[i+4].real = tmp_a_r - tmp_b_r;
530 buf[i+4].imag = tmp_a_i - tmp_b_i;
531 tmp_a_r = buf[1+i].real;
532 tmp_a_i = buf[1+i].imag;
533 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
534 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
535 buf[1+i].real = tmp_a_r + tmp_b_r;
536 buf[1+i].imag = tmp_a_i + tmp_b_i;
537 buf[i+5].real = tmp_a_r - tmp_b_r;
538 buf[i+5].imag = tmp_a_i - tmp_b_i;
539 tmp_a_r = buf[i+2].real;
540 tmp_a_i = buf[i+2].imag;
541 /* WARNING re <-> im & sign */
542 tmp_b_r = buf[i+6].imag;
543 tmp_b_i = - buf[i+6].real;
544 buf[i+2].real = tmp_a_r + tmp_b_r;
545 buf[i+2].imag = tmp_a_i + tmp_b_i;
546 buf[i+6].real = tmp_a_r - tmp_b_r;
547 buf[i+6].imag = tmp_a_i - tmp_b_i;
548 tmp_a_r = buf[i+3].real;
549 tmp_a_i = buf[i+3].imag;
550 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
551 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
552 buf[i+3].real = tmp_a_r + tmp_b_r;
553 buf[i+3].imag = tmp_a_i + tmp_b_i;
554 buf[i+7].real = tmp_a_r - tmp_b_r;
555 buf[i+7].imag = tmp_a_i - tmp_b_i;
556 #else
/* Hybrid stage: buf[i+5]/buf[i+7] get their twiddle rotation in scalar code
 * first (written back to memory), then buf23 is RELOADED so the vector
 * butterflies see the updated values — the second vec_ld of buf23 below is
 * deliberate, not redundant. */
557 vector float buf01, buf23, buf45, buf67;
559 buf01 = vec_ld((i + 0) << 3, (float*)buf);
560 buf23 = vec_ld((i + 2) << 3, (float*)buf);
562 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
563 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
564 buf[i+5].real = tmp_b_r;
565 buf[i+5].imag = tmp_b_i;
566 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
567 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
568 buf[i+7].real = tmp_b_r;
569 buf[i+7].imag = tmp_b_i;
571 buf23 = vec_ld((i + 2) << 3, (float*)buf);
572 buf45 = vec_ld((i + 4) << 3, (float*)buf);
573 buf67 = vec_ld((i + 6) << 3, (float*)buf);
574 buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));
576 vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
577 vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
578 vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
579 vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
580 #endif
583 /* 4-7. iterations */
584 for (m=3; m < 7; m++) {
585 two_m = (1 << m);
587 two_m_plus_one = two_m<<1;
589 for(i = 0; i < 128; i += two_m_plus_one) {
590 for(k = 0; k < two_m; k+=2) {
591 #if 0
592 int p = k + i;
593 int q = p + two_m;
594 tmp_a_r = buf[p].real;
595 tmp_a_i = buf[p].imag;
596 tmp_b_r =
597 buf[q].real * w[m][k].real -
598 buf[q].imag * w[m][k].imag;
599 tmp_b_i =
600 buf[q].imag * w[m][k].real +
601 buf[q].real * w[m][k].imag;
602 buf[p].real = tmp_a_r + tmp_b_r;
603 buf[p].imag = tmp_a_i + tmp_b_i;
604 buf[q].real = tmp_a_r - tmp_b_r;
605 buf[q].imag = tmp_a_i - tmp_b_i;
607 tmp_a_r = buf[(p + 1)].real;
608 tmp_a_i = buf[(p + 1)].imag;
609 tmp_b_r =
610 buf[(q + 1)].real * w[m][(k + 1)].real -
611 buf[(q + 1)].imag * w[m][(k + 1)].imag;
612 tmp_b_i =
613 buf[(q + 1)].imag * w[m][(k + 1)].real +
614 buf[(q + 1)].real * w[m][(k + 1)].imag;
615 buf[(p + 1)].real = tmp_a_r + tmp_b_r;
616 buf[(p + 1)].imag = tmp_a_i + tmp_b_i;
617 buf[(q + 1)].real = tmp_a_r - tmp_b_r;
618 buf[(q + 1)].imag = tmp_a_i - tmp_b_i;
619 #else
620 int p = k + i;
621 int q = p + two_m;
622 vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
623 const vector float vczero = (const vector float)FOUROF(0.);
624 // first compute buf[q] and buf[q+1]
625 vecq = vec_ld(q << 3, (float*)buf);
626 vecw = vec_ld(0, (float*)&(w[m][k]));
627 temp1 = vec_madd(vecq, vecw, vczero);
628 temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
629 temp2 = vec_madd(temp2, vecw, vczero);
630 temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
631 temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
632 vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
633 // then butterfly with buf[p] and buf[p+1]
634 vecp = vec_ld(p << 3, (float*)buf);
636 temp1 = vec_add(vecp, vecq);
637 temp2 = vec_sub(vecp, vecq);
639 vec_st(temp1, p << 3, (float*)buf);
640 vec_st(temp2, q << 3, (float*)buf);
641 #endif
646 /* Post IFFT complex multiply plus IFFT complex conjugate*/
647 for( i=0; i < 128; i+=4) {
648 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
649 #if 0
650 tmp_a_r = buf[(i + 0)].real;
651 tmp_a_i = -1.0 * buf[(i + 0)].imag;
652 buf[(i + 0)].real =
653 (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]);
654 buf[(i + 0)].imag =
655 (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]);
657 tmp_a_r = buf[(i + 1)].real;
658 tmp_a_i = -1.0 * buf[(i + 1)].imag;
659 buf[(i + 1)].real =
660 (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]);
661 buf[(i + 1)].imag =
662 (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]);
664 tmp_a_r = buf[(i + 2)].real;
665 tmp_a_i = -1.0 * buf[(i + 2)].imag;
666 buf[(i + 2)].real =
667 (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]);
668 buf[(i + 2)].imag =
669 (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]);
671 tmp_a_r = buf[(i + 3)].real;
672 tmp_a_i = -1.0 * buf[(i + 3)].imag;
673 buf[(i + 3)].real =
674 (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]);
675 buf[(i + 3)].imag =
676 (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]);
677 #else
678 vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
679 vector float temp0022, temp1133, tempCS01;
680 const vector float vczero = (const vector float)FOUROF(0.);
682 bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
683 bufv_2 = vec_ld((i + 2) << 3, (float*)buf);
685 cosv = vec_ld(i << 2, xcos1);
686 sinv = vec_ld(i << 2, xsin1);
688 temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
689 temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
690 tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
691 temp1 = vec_madd(temp0022, tempCS01, vczero);
692 tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
693 temp2 = vec_madd(temp1133, tempCS01, vczero);
694 bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);
696 vec_st(bufv_0, (i + 0) << 3, (float*)buf);
698 /* idem with bufv_2 and high-order cosv/sinv */
700 temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
701 temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
702 tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
703 temp1 = vec_madd(temp0022, tempCS01, vczero);
704 tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
705 temp2 = vec_madd(temp1133, tempCS01, vczero);
706 bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);
708 vec_st(bufv_2, (i + 2) << 3, (float*)buf);
710 #endif
713 data_ptr = data;
714 delay_ptr = delay;
715 window_ptr = imdct_window;
717 /* Window and convert to real valued signal */
718 for(i=0; i< 64; i++) {
719 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
720 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
723 for(i=0; i< 64; i++) {
724 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
725 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
728 /* The trailing edge of the window goes into the delay line */
729 delay_ptr = delay;
731 for(i=0; i< 64; i++) {
732 *delay_ptr++ = -buf[64+i].real * *--window_ptr;
733 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
736 for(i=0; i<64; i++) {
737 *delay_ptr++ = buf[i].imag * *--window_ptr;
738 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
741 #endif
744 // Stuff below this line is borrowed from libac3
745 #include "srfftp.h"
746 #ifdef ARCH_X86
747 #ifndef HAVE_3DNOW
748 #define HAVE_3DNOW 1
749 #endif
750 #include "srfftp_3dnow.h"
/* Packed constants for the 3DNow! kernels. 0x80000000UL is presumably the
 * IEEE-754 single-precision sign-bit mask, giving {+,-} / {-,+} sign-flip
 * pairs — confirm against srfftp_3dnow.h usage. */
752 const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
753 const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
/* 1/sqrt(2) in both lanes. */
754 const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
/* imdct_3dnow.h is included twice on purpose: once with HAVE_3DNOWEX
 * undefined (plain 3DNow! build) and once with it defined (3DNow!Ex build),
 * producing two specialized function sets from the same template header. */
756 #undef HAVE_3DNOWEX
757 #include "imdct_3dnow.h"
758 #define HAVE_3DNOWEX
759 #include "imdct_3dnow.h"
761 void
762 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
764 /* int i,k;
765 int p,q;*/
766 int m;
767 int two_m;
768 int two_m_plus_one;
770 /* sample_t tmp_a_i;
771 sample_t tmp_a_r;
772 sample_t tmp_b_i;
773 sample_t tmp_b_r;*/
775 sample_t *data_ptr;
776 sample_t *delay_ptr;
777 sample_t *window_ptr;
779 /* 512 IMDCT with source and dest data in 'data' */
780 /* see the c version (dct_do_512()), its allmost identical, just in C */
782 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
783 /* Bit reversed shuffling */
784 asm volatile(
785 "xorl %%esi, %%esi \n\t"
786 "leal "MANGLE(bit_reverse_512)", %%eax \n\t"
787 "movl $1008, %%edi \n\t"
788 "pushl %%ebp \n\t" //use ebp without telling gcc
789 ".balign 16 \n\t"
790 "1: \n\t"
791 "movlps (%0, %%esi), %%xmm0 \n\t" // XXXI
792 "movhps 8(%0, %%edi), %%xmm0 \n\t" // RXXI
793 "movlps 8(%0, %%esi), %%xmm1 \n\t" // XXXi
794 "movhps (%0, %%edi), %%xmm1 \n\t" // rXXi
795 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR
796 "movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t"
797 "mulps %%xmm0, %%xmm2 \n\t"
798 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI
799 "mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
800 "subps %%xmm0, %%xmm2 \n\t"
801 "movzbl (%%eax), %%edx \n\t"
802 "movzbl 1(%%eax), %%ebp \n\t"
803 "movlps %%xmm2, (%1, %%edx,8) \n\t"
804 "movhps %%xmm2, (%1, %%ebp,8) \n\t"
805 "addl $16, %%esi \n\t"
806 "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap
807 "subl $16, %%edi \n\t"
808 " jnc 1b \n\t"
809 "popl %%ebp \n\t"//no we didnt touch ebp *g*
810 :: "b" (data), "c" (buf)
811 : "%esi", "%edi", "%eax", "%edx"
815 /* FFT Merge */
816 /* unoptimized variant
817 for (m=1; m < 7; m++) {
818 if(m)
819 two_m = (1 << m);
820 else
821 two_m = 1;
823 two_m_plus_one = (1 << (m+1));
825 for(i = 0; i < 128; i += two_m_plus_one) {
826 for(k = 0; k < two_m; k++) {
827 p = k + i;
828 q = p + two_m;
829 tmp_a_r = buf[p].real;
830 tmp_a_i = buf[p].imag;
831 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
832 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
833 buf[p].real = tmp_a_r + tmp_b_r;
834 buf[p].imag = tmp_a_i + tmp_b_i;
835 buf[q].real = tmp_a_r - tmp_b_r;
836 buf[q].imag = tmp_a_i - tmp_b_i;
842 /* 1. iteration */
843 // Note w[0][0]={1,0}
844 asm volatile(
845 "xorps %%xmm1, %%xmm1 \n\t"
846 "xorps %%xmm2, %%xmm2 \n\t"
847 "movl %0, %%esi \n\t"
848 ".balign 16 \n\t"
849 "1: \n\t"
850 "movlps (%%esi), %%xmm0 \n\t" //buf[p]
851 "movlps 8(%%esi), %%xmm1\n\t" //buf[q]
852 "movhps (%%esi), %%xmm0 \n\t" //buf[p]
853 "movhps 8(%%esi), %%xmm2\n\t" //buf[q]
854 "addps %%xmm1, %%xmm0 \n\t"
855 "subps %%xmm2, %%xmm0 \n\t"
856 "movaps %%xmm0, (%%esi) \n\t"
857 "addl $16, %%esi \n\t"
858 "cmpl %1, %%esi \n\t"
859 " jb 1b \n\t"
860 :: "g" (buf), "r" (buf + 128)
861 : "%esi"
864 /* 2. iteration */
865 // Note w[1]={{1,0}, {0,-1}}
866 asm volatile(
867 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
868 "movl %0, %%esi \n\t"
869 ".balign 16 \n\t"
870 "1: \n\t"
871 "movaps 16(%%esi), %%xmm2 \n\t" //r2,i2,r3,i3
872 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3
873 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3
874 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1
875 "movaps (%%esi), %%xmm1 \n\t" //r0,i0,r1,i1
876 "addps %%xmm2, %%xmm0 \n\t"
877 "subps %%xmm2, %%xmm1 \n\t"
878 "movaps %%xmm0, (%%esi) \n\t"
879 "movaps %%xmm1, 16(%%esi) \n\t"
880 "addl $32, %%esi \n\t"
881 "cmpl %1, %%esi \n\t"
882 " jb 1b \n\t"
883 :: "g" (buf), "r" (buf + 128)
884 : "%esi"
887 /* 3. iteration */
889 Note sseW2+0={1,1,sqrt(2),sqrt(2))
890 Note sseW2+16={0,0,sqrt(2),-sqrt(2))
891 Note sseW2+32={0,0,-sqrt(2),-sqrt(2))
892 Note sseW2+48={1,-1,sqrt(2),-sqrt(2))
894 asm volatile(
895 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
896 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
897 "xorps %%xmm5, %%xmm5 \n\t"
898 "xorps %%xmm2, %%xmm2 \n\t"
899 "movl %0, %%esi \n\t"
900 ".balign 16 \n\t"
901 "1: \n\t"
902 "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5
903 "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7
904 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
905 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
906 "mulps %%xmm2, %%xmm4 \n\t"
907 "mulps %%xmm3, %%xmm5 \n\t"
908 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5
909 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7
910 "mulps %%xmm6, %%xmm3 \n\t"
911 "mulps %%xmm7, %%xmm2 \n\t"
912 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1
913 "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3
914 "addps %%xmm4, %%xmm2 \n\t"
915 "addps %%xmm5, %%xmm3 \n\t"
916 "movaps %%xmm2, %%xmm4 \n\t"
917 "movaps %%xmm3, %%xmm5 \n\t"
918 "addps %%xmm0, %%xmm2 \n\t"
919 "addps %%xmm1, %%xmm3 \n\t"
920 "subps %%xmm4, %%xmm0 \n\t"
921 "subps %%xmm5, %%xmm1 \n\t"
922 "movaps %%xmm2, (%%esi) \n\t"
923 "movaps %%xmm3, 16(%%esi) \n\t"
924 "movaps %%xmm0, 32(%%esi) \n\t"
925 "movaps %%xmm1, 48(%%esi) \n\t"
926 "addl $64, %%esi \n\t"
927 "cmpl %1, %%esi \n\t"
928 " jb 1b \n\t"
929 :: "g" (buf), "r" (buf + 128)
930 : "%esi"
933 /* 4-7. iterations */
934 for (m=3; m < 7; m++) {
935 two_m = (1 << m);
936 two_m_plus_one = two_m<<1;
937 asm volatile(
938 "movl %0, %%esi \n\t"
939 ".balign 16 \n\t"
940 "1: \n\t"
941 "xorl %%edi, %%edi \n\t" // k
942 "leal (%%esi, %3), %%edx \n\t"
943 "2: \n\t"
944 "movaps (%%edx, %%edi), %%xmm1 \n\t"
945 "movaps (%4, %%edi, 2), %%xmm2 \n\t"
946 "mulps %%xmm1, %%xmm2 \n\t"
947 "shufps $0xB1, %%xmm1, %%xmm1 \n\t"
948 "mulps 16(%4, %%edi, 2), %%xmm1 \n\t"
949 "movaps (%%esi, %%edi), %%xmm0 \n\t"
950 "addps %%xmm2, %%xmm1 \n\t"
951 "movaps %%xmm1, %%xmm2 \n\t"
952 "addps %%xmm0, %%xmm1 \n\t"
953 "subps %%xmm2, %%xmm0 \n\t"
954 "movaps %%xmm1, (%%esi, %%edi) \n\t"
955 "movaps %%xmm0, (%%edx, %%edi) \n\t"
956 "addl $16, %%edi \n\t"
957 "cmpl %3, %%edi \n\t" //FIXME (opt) count against 0
958 " jb 2b \n\t"
959 "addl %2, %%esi \n\t"
960 "cmpl %1, %%esi \n\t"
961 " jb 1b \n\t"
962 :: "g" (buf), "m" (buf+128), "m" (two_m_plus_one<<3), "r" (two_m<<3),
963 "r" (sseW[m])
964 : "%esi", "%edi", "%edx"
968 /* Post IFFT complex multiply plus IFFT complex conjugate*/
969 asm volatile(
970 "movl $-1024, %%esi \n\t"
971 ".balign 16 \n\t"
972 "1: \n\t"
973 "movaps (%0, %%esi), %%xmm0 \n\t"
974 "movaps (%0, %%esi), %%xmm1 \n\t"
975 "shufps $0xB1, %%xmm0, %%xmm0 \n\t"
976 "mulps 1024+"MANGLE(sseSinCos1c)"(%%esi), %%xmm1\n\t"
977 "mulps 1024+"MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
978 "addps %%xmm1, %%xmm0 \n\t"
979 "movaps %%xmm0, (%0, %%esi) \n\t"
980 "addl $16, %%esi \n\t"
981 " jnz 1b \n\t"
982 :: "r" (buf+128)
983 : "%esi"
987 data_ptr = data;
988 delay_ptr = delay;
989 window_ptr = imdct_window;
991 /* Window and convert to real valued signal */
992 asm volatile(
993 "xorl %%edi, %%edi \n\t" // 0
994 "xorl %%esi, %%esi \n\t" // 0
995 "movss %3, %%xmm2 \n\t" // bias
996 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
997 ".balign 16 \n\t"
998 "1: \n\t"
999 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ?
1000 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ?
1001 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ?
1002 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ?
1003 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1004 "mulps "MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1005 "addps (%2, %%esi), %%xmm0 \n\t"
1006 "addps %%xmm2, %%xmm0 \n\t"
1007 "movaps %%xmm0, (%1, %%esi) \n\t"
1008 "addl $16, %%esi \n\t"
1009 "subl $16, %%edi \n\t"
1010 "cmpl $512, %%esi \n\t"
1011 " jb 1b \n\t"
1012 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1013 : "%esi", "%edi"
1015 data_ptr+=128;
1016 delay_ptr+=128;
1017 // window_ptr+=128;
1019 asm volatile(
1020 "movl $1024, %%edi \n\t" // 512
1021 "xorl %%esi, %%esi \n\t" // 0
1022 "movss %3, %%xmm2 \n\t" // bias
1023 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
1024 ".balign 16 \n\t"
1025 "1: \n\t"
1026 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A
1027 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C
1028 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C
1029 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A
1030 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1031 "mulps 512+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1032 "addps (%2, %%esi), %%xmm0 \n\t"
1033 "addps %%xmm2, %%xmm0 \n\t"
1034 "movaps %%xmm0, (%1, %%esi) \n\t"
1035 "addl $16, %%esi \n\t"
1036 "subl $16, %%edi \n\t"
1037 "cmpl $512, %%esi \n\t"
1038 " jb 1b \n\t"
1039 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1040 : "%esi", "%edi"
1042 data_ptr+=128;
1043 // window_ptr+=128;
1045 /* The trailing edge of the window goes into the delay line */
1046 delay_ptr = delay;
1048 asm volatile(
1049 "xorl %%edi, %%edi \n\t" // 0
1050 "xorl %%esi, %%esi \n\t" // 0
1051 ".balign 16 \n\t"
1052 "1: \n\t"
1053 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A
1054 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C
1055 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C
1056 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A
1057 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1058 "mulps 1024+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1059 "movaps %%xmm0, (%1, %%esi) \n\t"
1060 "addl $16, %%esi \n\t"
1061 "subl $16, %%edi \n\t"
1062 "cmpl $512, %%esi \n\t"
1063 " jb 1b \n\t"
1064 :: "r" (buf+64), "r" (delay_ptr)
1065 : "%esi", "%edi"
1067 delay_ptr+=128;
1068 // window_ptr-=128;
1070 asm volatile(
1071 "movl $1024, %%edi \n\t" // 1024
1072 "xorl %%esi, %%esi \n\t" // 0
1073 ".balign 16 \n\t"
1074 "1: \n\t"
1075 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ?
1076 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ?
1077 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ?
1078 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ?
1079 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1080 "mulps 1536+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1081 "movaps %%xmm0, (%1, %%esi) \n\t"
1082 "addl $16, %%esi \n\t"
1083 "subl $16, %%edi \n\t"
1084 "cmpl $512, %%esi \n\t"
1085 " jb 1b \n\t"
1086 :: "r" (buf), "r" (delay_ptr)
1087 : "%esi", "%edi"
1090 #endif //arch_x86
1092 void
1093 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias)
1095 int i,k;
1096 int p,q;
1097 int m;
1098 int two_m;
1099 int two_m_plus_one;
1101 sample_t tmp_a_i;
1102 sample_t tmp_a_r;
1103 sample_t tmp_b_i;
1104 sample_t tmp_b_r;
1106 sample_t *data_ptr;
1107 sample_t *delay_ptr;
1108 sample_t *window_ptr;
1110 complex_t *buf_1, *buf_2;
1112 buf_1 = &buf[0];
1113 buf_2 = &buf[64];
1115 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
1116 for(k=0; k<64; k++) {
1117 /* X1[k] = X[2*k] */
1118 /* X2[k] = X[2*k+1] */
1120 p = 2 * (128-2*k-1);
1121 q = 2 * (2 * k);
1123 /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
1124 buf_1[k].real = data[p] * xcos2[k] - data[q] * xsin2[k];
1125 buf_1[k].imag = -1.0f * (data[q] * xcos2[k] + data[p] * xsin2[k]);
1126 /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
1127 buf_2[k].real = data[p + 1] * xcos2[k] - data[q + 1] * xsin2[k];
1128 buf_2[k].imag = -1.0f * ( data[q + 1] * xcos2[k] + data[p + 1] * xsin2[k]);
1131 /* IFFT Bit reversed shuffling */
1132 for(i=0; i<64; i++) {
1133 k = bit_reverse_256[i];
1134 if (k < i) {
1135 swap_cmplx(&buf_1[i],&buf_1[k]);
1136 swap_cmplx(&buf_2[i],&buf_2[k]);
1140 /* FFT Merge */
1141 for (m=0; m < 6; m++) {
1142 two_m = (1 << m);
1143 two_m_plus_one = (1 << (m+1));
1145 /* FIXME */
1146 if(m)
1147 two_m = (1 << m);
1148 else
1149 two_m = 1;
1151 for(k = 0; k < two_m; k++) {
1152 for(i = 0; i < 64; i += two_m_plus_one) {
1153 p = k + i;
1154 q = p + two_m;
1155 /* Do block 1 */
1156 tmp_a_r = buf_1[p].real;
1157 tmp_a_i = buf_1[p].imag;
1158 tmp_b_r = buf_1[q].real * w[m][k].real - buf_1[q].imag * w[m][k].imag;
1159 tmp_b_i = buf_1[q].imag * w[m][k].real + buf_1[q].real * w[m][k].imag;
1160 buf_1[p].real = tmp_a_r + tmp_b_r;
1161 buf_1[p].imag = tmp_a_i + tmp_b_i;
1162 buf_1[q].real = tmp_a_r - tmp_b_r;
1163 buf_1[q].imag = tmp_a_i - tmp_b_i;
1165 /* Do block 2 */
1166 tmp_a_r = buf_2[p].real;
1167 tmp_a_i = buf_2[p].imag;
1168 tmp_b_r = buf_2[q].real * w[m][k].real - buf_2[q].imag * w[m][k].imag;
1169 tmp_b_i = buf_2[q].imag * w[m][k].real + buf_2[q].real * w[m][k].imag;
1170 buf_2[p].real = tmp_a_r + tmp_b_r;
1171 buf_2[p].imag = tmp_a_i + tmp_b_i;
1172 buf_2[q].real = tmp_a_r - tmp_b_r;
1173 buf_2[q].imag = tmp_a_i - tmp_b_i;
1178 /* Post IFFT complex multiply */
1179 for( i=0; i < 64; i++) {
1180 /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
1181 tmp_a_r = buf_1[i].real;
1182 tmp_a_i = -buf_1[i].imag;
1183 buf_1[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
1184 buf_1[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
1185 /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */
1186 tmp_a_r = buf_2[i].real;
1187 tmp_a_i = -buf_2[i].imag;
1188 buf_2[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
1189 buf_2[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
1192 data_ptr = data;
1193 delay_ptr = delay;
1194 window_ptr = imdct_window;
1196 /* Window and convert to real valued signal */
1197 for(i=0; i< 64; i++) {
1198 *data_ptr++ = -buf_1[i].imag * *window_ptr++ + *delay_ptr++ + bias;
1199 *data_ptr++ = buf_1[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
1202 for(i=0; i< 64; i++) {
1203 *data_ptr++ = -buf_1[i].real * *window_ptr++ + *delay_ptr++ + bias;
1204 *data_ptr++ = buf_1[64-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
1207 delay_ptr = delay;
1209 for(i=0; i< 64; i++) {
1210 *delay_ptr++ = -buf_2[i].real * *--window_ptr;
1211 *delay_ptr++ = buf_2[64-i-1].imag * *--window_ptr;
1214 for(i=0; i< 64; i++) {
1215 *delay_ptr++ = buf_2[i].imag * *--window_ptr;
1216 *delay_ptr++ = -buf_2[64-i-1].real * *--window_ptr;
/*
 * Build all lookup tables used by the IMDCT and select the fastest
 * imdct_512 / imdct_256 implementation available on this machine.
 * mm_accel is a bitmask of MM_ACCEL_* capability flags.
 */
1220 void imdct_init (uint32_t mm_accel)
/* If mlib support was compiled in and the flag is set, it wins outright. */
1222 #ifdef LIBA52_MLIB
1223 if (mm_accel & MM_ACCEL_MLIB) {
1224 fprintf (stderr, "Using mlib for IMDCT transform\n");
1225 imdct_512 = imdct_do_512_mlib;
1226 imdct_256 = imdct_do_256_mlib;
1227 } else
1228 #endif
1230 int i, j, k;
1232 /* Twiddle factors to turn IFFT into IMDCT */
1233 for (i = 0; i < 128; i++) {
1234 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
1235 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
/* SSE path keeps the 512-point twiddles pre-interleaved: two copies per
 * entry, with the sign pattern the SSE post-multiply kernel expects. */
1237 #ifdef ARCH_X86
1238 for (i = 0; i < 128; i++) {
1239 sseSinCos1c[2*i+0]= xcos1[i];
1240 sseSinCos1c[2*i+1]= -xcos1[i];
1241 sseSinCos1d[2*i+0]= xsin1[i];
1242 sseSinCos1d[2*i+1]= xsin1[i];
1244 #endif
1246 /* More twiddle factors to turn IFFT into IMDCT */
1247 for (i = 0; i < 64; i++) {
1248 xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1));
1249 xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1));
/* w[i][k] = e^{-j*pi*k / 2^i}: unit roots for the scalar FFT-merge passes. */
1252 for (i = 0; i < 7; i++) {
1253 j = 1 << i;
1254 for (k = 0; k < j; k++) {
1255 w[i][k].real = cos (-M_PI * k / j);
1256 w[i][k].imag = sin (-M_PI * k / j);
/* sseW: w[] repacked for the SSE butterfly — two complex roots per
 * 16-byte group, real parts duplicated, imaginary parts carrying the
 * +/- sign pattern the mulps/shufps sequence multiplies against. */
1259 #ifdef ARCH_X86
1260 for (i = 1; i < 7; i++) {
1261 j = 1 << i;
1262 for (k = 0; k < j; k+=2) {
1264 sseW[i][4*k + 0] = w[i][k+0].real;
1265 sseW[i][4*k + 1] = w[i][k+0].real;
1266 sseW[i][4*k + 2] = w[i][k+1].real;
1267 sseW[i][4*k + 3] = w[i][k+1].real;
1269 sseW[i][4*k + 4] = -w[i][k+0].imag;
1270 sseW[i][4*k + 5] = w[i][k+0].imag;
1271 sseW[i][4*k + 6] = -w[i][k+1].imag;
1272 sseW[i][4*k + 7] = w[i][k+1].imag;
1274 //we multiply more or less uninitalized numbers so we need to use exactly 0.0
1275 if(k==0)
1277 // sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
1278 sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
/* Force exact zeros at the half-way root as well (see note above). */
1281 if(2*k == j)
1283 sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
1284 // sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
/* sseWindow: the window coefficients with the sign flips and index
 * reversals folded in, so the SSE windowing loops need only mulps. */
1289 for(i=0; i<128; i++)
1291 sseWindow[2*i+0]= -imdct_window[2*i+0];
1292 sseWindow[2*i+1]= imdct_window[2*i+1];
1295 for(i=0; i<64; i++)
1297 sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1];
1298 sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0];
1299 sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1];
1300 sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
1302 #endif // arch_x86
/* Runtime dispatch: start with the portable C version, then upgrade to
 * the best accelerated implementation the capability mask allows. */
1304 imdct_512 = imdct_do_512;
1305 #ifdef ARCH_X86
1306 if(mm_accel & MM_ACCEL_X86_SSE)
1308 fprintf (stderr, "Using SSE optimized IMDCT transform\n");
1309 imdct_512 = imdct_do_512_sse;
1311 else
1312 if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
1314 fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
1315 imdct_512 = imdct_do_512_3dnowex;
1317 else
1318 if(mm_accel & MM_ACCEL_X86_3DNOW)
1320 fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
1321 imdct_512 = imdct_do_512_3dnow;
1323 else
1324 #endif // arch_x86
1325 #ifdef HAVE_ALTIVEC
1326 if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
1328 fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
1329 imdct_512 = imdct_do_512_altivec;
1331 else
1332 #endif
1333 fprintf (stderr, "No accelerated IMDCT transform found\n");
/* No accelerated 256-point variant here: always the portable C version. */
1334 imdct_256 = imdct_do_256;
/*
 * Recombination step of the split-radix FFT: merges the sub-transform
 * at x[] with the two smaller results at wTB[]/wB[], applying the
 * twiddle tables d[] and d_3[] (presumably w^k and w^{3k} — defined
 * with the TRANS* macros elsewhere in this file; verify there).
 * The register temporaries below are consumed inside those macros.
 * NOTE(review): the loop structure assumes k >= 2 and even — two
 * butterflies are emitted per iteration; confirm against the callers
 * in fft_128p (k = 4, 8, 16).
 */
1338 static void fft_asmb(int k, complex_t *x, complex_t *wTB,
1339 const complex_t *d, const complex_t *d_3)
1341 register complex_t *x2k, *x3k, *x4k, *wB;
1342 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
/* The four quarter sections of the output, spaced 2*k complex values apart. */
1344 x2k = x + 2 * k;
1345 x3k = x2k + 2 * k;
1346 x4k = x3k + 2 * k;
1347 wB = wTB + 2 * k;
/* Index 0 needs no twiddle multiply; index 1 uses d[1]/d_3[1]. */
1349 TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
1350 TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);
/* Remaining indices: two butterflies per trip, pointers advanced by 2. */
1352 --k;
1353 for(;;) {
1354 TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
1355 TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
1356 if (!--k) break;
1357 x += 2;
1358 x2k += 2;
1359 x3k += 2;
1360 x4k += 2;
1361 d += 2;
1362 d_3 += 2;
1363 wTB += 2;
1364 wB += 2;
/*
 * Fixed-size recombination for a 16-point transform: x[0..15] is the
 * output span, wTB[] the smaller sub-results; delta16/delta16_3 are
 * the hard-coded twiddle tables for this size.  The butterfly math
 * lives in the TRANS* macros (defined elsewhere in this file); the
 * register temporaries below are presumably used inside those macros.
 * NOTE(review): the local k is never referenced in this body — if the
 * macros do not expand to a use of it, it is dead; confirm before
 * removing.
 */
1369 static void fft_asmb16(complex_t *x, complex_t *wTB)
1371 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
1372 int k = 2;
1374 /* transform x[0], x[8], x[4], x[12] */
1375 TRANSZERO(x[0],x[4],x[8],x[12]);
1377 /* transform x[1], x[9], x[5], x[13] */
1378 TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]);
1380 /* transform x[2], x[10], x[6], x[14] */
1381 TRANSHALF_16(x[2],x[6],x[10],x[14]);
1383 /* transform x[3], x[11], x[7], x[15] */
1384 TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]);
1388 static void fft_4(complex_t *x)
1390 /* delta_p = 1 here */
1391 /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
1394 register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
1396 yt_r = x[0].real;
1397 yb_r = yt_r - x[2].real;
1398 yt_r += x[2].real;
1400 u_r = x[1].real;
1401 vi_i = x[3].real - u_r;
1402 u_r += x[3].real;
1404 u_i = x[1].imag;
1405 vi_r = u_i - x[3].imag;
1406 u_i += x[3].imag;
1408 yt_i = yt_r;
1409 yt_i += u_r;
1410 x[0].real = yt_i;
1411 yt_r -= u_r;
1412 x[2].real = yt_r;
1413 yt_i = yb_r;
1414 yt_i += vi_r;
1415 x[1].real = yt_i;
1416 yb_r -= vi_r;
1417 x[3].real = yb_r;
1419 yt_i = x[0].imag;
1420 yb_i = yt_i - x[2].imag;
1421 yt_i += x[2].imag;
1423 yt_r = yt_i;
1424 yt_r += u_i;
1425 x[0].imag = yt_r;
1426 yt_i -= u_i;
1427 x[2].imag = yt_i;
1428 yt_r = yb_i;
1429 yt_r += vi_i;
1430 x[1].imag = yt_r;
1431 yb_i -= vi_i;
1432 x[3].imag = yb_i;
/*
 * In-place 8-point FFT kernel.  Saves x[1]/x[3], compacts the
 * even-indexed inputs into x[0..3] and runs fft_4 on them, then
 * rebuilds the odd-frequency outputs by hand.  HSQRT2 (defined
 * elsewhere; presumably sqrt(2)/2) supplies the odd-twiddle factor.
 * Heavily sequential register juggling — statement order matters and
 * the temporaries are reused for unrelated values between sections.
 */
1436 static void fft_8(complex_t *x)
1438 /* delta_p = diag{1, sqrt(i)} here */
1439 /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
1441 register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;
/* Save original x[1] and x[3] before the even elements overwrite them. */
1443 wT1_r = x[1].real;
1444 wT1_i = x[1].imag;
1445 wB1_r = x[3].real;
1446 wB1_i = x[3].imag;
/* Pack even inputs x[0],x[2],x[4],x[6] into x[0..3] for the 4-point FFT. */
1448 x[1] = x[2];
1449 x[2] = x[4];
1450 x[3] = x[6];
1451 fft_4(&x[0]);
1454 /* x[0] x[4] */
1455 wT2_r = x[5].real;
1456 wT2_r += x[7].real;
1457 wT2_r += wT1_r;
1458 wT2_r += wB1_r;
1459 wT2_i = wT2_r;
1460 wT2_r += x[0].real;
1461 wT2_i = x[0].real - wT2_i;
1462 x[0].real = wT2_r;
1463 x[4].real = wT2_i;
1465 wT2_i = x[5].imag;
1466 wT2_i += x[7].imag;
1467 wT2_i += wT1_i;
1468 wT2_i += wB1_i;
1469 wT2_r = wT2_i;
1470 wT2_r += x[0].imag;
1471 wT2_i = x[0].imag - wT2_i;
1472 x[0].imag = wT2_r;
1473 x[4].imag = wT2_i;
1475 /* x[2] x[6] */
1476 wT2_r = x[5].imag;
1477 wT2_r -= x[7].imag;
1478 wT2_r += wT1_i;
1479 wT2_r -= wB1_i;
1480 wT2_i = wT2_r;
1481 wT2_r += x[2].real;
1482 wT2_i = x[2].real - wT2_i;
1483 x[2].real = wT2_r;
1484 x[6].real = wT2_i;
1486 wT2_i = x[5].real;
1487 wT2_i -= x[7].real;
1488 wT2_i += wT1_r;
1489 wT2_i -= wB1_r;
1490 wT2_r = wT2_i;
1491 wT2_r += x[2].imag;
1492 wT2_i = x[2].imag - wT2_i;
1493 x[2].imag = wT2_i;
1494 x[6].imag = wT2_r;
1497 /* x[1] x[5] */
1498 wT2_r = wT1_r;
1499 wT2_r += wB1_i;
1500 wT2_r -= x[5].real;
1501 wT2_r -= x[7].imag;
1502 wT2_i = wT1_i;
1503 wT2_i -= wB1_r;
1504 wT2_i -= x[5].imag;
1505 wT2_i += x[7].real;
/* Rotate by 45 degrees: scale the sum/difference pair by HSQRT2. */
1507 wB2_r = wT2_r;
1508 wB2_r += wT2_i;
1509 wT2_i -= wT2_r;
1510 wB2_r *= HSQRT2;
1511 wT2_i *= HSQRT2;
1512 wT2_r = wB2_r;
1513 wB2_r += x[1].real;
1514 wT2_r = x[1].real - wT2_r;
/* Stash x[5].real (and below x[5].imag) before overwriting x[5]; the
 * saved values feed the x[3]/x[7] section as wB2_i / wB2_r. */
1516 wB2_i = x[5].real;
1517 x[1].real = wB2_r;
1518 x[5].real = wT2_r;
1520 wT2_r = wT2_i;
1521 wT2_r += x[1].imag;
1522 wT2_i = x[1].imag - wT2_i;
1523 wB2_r = x[5].imag;
1524 x[1].imag = wT2_r;
1525 x[5].imag = wT2_i;
1527 /* x[3] x[7] */
1528 wT1_r -= wB1_i;
1529 wT1_i += wB1_r;
1530 wB1_r = wB2_i - x[7].imag;
1531 wB1_i = wB2_r + x[7].real;
1532 wT1_r -= wB1_r;
1533 wT1_i -= wB1_i;
/* Same 45-degree rotation for the remaining odd pair. */
1534 wB1_r = wT1_r + wT1_i;
1535 wB1_r *= HSQRT2;
1536 wT1_i -= wT1_r;
1537 wT1_i *= HSQRT2;
1538 wB2_r = x[3].real;
1539 wB2_i = wB2_r + wT1_i;
1540 wB2_r -= wT1_i;
1541 x[3].real = wB2_i;
1542 x[7].real = wB2_r;
1543 wB2_i = x[3].imag;
1544 wB2_r = wB2_i + wB1_r;
1545 wB2_i -= wB1_r;
1546 x[3].imag = wB2_i;
1547 x[7].imag = wB2_r;
1551 static void fft_128p(complex_t *a)
1553 fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]);
1554 fft_asmb16(&a[0], &a[8]);
1556 fft_8(&a[16]), fft_8(&a[24]);
1557 fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
1559 fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]);
1560 fft_asmb16(&a[32], &a[40]);
1562 fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]);
1563 fft_asmb16(&a[48], &a[56]);
1565 fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
1567 fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]);
1568 /* fft_16(&a[64]); */
1569 fft_asmb16(&a[64], &a[72]);
1571 fft_8(&a[80]); fft_8(&a[88]);
1573 /* fft_32(&a[64]); */
1574 fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]);
1576 fft_8(&a[96]); fft_4(&a[104]), fft_4(&a[108]);
1577 /* fft_16(&a[96]); */
1578 fft_asmb16(&a[96], &a[104]);
1580 fft_8(&a[112]), fft_8(&a[120]);
1581 /* fft_32(&a[96]); */
1582 fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]);
1584 /* fft_128(&a[0]); */
1585 fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);