obvious gcc warning fix, approved by Nico
[mplayer/glamo.git] / liba52 / imdct.c
blob67deab8904ea8393d320b22015593da53a7e078d
1 /*
2 * imdct.c
3 * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
6 * This file is part of a52dec, a free ATSC A-52 stream decoder.
7 * See http://liba52.sourceforge.net/ for updates.
9 * Modified for use with MPlayer, changes contained in liba52_changes.diff.
10 * detailed CVS changelog at http://www.mplayerhq.hu/cgi-bin/cvsweb.cgi/main/
11 * $Id$
13 * a52dec is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
18 * a52dec is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
28 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
29 * michael did port them from libac3 (untested, perhaps totally broken)
30 * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
33 #include "config.h"
35 #include <math.h>
36 #include <stdio.h>
37 #ifndef M_PI
38 #define M_PI 3.1415926535897932384626433832795029
39 #endif
40 #include <inttypes.h>
42 #include "a52.h"
43 #include "a52_internal.h"
44 #include "mm_accel.h"
45 #include "mangle.h"
47 #ifdef RUNTIME_CPUDETECT
48 #undef HAVE_3DNOWEX
49 #endif
51 #define USE_AC3_C
/* Dispatch pointers for the 256- and 512-sample IMDCT; presumably assigned
   to the C/SSE/3DNow!/AltiVec implementation at init time (init code is not
   in this chunk — confirm against imdct_init). */
void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias);
void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias);

/* Minimal complex type used for the FFT work buffer and twiddle tables. */
typedef struct complex_s {
    sample_t real;
    sample_t imag;
} complex_t;

/* 128-point complex FFT, defined later in this file (USE_AC3_C path). */
static void fft_128p(complex_t *a);

/* Input permutation for fft_128p(): source index of each of the 128 complex
   inputs, used in place of plain bit-reversal on the USE_AC3_C path. */
static const int pm128[128] attribute_used __attribute__((aligned(16))) =
{
	0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
	4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
	2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
	6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
	1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
	5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
	3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
	7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
};
/* 128 point bit-reverse LUT */
/* bit_reverse_512[i] is i with its 7 low bits reversed; used to reorder the
   FFT input for the 512-sample transform on the non-USE_AC3_C path. */
static uint8_t attribute_used bit_reverse_512[] = {
	0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
	0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
	0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
	0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
	0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
	0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
	0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
	0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
	0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
	0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
	0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
	0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
	0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
	0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
	0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
	0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};

/* 64-entry bit-reverse LUT (6 low bits reversed), presumably for the
   256-sample transform — the 256-sample routines are outside this chunk. */
static uint8_t bit_reverse_256[] = {
	0x00, 0x20, 0x10, 0x30, 0x08, 0x28, 0x18, 0x38,
	0x04, 0x24, 0x14, 0x34, 0x0c, 0x2c, 0x1c, 0x3c,
	0x02, 0x22, 0x12, 0x32, 0x0a, 0x2a, 0x1a, 0x3a,
	0x06, 0x26, 0x16, 0x36, 0x0e, 0x2e, 0x1e, 0x3e,
	0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39,
	0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d,
	0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b,
	0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
#ifdef ARCH_X86
// NOTE: SSE needs 16byte alignment or it will segfault
/* Shared FFT work buffer plus SSE-only lookup tables.  The sse* tables are
   referenced by name (via MANGLE) from the inline asm in imdct_do_512_sse;
   they are presumably filled at init time — init code is not in this chunk. */
static complex_t __attribute__((aligned(16))) buf[128];
static float __attribute__((aligned(16))) sseSinCos1c[256];
static float __attribute__((aligned(16))) sseSinCos1d[256];
static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
//static float __attribute__((aligned(16))) sseW0[4];
/* Per-pass twiddle tables for the SSE FFT merge; pass m indexes sseW[m]. */
static float __attribute__((aligned(16))) sseW1[8];
static float __attribute__((aligned(16))) sseW2[16];
static float __attribute__((aligned(16))) sseW3[32];
static float __attribute__((aligned(16))) sseW4[64];
static float __attribute__((aligned(16))) sseW5[128];
static float __attribute__((aligned(16))) sseW6[256];
static float __attribute__((aligned(16))) *sseW[7]=
	{NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
static float __attribute__((aligned(16))) sseWindow[512];
#else
static complex_t __attribute__((aligned(16))) buf[128];
#endif

/* Twiddle factor LUT */
/* w[m] points at the 2^m twiddle factors used by FFT merge pass m. */
static complex_t __attribute__((aligned(16))) w_1[1];
static complex_t __attribute__((aligned(16))) w_2[2];
static complex_t __attribute__((aligned(16))) w_4[4];
static complex_t __attribute__((aligned(16))) w_8[8];
static complex_t __attribute__((aligned(16))) w_16[16];
static complex_t __attribute__((aligned(16))) w_32[32];
static complex_t __attribute__((aligned(16))) w_64[64];
static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};

/* Twiddle factors for IMDCT */
/* xcos1/xsin1 serve the 512-sample transform, xcos2/xsin2 the 256-sample
   one; filled elsewhere (init code not visible in this chunk). */
static sample_t __attribute__((aligned(16))) xcos1[128];
static sample_t __attribute__((aligned(16))) xsin1[128];
static sample_t __attribute__((aligned(16))) xcos2[64];
static sample_t __attribute__((aligned(16))) xsin2[64];
/* Windowing function for Modified DCT - Thank you acroread */
/* 256-entry window half.  The IMDCT routines below walk it forward for the
   overlap-add output and then backward (via *--window_ptr) for the delay
   line, i.e. the second half of the 512-point window is read by symmetry. */
sample_t imdct_window[] = {
	0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
	0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
	0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
	0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
	0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
	0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
	0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
	0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
	0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
	0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
	0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
	0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
	0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
	0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
	0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
	0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
	0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
	0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
	0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
	0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
	0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
	0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
	0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
	0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
	0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
	0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
	0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
	0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
	0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
	0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
	0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
	1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 };
177 static inline void swap_cmplx(complex_t *a, complex_t *b)
179 complex_t tmp;
181 tmp = *a;
182 *a = *b;
183 *b = tmp;
188 static inline complex_t cmplx_mult(complex_t a, complex_t b)
190 complex_t ret;
192 ret.real = a.real * b.real - a.imag * b.imag;
193 ret.imag = a.real * b.imag + a.imag * b.real;
195 return ret;
/*
 * imdct_do_512: 512-sample IMDCT, C reference implementation.
 *
 * data:  in, 256 frequency coefficients; out, 256 time-domain samples
 *        (windowed, overlap-added with 'delay', offset by 'bias').
 * delay: 256-sample overlap buffer from the previous block; refilled here
 *        with the windowed trailing half of the current block.
 * bias:  constant added to every output sample.
 *
 * Computed as: pre-twiddle + reorder, 128-point complex IFFT,
 * post-twiddle, then window / overlap-add.
 */
void
imdct_do_512(sample_t data[],sample_t delay[], sample_t bias)
{
    int i;
#ifndef USE_AC3_C
    /* Only the explicit FFT-merge path (#else branch below) needs these. */
    int k;
    int p,q;
    int m;
    int two_m;
    int two_m_plus_one;

    sample_t tmp_b_i;
    sample_t tmp_b_r;
#endif
    sample_t tmp_a_i;
    sample_t tmp_a_r;

    sample_t *data_ptr;
    sample_t *delay_ptr;
    sample_t *window_ptr;

    /* 512 IMDCT with source and dest data in 'data' */

    /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
    for( i=0; i < 128; i++) {
	/* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
#ifdef USE_AC3_C
	/* pm128[] is the input ordering expected by fft_128p(). */
	int j= pm128[i];
#else
	/* classic bit-reversed ordering for the in-place FFT merge */
	int j= bit_reverse_512[i];
#endif
	buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
	buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
    }

    /* FFT Merge */
/* unoptimized variant
    for (m=1; m < 7; m++) {
	if(m)
	    two_m = (1 << m);
	else
	    two_m = 1;

	two_m_plus_one = (1 << (m+1));

	for(i = 0; i < 128; i += two_m_plus_one) {
	    for(k = 0; k < two_m; k++) {
		p = k + i;
		q = p + two_m;
		tmp_a_r = buf[p].real;
		tmp_a_i = buf[p].imag;
		tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
		tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
		buf[p].real = tmp_a_r + tmp_b_r;
		buf[p].imag = tmp_a_i + tmp_b_i;
		buf[q].real = tmp_a_r - tmp_b_r;
		buf[q].imag = tmp_a_i - tmp_b_i;
	    }
	}
    }
*/
#ifdef USE_AC3_C
    /* dedicated 128-point FFT, defined later in this file */
    fft_128p (&buf[0]);
#else

    /* 1. iteration */
    /* Pass 0 butterflies: twiddle is {1,0}, so it reduces to add/sub. */
    for(i = 0; i < 128; i += 2) {
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+1].real;
	tmp_b_i = buf[i+1].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+1].real = tmp_a_r - tmp_b_r;
	buf[i+1].imag = tmp_a_i - tmp_b_i;
    }

    /* 2. iteration */
    // Note w[1]={{1,0}, {0,-1}}
    for(i = 0; i < 128; i += 4) {
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+2].real;
	tmp_b_i = buf[i+2].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+2].real = tmp_a_r - tmp_b_r;
	buf[i+2].imag = tmp_a_i - tmp_b_i;
	/* re/im swap below implements the multiply by {0,-1} */
	tmp_a_r = buf[i+1].real;
	tmp_a_i = buf[i+1].imag;
	tmp_b_r = buf[i+3].imag;
	tmp_b_i = buf[i+3].real;
	buf[i+1].real = tmp_a_r + tmp_b_r;
	buf[i+1].imag = tmp_a_i - tmp_b_i;
	buf[i+3].real = tmp_a_r - tmp_b_r;
	buf[i+3].imag = tmp_a_i + tmp_b_i;
    }

    /* 3. iteration */
    /* Twiddles are 1, (1-j)/sqrt(2), -j, -(1+j)/sqrt(2): the sqrt(2)/2
       factors come from w[2][1].real and w[2][3].imag. */
    for(i = 0; i < 128; i += 8) {
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+4].real;
	tmp_b_i = buf[i+4].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+4].real = tmp_a_r - tmp_b_r;
	buf[i+4].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[1+i].real;
	tmp_a_i = buf[1+i].imag;
	tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
	tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
	buf[1+i].real = tmp_a_r + tmp_b_r;
	buf[1+i].imag = tmp_a_i + tmp_b_i;
	buf[i+5].real = tmp_a_r - tmp_b_r;
	buf[i+5].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[i+2].real;
	tmp_a_i = buf[i+2].imag;
	tmp_b_r = buf[i+6].imag;
	tmp_b_i = - buf[i+6].real;
	buf[i+2].real = tmp_a_r + tmp_b_r;
	buf[i+2].imag = tmp_a_i + tmp_b_i;
	buf[i+6].real = tmp_a_r - tmp_b_r;
	buf[i+6].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[i+3].real;
	tmp_a_i = buf[i+3].imag;
	tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
	tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
	buf[i+3].real = tmp_a_r + tmp_b_r;
	buf[i+3].imag = tmp_a_i + tmp_b_i;
	buf[i+7].real = tmp_a_r - tmp_b_r;
	buf[i+7].imag = tmp_a_i - tmp_b_i;
    }

    /* 4-7. iterations */
    /* Generic radix-2 passes using the w[m] twiddle tables. */
    for (m=3; m < 7; m++) {
	two_m = (1 << m);

	two_m_plus_one = two_m<<1;

	for(i = 0; i < 128; i += two_m_plus_one) {
	    for(k = 0; k < two_m; k++) {
		int p = k + i;
		int q = p + two_m;
		tmp_a_r = buf[p].real;
		tmp_a_i = buf[p].imag;
		tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
		tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
		buf[p].real = tmp_a_r + tmp_b_r;
		buf[p].imag = tmp_a_i + tmp_b_i;
		buf[q].real = tmp_a_r - tmp_b_r;
		buf[q].imag = tmp_a_i - tmp_b_i;
	    }
	}
    }
#endif
    /* Post IFFT complex multiply plus IFFT complex conjugate*/
    for( i=0; i < 128; i++) {
	/* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
	tmp_a_r = buf[i].real;
	tmp_a_i = -1.0 * buf[i].imag;
	buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]);
	buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]);
    }

    data_ptr = data;
    delay_ptr = delay;
    window_ptr = imdct_window;

    /* Window and convert to real valued signal */
    for(i=0; i< 64; i++) {
	*data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
	*data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
    }

    for(i=0; i< 64; i++) {
	*data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
	*data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
    }

    /* The trailing edge of the window goes into the delay line */
    /* window_ptr now sits one past imdct_window[255]; the loops below walk
       it backwards, reading the window mirrored. */
    delay_ptr = delay;

    for(i=0; i< 64; i++) {
	*delay_ptr++ = -buf[64+i].real * *--window_ptr;
	*delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
    }

    for(i=0; i<64; i++) {
	*delay_ptr++ = buf[i].imag * *--window_ptr;
	*delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
    }
}
#ifdef HAVE_ALTIVEC

#ifndef SYS_DARWIN
#include <altivec.h>
#endif

// used to build registers permutation vectors (vcprm)
// the 's' are for words in the _s_econd vector
#define WORD_0 0x00,0x01,0x02,0x03
#define WORD_1 0x04,0x05,0x06,0x07
#define WORD_2 0x08,0x09,0x0a,0x0b
#define WORD_3 0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f

/* Darwin's compiler spells vector literals with parentheses, gcc with
   braces — hence the two variants of each constructor macro below. */
#ifdef SYS_DARWIN
#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
#else
#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
#endif

// vcprmle is used to keep the same index as in the SSE version.
// it's the same as vcprm, with the index inversed
// ('le' is Little Endian)
#define vcprmle(a,b,c,d) vcprm(d,c,b,a)

// used to build inverse/identity vectors (vcii)
// n is _n_egative, p is _p_ositive
#define FLOAT_n -1.
#define FLOAT_p 1.

#ifdef SYS_DARWIN
#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
#else
#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
#endif

/* FOUROF(a): broadcast scalar a to all four lanes of a vector literal. */
#ifdef SYS_DARWIN
#define FOUROF(a) (a)
#else
#define FOUROF(a) {a,a,a,a}
#endif
/*
 * imdct_do_512_altivec: AltiVec-accelerated 512-sample IMDCT.
 * Same contract as imdct_do_512().  The "#if 0" blocks keep the scalar
 * reference code for each butterfly that the vector code replaces.
 * Byte offsets: i << 3 is i * sizeof(complex_t) (two floats per element);
 * i << 2 is i * sizeof(float) — NOTE(review): assumes 4-byte sample_t.
 */
void
imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
{
    int i;
    int k;
    int p,q;
    int m;
    int two_m;
    int two_m_plus_one;

    sample_t tmp_b_i;
    sample_t tmp_b_r;
    sample_t tmp_a_i;
    sample_t tmp_a_r;

    sample_t *data_ptr;
    sample_t *delay_ptr;
    sample_t *window_ptr;

    /* 512 IMDCT with source and dest data in 'data' */

    /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
    for( i=0; i < 128; i++) {
	/* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
	int j= bit_reverse_512[i];
	buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
	buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
    }

    /* 1. iteration */
    for(i = 0; i < 128; i += 2) {
#if 0
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+1].real;
	tmp_b_i = buf[i+1].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+1].real = tmp_a_r - tmp_b_r;
	buf[i+1].imag = tmp_a_i - tmp_b_i;
#else
	/* one vector load covers buf[i] and buf[i+1] */
	vector float temp, bufv;

	bufv = vec_ld(i << 3, (float*)buf);
	temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
	bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
	vec_st(bufv, i << 3, (float*)buf);
#endif
    }

    /* 2. iteration */
    // Note w[1]={{1,0}, {0,-1}}
    for(i = 0; i < 128; i += 4) {
#if 0
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+2].real;
	tmp_b_i = buf[i+2].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+2].real = tmp_a_r - tmp_b_r;
	buf[i+2].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[i+1].real;
	tmp_a_i = buf[i+1].imag;
	/* WARNING: im <-> re here ! */
	tmp_b_r = buf[i+3].imag;
	tmp_b_i = buf[i+3].real;
	buf[i+1].real = tmp_a_r + tmp_b_r;
	buf[i+1].imag = tmp_a_i - tmp_b_i;
	buf[i+3].real = tmp_a_r - tmp_b_r;
	buf[i+3].imag = tmp_a_i + tmp_b_i;
#else
	vector float buf01, buf23, temp1, temp2;

	buf01 = vec_ld((i + 0) << 3, (float*)buf);
	buf23 = vec_ld((i + 2) << 3, (float*)buf);
	/* swap re/im of buf[i+3] to realize the {0,-1} twiddle */
	buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));

	temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
	temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);

	vec_st(temp1, (i + 0) << 3, (float*)buf);
	vec_st(temp2, (i + 2) << 3, (float*)buf);
#endif
    }

    /* 3. iteration */
    for(i = 0; i < 128; i += 8) {
#if 0
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+4].real;
	tmp_b_i = buf[i+4].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+4].real = tmp_a_r - tmp_b_r;
	buf[i+4].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[1+i].real;
	tmp_a_i = buf[1+i].imag;
	tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
	tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
	buf[1+i].real = tmp_a_r + tmp_b_r;
	buf[1+i].imag = tmp_a_i + tmp_b_i;
	buf[i+5].real = tmp_a_r - tmp_b_r;
	buf[i+5].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[i+2].real;
	tmp_a_i = buf[i+2].imag;
	/* WARNING re <-> im & sign */
	tmp_b_r = buf[i+6].imag;
	tmp_b_i = - buf[i+6].real;
	buf[i+2].real = tmp_a_r + tmp_b_r;
	buf[i+2].imag = tmp_a_i + tmp_b_i;
	buf[i+6].real = tmp_a_r - tmp_b_r;
	buf[i+6].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[i+3].real;
	tmp_a_i = buf[i+3].imag;
	tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
	tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
	buf[i+3].real = tmp_a_r + tmp_b_r;
	buf[i+3].imag = tmp_a_i + tmp_b_i;
	buf[i+7].real = tmp_a_r - tmp_b_r;
	buf[i+7].imag = tmp_a_i - tmp_b_i;
#else
	vector float buf01, buf23, buf45, buf67;

	buf01 = vec_ld((i + 0) << 3, (float*)buf);
	buf23 = vec_ld((i + 2) << 3, (float*)buf);

	/* buf[i+5]/buf[i+7] are twiddled in scalar code first, then the
	   butterflies are done with whole vectors below. */
	tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
	tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
	buf[i+5].real = tmp_b_r;
	buf[i+5].imag = tmp_b_i;
	tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
	tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
	buf[i+7].real = tmp_b_r;
	buf[i+7].imag = tmp_b_i;

	/* reload buf23 after the scalar stores above */
	buf23 = vec_ld((i + 2) << 3, (float*)buf);
	buf45 = vec_ld((i + 4) << 3, (float*)buf);
	buf67 = vec_ld((i + 6) << 3, (float*)buf);
	buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));

	vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
	vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
	vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
	vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
#endif
    }

    /* 4-7. iterations */
    for (m=3; m < 7; m++) {
	two_m = (1 << m);

	two_m_plus_one = two_m<<1;

	/* two butterflies (k and k+1) per vector iteration */
	for(i = 0; i < 128; i += two_m_plus_one) {
	    for(k = 0; k < two_m; k+=2) {
#if 0
		int p = k + i;
		int q = p + two_m;
		tmp_a_r = buf[p].real;
		tmp_a_i = buf[p].imag;
		tmp_b_r =
		    buf[q].real * w[m][k].real -
		    buf[q].imag * w[m][k].imag;
		tmp_b_i =
		    buf[q].imag * w[m][k].real +
		    buf[q].real * w[m][k].imag;
		buf[p].real = tmp_a_r + tmp_b_r;
		buf[p].imag = tmp_a_i + tmp_b_i;
		buf[q].real = tmp_a_r - tmp_b_r;
		buf[q].imag = tmp_a_i - tmp_b_i;

		tmp_a_r = buf[(p + 1)].real;
		tmp_a_i = buf[(p + 1)].imag;
		tmp_b_r =
		    buf[(q + 1)].real * w[m][(k + 1)].real -
		    buf[(q + 1)].imag * w[m][(k + 1)].imag;
		tmp_b_i =
		    buf[(q + 1)].imag * w[m][(k + 1)].real +
		    buf[(q + 1)].real * w[m][(k + 1)].imag;
		buf[(p + 1)].real = tmp_a_r + tmp_b_r;
		buf[(p + 1)].imag = tmp_a_i + tmp_b_i;
		buf[(q + 1)].real = tmp_a_r - tmp_b_r;
		buf[(q + 1)].imag = tmp_a_i - tmp_b_i;
#else
		int p = k + i;
		int q = p + two_m;
		vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
		const vector float vczero = (const vector float)FOUROF(0.);
		// first compute buf[q] and buf[q+1]
		vecq = vec_ld(q << 3, (float*)buf);
		vecw = vec_ld(0, (float*)&(w[m][k]));
		temp1 = vec_madd(vecq, vecw, vczero);
		temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
		temp2 = vec_madd(temp2, vecw, vczero);
		temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
		temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
		vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
		// then butterfly with buf[p] and buf[p+1]
		vecp = vec_ld(p << 3, (float*)buf);

		temp1 = vec_add(vecp, vecq);
		temp2 = vec_sub(vecp, vecq);

		vec_st(temp1, p << 3, (float*)buf);
		vec_st(temp2, q << 3, (float*)buf);
#endif
	    }
	}
    }

    /* Post IFFT complex multiply plus IFFT complex conjugate*/
    for( i=0; i < 128; i+=4) {
	/* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
#if 0
	tmp_a_r = buf[(i + 0)].real;
	tmp_a_i = -1.0 * buf[(i + 0)].imag;
	buf[(i + 0)].real =
	    (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]);
	buf[(i + 0)].imag =
	    (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]);

	tmp_a_r = buf[(i + 1)].real;
	tmp_a_i = -1.0 * buf[(i + 1)].imag;
	buf[(i + 1)].real =
	    (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]);
	buf[(i + 1)].imag =
	    (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]);

	tmp_a_r = buf[(i + 2)].real;
	tmp_a_i = -1.0 * buf[(i + 2)].imag;
	buf[(i + 2)].real =
	    (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]);
	buf[(i + 2)].imag =
	    (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]);

	tmp_a_r = buf[(i + 3)].real;
	tmp_a_i = -1.0 * buf[(i + 3)].imag;
	buf[(i + 3)].real =
	    (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]);
	buf[(i + 3)].imag =
	    (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]);
#else
	vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
	vector float temp0022, temp1133, tempCS01;
	const vector float vczero = (const vector float)FOUROF(0.);

	bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
	bufv_2 = vec_ld((i + 2) << 3, (float*)buf);

	cosv = vec_ld(i << 2, xcos1);
	sinv = vec_ld(i << 2, xsin1);

	temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
	temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
	tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
	temp1 = vec_madd(temp0022, tempCS01, vczero);
	tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
	temp2 = vec_madd(temp1133, tempCS01, vczero);
	bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);

	vec_st(bufv_0, (i + 0) << 3, (float*)buf);

	/* idem with bufv_2 and high-order cosv/sinv */

	temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
	temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
	tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
	temp1 = vec_madd(temp0022, tempCS01, vczero);
	tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
	temp2 = vec_madd(temp1133, tempCS01, vczero);
	bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);

	vec_st(bufv_2, (i + 2) << 3, (float*)buf);
#endif
    }

    data_ptr = data;
    delay_ptr = delay;
    window_ptr = imdct_window;

    /* Window and convert to real valued signal */
    for(i=0; i< 64; i++) {
	*data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
	*data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
    }

    for(i=0; i< 64; i++) {
	*data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
	*data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
    }

    /* The trailing edge of the window goes into the delay line */
    delay_ptr = delay;

    for(i=0; i< 64; i++) {
	*delay_ptr++ = -buf[64+i].real * *--window_ptr;
	*delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
    }

    for(i=0; i<64; i++) {
	*delay_ptr++ = buf[i].imag * *--window_ptr;
	*delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
    }
}
745 #endif
748 // Stuff below this line is borrowed from libac3
749 #include "srfftp.h"
750 #ifdef ARCH_X86
751 #ifndef HAVE_3DNOW
752 #define HAVE_3DNOW 1
753 #endif
754 #include "srfftp_3dnow.h"
/* Constants for the 3DNow! code in srfftp_3dnow.h / imdct_3dnow.h.
   The two masks presumably serve as per-lane sign flips (0x80000000 is the
   IEEE-754 float sign bit) — usage lives in the included headers, confirm
   there.  HSQRT2 is sqrt(2)/2 in both lanes. */
const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };

/* imdct_3dnow.h is deliberately included twice: once with HAVE_3DNOWEX
   undefined and once defined, instantiating both the plain 3DNow! and the
   3DNow!Ex variants of the routines it declares. */
#undef HAVE_3DNOWEX
#include "imdct_3dnow.h"
#define HAVE_3DNOWEX
#include "imdct_3dnow.h"
765 void
766 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
768 /* int i,k;
769 int p,q;*/
770 int m;
771 int two_m;
772 int two_m_plus_one;
774 /* sample_t tmp_a_i;
775 sample_t tmp_a_r;
776 sample_t tmp_b_i;
777 sample_t tmp_b_r;*/
779 sample_t *data_ptr;
780 sample_t *delay_ptr;
781 sample_t *window_ptr;
783 /* 512 IMDCT with source and dest data in 'data' */
784 /* see the c version (dct_do_512()), its allmost identical, just in C */
786 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
787 /* Bit reversed shuffling */
788 asm volatile(
789 "xorl %%esi, %%esi \n\t"
790 "leal "MANGLE(bit_reverse_512)", %%eax \n\t"
791 "movl $1008, %%edi \n\t"
792 "pushl %%ebp \n\t" //use ebp without telling gcc
793 ".balign 16 \n\t"
794 "1: \n\t"
795 "movlps (%0, %%esi), %%xmm0 \n\t" // XXXI
796 "movhps 8(%0, %%edi), %%xmm0 \n\t" // RXXI
797 "movlps 8(%0, %%esi), %%xmm1 \n\t" // XXXi
798 "movhps (%0, %%edi), %%xmm1 \n\t" // rXXi
799 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR
800 "movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t"
801 "mulps %%xmm0, %%xmm2 \n\t"
802 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI
803 "mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
804 "subps %%xmm0, %%xmm2 \n\t"
805 "movzbl (%%eax), %%edx \n\t"
806 "movzbl 1(%%eax), %%ebp \n\t"
807 "movlps %%xmm2, (%1, %%edx,8) \n\t"
808 "movhps %%xmm2, (%1, %%ebp,8) \n\t"
809 "addl $16, %%esi \n\t"
810 "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap
811 "subl $16, %%edi \n\t"
812 " jnc 1b \n\t"
813 "popl %%ebp \n\t"//no we didnt touch ebp *g*
814 :: "b" (data), "c" (buf)
815 : "%esi", "%edi", "%eax", "%edx"
819 /* FFT Merge */
820 /* unoptimized variant
821 for (m=1; m < 7; m++) {
822 if(m)
823 two_m = (1 << m);
824 else
825 two_m = 1;
827 two_m_plus_one = (1 << (m+1));
829 for(i = 0; i < 128; i += two_m_plus_one) {
830 for(k = 0; k < two_m; k++) {
831 p = k + i;
832 q = p + two_m;
833 tmp_a_r = buf[p].real;
834 tmp_a_i = buf[p].imag;
835 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
836 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
837 buf[p].real = tmp_a_r + tmp_b_r;
838 buf[p].imag = tmp_a_i + tmp_b_i;
839 buf[q].real = tmp_a_r - tmp_b_r;
840 buf[q].imag = tmp_a_i - tmp_b_i;
846 /* 1. iteration */
847 // Note w[0][0]={1,0}
848 asm volatile(
849 "xorps %%xmm1, %%xmm1 \n\t"
850 "xorps %%xmm2, %%xmm2 \n\t"
851 "movl %0, %%esi \n\t"
852 ".balign 16 \n\t"
853 "1: \n\t"
854 "movlps (%%esi), %%xmm0 \n\t" //buf[p]
855 "movlps 8(%%esi), %%xmm1\n\t" //buf[q]
856 "movhps (%%esi), %%xmm0 \n\t" //buf[p]
857 "movhps 8(%%esi), %%xmm2\n\t" //buf[q]
858 "addps %%xmm1, %%xmm0 \n\t"
859 "subps %%xmm2, %%xmm0 \n\t"
860 "movaps %%xmm0, (%%esi) \n\t"
861 "addl $16, %%esi \n\t"
862 "cmpl %1, %%esi \n\t"
863 " jb 1b \n\t"
864 :: "g" (buf), "r" (buf + 128)
865 : "%esi"
868 /* 2. iteration */
869 // Note w[1]={{1,0}, {0,-1}}
870 asm volatile(
871 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
872 "movl %0, %%esi \n\t"
873 ".balign 16 \n\t"
874 "1: \n\t"
875 "movaps 16(%%esi), %%xmm2 \n\t" //r2,i2,r3,i3
876 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3
877 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3
878 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1
879 "movaps (%%esi), %%xmm1 \n\t" //r0,i0,r1,i1
880 "addps %%xmm2, %%xmm0 \n\t"
881 "subps %%xmm2, %%xmm1 \n\t"
882 "movaps %%xmm0, (%%esi) \n\t"
883 "movaps %%xmm1, 16(%%esi) \n\t"
884 "addl $32, %%esi \n\t"
885 "cmpl %1, %%esi \n\t"
886 " jb 1b \n\t"
887 :: "g" (buf), "r" (buf + 128)
888 : "%esi"
891 /* 3. iteration */
893 Note sseW2+0={1,1,sqrt(2),sqrt(2))
894 Note sseW2+16={0,0,sqrt(2),-sqrt(2))
895 Note sseW2+32={0,0,-sqrt(2),-sqrt(2))
896 Note sseW2+48={1,-1,sqrt(2),-sqrt(2))
898 asm volatile(
899 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
900 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
901 "xorps %%xmm5, %%xmm5 \n\t"
902 "xorps %%xmm2, %%xmm2 \n\t"
903 "movl %0, %%esi \n\t"
904 ".balign 16 \n\t"
905 "1: \n\t"
906 "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5
907 "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7
908 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
909 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
910 "mulps %%xmm2, %%xmm4 \n\t"
911 "mulps %%xmm3, %%xmm5 \n\t"
912 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5
913 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7
914 "mulps %%xmm6, %%xmm3 \n\t"
915 "mulps %%xmm7, %%xmm2 \n\t"
916 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1
917 "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3
918 "addps %%xmm4, %%xmm2 \n\t"
919 "addps %%xmm5, %%xmm3 \n\t"
920 "movaps %%xmm2, %%xmm4 \n\t"
921 "movaps %%xmm3, %%xmm5 \n\t"
922 "addps %%xmm0, %%xmm2 \n\t"
923 "addps %%xmm1, %%xmm3 \n\t"
924 "subps %%xmm4, %%xmm0 \n\t"
925 "subps %%xmm5, %%xmm1 \n\t"
926 "movaps %%xmm2, (%%esi) \n\t"
927 "movaps %%xmm3, 16(%%esi) \n\t"
928 "movaps %%xmm0, 32(%%esi) \n\t"
929 "movaps %%xmm1, 48(%%esi) \n\t"
930 "addl $64, %%esi \n\t"
931 "cmpl %1, %%esi \n\t"
932 " jb 1b \n\t"
933 :: "g" (buf), "r" (buf + 128)
934 : "%esi"
937 /* 4-7. iterations */
938 for (m=3; m < 7; m++) {
939 two_m = (1 << m);
940 two_m_plus_one = two_m<<1;
941 asm volatile(
942 "movl %0, %%esi \n\t"
943 ".balign 16 \n\t"
944 "1: \n\t"
945 "xorl %%edi, %%edi \n\t" // k
946 "leal (%%esi, %3), %%edx \n\t"
947 "2: \n\t"
948 "movaps (%%edx, %%edi), %%xmm1 \n\t"
949 "movaps (%4, %%edi, 2), %%xmm2 \n\t"
950 "mulps %%xmm1, %%xmm2 \n\t"
951 "shufps $0xB1, %%xmm1, %%xmm1 \n\t"
952 "mulps 16(%4, %%edi, 2), %%xmm1 \n\t"
953 "movaps (%%esi, %%edi), %%xmm0 \n\t"
954 "addps %%xmm2, %%xmm1 \n\t"
955 "movaps %%xmm1, %%xmm2 \n\t"
956 "addps %%xmm0, %%xmm1 \n\t"
957 "subps %%xmm2, %%xmm0 \n\t"
958 "movaps %%xmm1, (%%esi, %%edi) \n\t"
959 "movaps %%xmm0, (%%edx, %%edi) \n\t"
960 "addl $16, %%edi \n\t"
961 "cmpl %3, %%edi \n\t" //FIXME (opt) count against 0
962 " jb 2b \n\t"
963 "addl %2, %%esi \n\t"
964 "cmpl %1, %%esi \n\t"
965 " jb 1b \n\t"
966 :: "g" (buf), "m" (buf+128), "m" (two_m_plus_one<<3), "r" (two_m<<3),
967 "r" (sseW[m])
968 : "%esi", "%edi", "%edx"
972 /* Post IFFT complex multiply plus IFFT complex conjugate*/
973 asm volatile(
974 "movl $-1024, %%esi \n\t"
975 ".balign 16 \n\t"
976 "1: \n\t"
977 "movaps (%0, %%esi), %%xmm0 \n\t"
978 "movaps (%0, %%esi), %%xmm1 \n\t"
979 "shufps $0xB1, %%xmm0, %%xmm0 \n\t"
980 "mulps 1024+"MANGLE(sseSinCos1c)"(%%esi), %%xmm1\n\t"
981 "mulps 1024+"MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
982 "addps %%xmm1, %%xmm0 \n\t"
983 "movaps %%xmm0, (%0, %%esi) \n\t"
984 "addl $16, %%esi \n\t"
985 " jnz 1b \n\t"
986 :: "r" (buf+128)
987 : "%esi"
991 data_ptr = data;
992 delay_ptr = delay;
993 window_ptr = imdct_window;
995 /* Window and convert to real valued signal */
996 asm volatile(
997 "xorl %%edi, %%edi \n\t" // 0
998 "xorl %%esi, %%esi \n\t" // 0
999 "movss %3, %%xmm2 \n\t" // bias
1000 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
1001 ".balign 16 \n\t"
1002 "1: \n\t"
1003 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ?
1004 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ?
1005 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ?
1006 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ?
1007 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1008 "mulps "MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1009 "addps (%2, %%esi), %%xmm0 \n\t"
1010 "addps %%xmm2, %%xmm0 \n\t"
1011 "movaps %%xmm0, (%1, %%esi) \n\t"
1012 "addl $16, %%esi \n\t"
1013 "subl $16, %%edi \n\t"
1014 "cmpl $512, %%esi \n\t"
1015 " jb 1b \n\t"
1016 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1017 : "%esi", "%edi"
1019 data_ptr+=128;
1020 delay_ptr+=128;
1021 // window_ptr+=128;
1023 asm volatile(
1024 "movl $1024, %%edi \n\t" // 512
1025 "xorl %%esi, %%esi \n\t" // 0
1026 "movss %3, %%xmm2 \n\t" // bias
1027 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
1028 ".balign 16 \n\t"
1029 "1: \n\t"
1030 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A
1031 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C
1032 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C
1033 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A
1034 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1035 "mulps 512+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1036 "addps (%2, %%esi), %%xmm0 \n\t"
1037 "addps %%xmm2, %%xmm0 \n\t"
1038 "movaps %%xmm0, (%1, %%esi) \n\t"
1039 "addl $16, %%esi \n\t"
1040 "subl $16, %%edi \n\t"
1041 "cmpl $512, %%esi \n\t"
1042 " jb 1b \n\t"
1043 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1044 : "%esi", "%edi"
1046 data_ptr+=128;
1047 // window_ptr+=128;
1049 /* The trailing edge of the window goes into the delay line */
1050 delay_ptr = delay;
1052 asm volatile(
1053 "xorl %%edi, %%edi \n\t" // 0
1054 "xorl %%esi, %%esi \n\t" // 0
1055 ".balign 16 \n\t"
1056 "1: \n\t"
1057 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A
1058 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C
1059 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C
1060 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A
1061 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1062 "mulps 1024+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1063 "movaps %%xmm0, (%1, %%esi) \n\t"
1064 "addl $16, %%esi \n\t"
1065 "subl $16, %%edi \n\t"
1066 "cmpl $512, %%esi \n\t"
1067 " jb 1b \n\t"
1068 :: "r" (buf+64), "r" (delay_ptr)
1069 : "%esi", "%edi"
1071 delay_ptr+=128;
1072 // window_ptr-=128;
1074 asm volatile(
1075 "movl $1024, %%edi \n\t" // 1024
1076 "xorl %%esi, %%esi \n\t" // 0
1077 ".balign 16 \n\t"
1078 "1: \n\t"
1079 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ?
1080 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ?
1081 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ?
1082 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ?
1083 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1084 "mulps 1536+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1085 "movaps %%xmm0, (%1, %%esi) \n\t"
1086 "addl $16, %%esi \n\t"
1087 "subl $16, %%edi \n\t"
1088 "cmpl $512, %%esi \n\t"
1089 " jb 1b \n\t"
1090 :: "r" (buf), "r" (delay_ptr)
1091 : "%esi", "%edi"
1094 #endif //arch_x86
1096 void
1097 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias)
1099 int i,k;
1100 int p,q;
1101 int m;
1102 int two_m;
1103 int two_m_plus_one;
1105 sample_t tmp_a_i;
1106 sample_t tmp_a_r;
1107 sample_t tmp_b_i;
1108 sample_t tmp_b_r;
1110 sample_t *data_ptr;
1111 sample_t *delay_ptr;
1112 sample_t *window_ptr;
1114 complex_t *buf_1, *buf_2;
1116 buf_1 = &buf[0];
1117 buf_2 = &buf[64];
1119 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
1120 for(k=0; k<64; k++) {
1121 /* X1[k] = X[2*k] */
1122 /* X2[k] = X[2*k+1] */
1124 p = 2 * (128-2*k-1);
1125 q = 2 * (2 * k);
1127 /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
1128 buf_1[k].real = data[p] * xcos2[k] - data[q] * xsin2[k];
1129 buf_1[k].imag = -1.0f * (data[q] * xcos2[k] + data[p] * xsin2[k]);
1130 /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
1131 buf_2[k].real = data[p + 1] * xcos2[k] - data[q + 1] * xsin2[k];
1132 buf_2[k].imag = -1.0f * ( data[q + 1] * xcos2[k] + data[p + 1] * xsin2[k]);
1135 /* IFFT Bit reversed shuffling */
1136 for(i=0; i<64; i++) {
1137 k = bit_reverse_256[i];
1138 if (k < i) {
1139 swap_cmplx(&buf_1[i],&buf_1[k]);
1140 swap_cmplx(&buf_2[i],&buf_2[k]);
1144 /* FFT Merge */
1145 for (m=0; m < 6; m++) {
1146 two_m = (1 << m);
1147 two_m_plus_one = (1 << (m+1));
1149 /* FIXME */
1150 if(m)
1151 two_m = (1 << m);
1152 else
1153 two_m = 1;
1155 for(k = 0; k < two_m; k++) {
1156 for(i = 0; i < 64; i += two_m_plus_one) {
1157 p = k + i;
1158 q = p + two_m;
1159 /* Do block 1 */
1160 tmp_a_r = buf_1[p].real;
1161 tmp_a_i = buf_1[p].imag;
1162 tmp_b_r = buf_1[q].real * w[m][k].real - buf_1[q].imag * w[m][k].imag;
1163 tmp_b_i = buf_1[q].imag * w[m][k].real + buf_1[q].real * w[m][k].imag;
1164 buf_1[p].real = tmp_a_r + tmp_b_r;
1165 buf_1[p].imag = tmp_a_i + tmp_b_i;
1166 buf_1[q].real = tmp_a_r - tmp_b_r;
1167 buf_1[q].imag = tmp_a_i - tmp_b_i;
1169 /* Do block 2 */
1170 tmp_a_r = buf_2[p].real;
1171 tmp_a_i = buf_2[p].imag;
1172 tmp_b_r = buf_2[q].real * w[m][k].real - buf_2[q].imag * w[m][k].imag;
1173 tmp_b_i = buf_2[q].imag * w[m][k].real + buf_2[q].real * w[m][k].imag;
1174 buf_2[p].real = tmp_a_r + tmp_b_r;
1175 buf_2[p].imag = tmp_a_i + tmp_b_i;
1176 buf_2[q].real = tmp_a_r - tmp_b_r;
1177 buf_2[q].imag = tmp_a_i - tmp_b_i;
1182 /* Post IFFT complex multiply */
1183 for( i=0; i < 64; i++) {
1184 /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
1185 tmp_a_r = buf_1[i].real;
1186 tmp_a_i = -buf_1[i].imag;
1187 buf_1[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
1188 buf_1[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
1189 /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */
1190 tmp_a_r = buf_2[i].real;
1191 tmp_a_i = -buf_2[i].imag;
1192 buf_2[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
1193 buf_2[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
1196 data_ptr = data;
1197 delay_ptr = delay;
1198 window_ptr = imdct_window;
1200 /* Window and convert to real valued signal */
1201 for(i=0; i< 64; i++) {
1202 *data_ptr++ = -buf_1[i].imag * *window_ptr++ + *delay_ptr++ + bias;
1203 *data_ptr++ = buf_1[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
1206 for(i=0; i< 64; i++) {
1207 *data_ptr++ = -buf_1[i].real * *window_ptr++ + *delay_ptr++ + bias;
1208 *data_ptr++ = buf_1[64-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
1211 delay_ptr = delay;
1213 for(i=0; i< 64; i++) {
1214 *delay_ptr++ = -buf_2[i].real * *--window_ptr;
1215 *delay_ptr++ = buf_2[64-i-1].imag * *--window_ptr;
1218 for(i=0; i< 64; i++) {
1219 *delay_ptr++ = buf_2[i].imag * *--window_ptr;
1220 *delay_ptr++ = -buf_2[64-i-1].real * *--window_ptr;
/*
 * One-time initialization: build the twiddle-factor and window tables
 * and select the fastest available IMDCT implementation for the
 * capabilities reported in mm_accel.
 */
void imdct_init (uint32_t mm_accel)
{
#ifdef LIBA52_MLIB
    /* mediaLib, when compiled in and enabled, takes priority */
    if (mm_accel & MM_ACCEL_MLIB) {
        fprintf (stderr, "Using mlib for IMDCT transform\n");
        imdct_512 = imdct_do_512_mlib;
        imdct_256 = imdct_do_256_mlib;
    } else
#endif
    {
        int i, j, k;

        /* Twiddle factors to turn IFFT into IMDCT */
        for (i = 0; i < 128; i++) {
            xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
            xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
        }
#ifdef ARCH_X86
        /* pairwise duplicated / sign-flipped twiddles in SSE vector layout */
        for (i = 0; i < 128; i++) {
            sseSinCos1c[2*i+0]= xcos1[i];
            sseSinCos1c[2*i+1]= -xcos1[i];
            sseSinCos1d[2*i+0]= xsin1[i];
            sseSinCos1d[2*i+1]= xsin1[i];
        }
#endif

        /* More twiddle factors to turn IFFT into IMDCT */
        for (i = 0; i < 64; i++) {
            xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1));
            xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1));
        }

        /* FFT butterfly twiddles: w[i][k] = e^{-j*pi*k/2^i} */
        for (i = 0; i < 7; i++) {
            j = 1 << i;
            for (k = 0; k < j; k++) {
                w[i][k].real = cos (-M_PI * k / j);
                w[i][k].imag = sin (-M_PI * k / j);
            }
        }
#ifdef ARCH_X86
        /* SSE twiddle layout: two reals duplicated, then the matching
         * imaginaries with alternating sign, per pair of k values */
        for (i = 1; i < 7; i++) {
            j = 1 << i;
            for (k = 0; k < j; k+=2) {
                sseW[i][4*k + 0] = w[i][k+0].real;
                sseW[i][4*k + 1] = w[i][k+0].real;
                sseW[i][4*k + 2] = w[i][k+1].real;
                sseW[i][4*k + 3] = w[i][k+1].real;

                sseW[i][4*k + 4] = -w[i][k+0].imag;
                sseW[i][4*k + 5] = w[i][k+0].imag;
                sseW[i][4*k + 6] = -w[i][k+1].imag;
                sseW[i][4*k + 7] = w[i][k+1].imag;

                //we multiply more or less uninitalized numbers so we need to use exactly 0.0
                if(k==0)
                {
//                  sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
                    sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
                }

                if(2*k == j)
                {
                    sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
//                  sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
                }
            }
        }

        /* pre-negated window copies matching the SSE kernels' sign usage */
        for(i=0; i<128; i++)
        {
            sseWindow[2*i+0]= -imdct_window[2*i+0];
            sseWindow[2*i+1]= imdct_window[2*i+1];
        }

        for(i=0; i<64; i++)
        {
            sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1];
            sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0];
            sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1];
            sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
        }
#endif // arch_x86

        /* default implementation; overridden below if an accelerated
         * variant is compiled in and enabled at runtime */
        imdct_512 = imdct_do_512;
#ifdef ARCH_X86
        if(mm_accel & MM_ACCEL_X86_SSE)
        {
            fprintf (stderr, "Using SSE optimized IMDCT transform\n");
            imdct_512 = imdct_do_512_sse;
        }
        else
        if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
        {
            fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
            imdct_512 = imdct_do_512_3dnowex;
        }
        else
        if(mm_accel & MM_ACCEL_X86_3DNOW)
        {
            fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
            imdct_512 = imdct_do_512_3dnow;
        }
        else
#endif // arch_x86
#ifdef HAVE_ALTIVEC
        if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
        {
            fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
            imdct_512 = imdct_do_512_altivec;
        }
        else
#endif
        fprintf (stderr, "No accelerated IMDCT transform found\n");
        /* no accelerated 256-point variant exists on these paths */
        imdct_256 = imdct_do_256;
    }
}
/*
 * Merge step of the split-radix FFT: combine four k-point sub-blocks
 * starting at x, x+2k, x+4k, x+6k into one larger transform.
 * wTB/wB point at the already-computed sub-results; d and d_3 are the
 * twiddle tables for w^1 and w^3.
 * NOTE(review): TRANSZERO/TRANS are macros defined elsewhere in this
 * file; they presumably use the register scratch floats declared
 * below -- verify against the macro definitions.
 */
static void fft_asmb(int k, complex_t *x, complex_t *wTB,
                     const complex_t *d, const complex_t *d_3)
{
    register complex_t *x2k, *x3k, *x4k, *wB;
    register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;

    x2k = x + 2 * k;
    x3k = x2k + 2 * k;
    x4k = x3k + 2 * k;
    wB = wTB + 2 * k;

    /* indices 0 and 1 have trivial/fixed twiddles and are special-cased */
    TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
    TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);

    /* remaining butterflies, two per iteration; all eight pointers are
     * bumped together so offsets 2 and 3 stay valid each pass */
    --k;
    for(;;) {
        TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
        TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
        if (!--k) break;
        x += 2;
        x2k += 2;
        x3k += 2;
        x4k += 2;
        d += 2;
        d_3 += 2;
        wTB += 2;
        wB += 2;
    }
}
1373 static void fft_asmb16(complex_t *x, complex_t *wTB)
1375 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
1376 int k = 2;
1378 /* transform x[0], x[8], x[4], x[12] */
1379 TRANSZERO(x[0],x[4],x[8],x[12]);
1381 /* transform x[1], x[9], x[5], x[13] */
1382 TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]);
1384 /* transform x[2], x[10], x[6], x[14] */
1385 TRANSHALF_16(x[2],x[6],x[10],x[14]);
1387 /* transform x[3], x[11], x[7], x[15] */
1388 TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]);
1392 static void fft_4(complex_t *x)
1394 /* delta_p = 1 here */
1395 /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
1398 register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
1400 yt_r = x[0].real;
1401 yb_r = yt_r - x[2].real;
1402 yt_r += x[2].real;
1404 u_r = x[1].real;
1405 vi_i = x[3].real - u_r;
1406 u_r += x[3].real;
1408 u_i = x[1].imag;
1409 vi_r = u_i - x[3].imag;
1410 u_i += x[3].imag;
1412 yt_i = yt_r;
1413 yt_i += u_r;
1414 x[0].real = yt_i;
1415 yt_r -= u_r;
1416 x[2].real = yt_r;
1417 yt_i = yb_r;
1418 yt_i += vi_r;
1419 x[1].real = yt_i;
1420 yb_r -= vi_r;
1421 x[3].real = yb_r;
1423 yt_i = x[0].imag;
1424 yb_i = yt_i - x[2].imag;
1425 yt_i += x[2].imag;
1427 yt_r = yt_i;
1428 yt_r += u_i;
1429 x[0].imag = yt_r;
1430 yt_i -= u_i;
1431 x[2].imag = yt_i;
1432 yt_r = yb_i;
1433 yt_r += vi_i;
1434 x[1].imag = yt_r;
1435 yb_i -= vi_i;
1436 x[3].imag = yb_i;
/*
 * In-place 8-point FFT built on fft_4 plus hand-scheduled odd-index
 * butterflies.  Temporaries are aggressively reused, so statement
 * order is significant throughout.
 */
static void fft_8(complex_t *x)
{
    /* delta_p = diag{1, sqrt(i)} here */
    /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
     */
    register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;

    /* save the odd inputs x[1], x[3] before the even half is compacted */
    wT1_r = x[1].real;
    wT1_i = x[1].imag;
    wB1_r = x[3].real;
    wB1_i = x[3].imag;

    /* gather the even inputs into x[0..3] and transform them */
    x[1] = x[2];
    x[2] = x[4];
    x[3] = x[6];
    fft_4(&x[0]);

    /* x[0] x[4] */
    wT2_r = x[5].real;
    wT2_r += x[7].real;
    wT2_r += wT1_r;
    wT2_r += wB1_r;
    wT2_i = wT2_r;
    wT2_r += x[0].real;
    wT2_i = x[0].real - wT2_i;
    x[0].real = wT2_r;
    x[4].real = wT2_i;

    wT2_i = x[5].imag;
    wT2_i += x[7].imag;
    wT2_i += wT1_i;
    wT2_i += wB1_i;
    wT2_r = wT2_i;
    wT2_r += x[0].imag;
    wT2_i = x[0].imag - wT2_i;
    x[0].imag = wT2_r;
    x[4].imag = wT2_i;

    /* x[2] x[6] */
    wT2_r = x[5].imag;
    wT2_r -= x[7].imag;
    wT2_r += wT1_i;
    wT2_r -= wB1_i;
    wT2_i = wT2_r;
    wT2_r += x[2].real;
    wT2_i = x[2].real - wT2_i;
    x[2].real = wT2_r;
    x[6].real = wT2_i;

    wT2_i = x[5].real;
    wT2_i -= x[7].real;
    wT2_i += wT1_r;
    wT2_i -= wB1_r;
    wT2_r = wT2_i;
    wT2_r += x[2].imag;
    wT2_i = x[2].imag - wT2_i;
    x[2].imag = wT2_i;
    x[6].imag = wT2_r;

    /* x[1] x[5] */
    wT2_r = wT1_r;
    wT2_r += wB1_i;
    wT2_r -= x[5].real;
    wT2_r -= x[7].imag;
    wT2_i = wT1_i;
    wT2_i -= wB1_r;
    wT2_i -= x[5].imag;
    wT2_i += x[7].real;

    /* rotate by sqrt(i) (scale by HSQRT2 after sum/difference) */
    wB2_r = wT2_r;
    wB2_r += wT2_i;
    wT2_i -= wT2_r;
    wB2_r *= HSQRT2;
    wT2_i *= HSQRT2;
    wT2_r = wB2_r;
    wB2_r += x[1].real;
    wT2_r = x[1].real - wT2_r;

    /* wB2_i saves x[5].real before it is overwritten; the x[3]/x[7]
     * section below still needs the old value */
    wB2_i = x[5].real;
    x[1].real = wB2_r;
    x[5].real = wT2_r;

    wT2_r = wT2_i;
    wT2_r += x[1].imag;
    wT2_i = x[1].imag - wT2_i;
    /* likewise wB2_r now saves the old x[5].imag */
    wB2_r = x[5].imag;
    x[1].imag = wT2_r;
    x[5].imag = wT2_i;

    /* x[3] x[7] */
    wT1_r -= wB1_i;
    wT1_i += wB1_r;
    wB1_r = wB2_i - x[7].imag;
    wB1_i = wB2_r + x[7].real;
    wT1_r -= wB1_r;
    wT1_i -= wB1_i;
    wB1_r = wT1_r + wT1_i;
    wB1_r *= HSQRT2;
    wT1_i -= wT1_r;
    wT1_i *= HSQRT2;
    wB2_r = x[3].real;
    wB2_i = wB2_r + wT1_i;
    wB2_r -= wT1_i;
    x[3].real = wB2_i;
    x[7].real = wB2_r;
    wB2_i = x[3].imag;
    wB2_r = wB2_i + wB1_r;
    wB2_i -= wB1_r;
    x[3].imag = wB2_i;
    x[7].imag = wB2_r;
}
/*
 * Full in-place 128-point complex FFT over a[0..127], built bottom-up
 * from 4- and 8-point kernels combined by fft_asmb16() (16-point merge)
 * and fft_asmb() (general split-radix merge).
 */
static void fft_128p(complex_t *a)
{
    /* 16-point transform on a[0..15] */
    fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]);
    fft_asmb16(&a[0], &a[8]);

    /* extend to 32 points: a[0..31] */
    fft_8(&a[16]), fft_8(&a[24]);
    fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);

    /* 16-point transform on a[32..47] */
    fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]);
    fft_asmb16(&a[32], &a[40]);

    /* 16-point transform on a[48..63] */
    fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]);
    fft_asmb16(&a[48], &a[56]);

    /* merge to 64 points: a[0..63] */
    fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);

    fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]);

    /* fft_16(&a[64]); */
    fft_asmb16(&a[64], &a[72]);

    fft_8(&a[80]); fft_8(&a[88]);

    /* fft_32(&a[64]); */
    fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]);

    fft_8(&a[96]); fft_4(&a[104]), fft_4(&a[108]);

    /* fft_16(&a[96]); */
    fft_asmb16(&a[96], &a[104]);

    fft_8(&a[112]), fft_8(&a[120]);

    /* fft_32(&a[96]); */
    fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]);

    /* fft_128(&a[0]); */
    fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
}