tremor uses integer types
[mplayer/glamo.git] / liba52 / imdct.c
blobce8cf247438d9ce7f77abecc96a54e9d627e1445
1 /*
2 * imdct.c
3 * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
6 * This file is part of a52dec, a free ATSC A-52 stream decoder.
7 * See http://liba52.sourceforge.net/ for updates.
9 * Modified for use with MPlayer, changes contained in liba52_changes.diff.
10 * detailed CVS changelog at http://www.mplayerhq.hu/cgi-bin/cvsweb.cgi/main/
11 * $Id$
13 * a52dec is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
18 * a52dec is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
28 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
29 * michael did port them from libac3 (untested, perhaps totally broken)
30 * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
33 #include "config.h"
35 #include <math.h>
36 #include <stdio.h>
37 #ifndef M_PI
38 #define M_PI 3.1415926535897932384626433832795029
39 #endif
40 #include <inttypes.h>
42 #include "a52.h"
43 #include "a52_internal.h"
44 #include "mm_accel.h"
45 #include "mangle.h"
47 #ifdef RUNTIME_CPUDETECT
48 #undef HAVE_3DNOWEX
49 #endif
51 #define USE_AC3_C
/* Function pointers selected at init time so the decoder can dispatch to
   the fastest available IMDCT implementation (C, SSE, 3DNow!, AltiVec, ...).
   imdct_256 handles the short-transform case, imdct_512 the long one. */
void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias);
void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias);

/* Complex value using the decoder's sample type for both components. */
typedef struct complex_s {
    sample_t real;
    sample_t imag;
} complex_t;

/* 128-point complex FFT; static, so defined later in this file. */
static void fft_128p(complex_t *a);
/* Input permutation for the 128-point FFT (USE_AC3_C path): maps the
   loop index to the coefficient order expected by fft_128p(), replacing
   the plain bit-reversal used by the inline radix-2 FFT. */
static const int pm128[128] attribute_used __attribute__((aligned(16))) =
{
    0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
    4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
    2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
    6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
    1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
    5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
    3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
    7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
};
/* 128 point bit-reverse LUT (7-bit reversal), used to reorder the FFT
   input for the inline radix-2 FFT in the 512-sample transform. */
static uint8_t attribute_used bit_reverse_512[] = {
    0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
    0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
    0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
    0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
    0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
    0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
    0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
    0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
    0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
    0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
    0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
    0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
    0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
    0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
    0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
    0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};

/* 64 point bit-reverse LUT (6-bit reversal) for the 256-sample transform. */
static uint8_t bit_reverse_256[] = {
    0x00, 0x20, 0x10, 0x30, 0x08, 0x28, 0x18, 0x38,
    0x04, 0x24, 0x14, 0x34, 0x0c, 0x2c, 0x1c, 0x3c,
    0x02, 0x22, 0x12, 0x32, 0x0a, 0x2a, 0x1a, 0x3a,
    0x06, 0x26, 0x16, 0x36, 0x0e, 0x2e, 0x1e, 0x3e,
    0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39,
    0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d,
    0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b,
    0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
#if defined(ARCH_X86) || defined(ARCH_X86_64)
// NOTE: SSE needs 16byte alignment or it will segfault
//
/* Work buffer for the 128-point complex FFT stage. */
static complex_t __attribute__((aligned(16))) buf[128];
/* Interleaved sin/cos twiddle tables for the SSE pre/post multiply.
   NOTE(review): presumably filled by an init routine outside this chunk
   — confirm before relying on their contents. */
static float __attribute__((aligned(16))) sseSinCos1c[256];
static float __attribute__((aligned(16))) sseSinCos1d[256];
/* Sign pattern {1,1,1,-1} used by the 2nd FFT iteration in the SSE path. */
static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
//static float __attribute__((aligned(16))) sseW0[4];
/* Per-stage twiddle tables for FFT merge stages 1..6 (SSE layout). */
static float __attribute__((aligned(16))) sseW1[8];
static float __attribute__((aligned(16))) sseW2[16];
static float __attribute__((aligned(16))) sseW3[32];
static float __attribute__((aligned(16))) sseW4[64];
static float __attribute__((aligned(16))) sseW5[128];
static float __attribute__((aligned(16))) sseW6[256];
/* Indexed by stage m; stage 0 needs no twiddles, hence the NULL slot. */
static float __attribute__((aligned(16))) *sseW[7]=
    {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
/* Window coefficients rearranged for the SSE windowing loop. */
static float __attribute__((aligned(16))) sseWindow[512];
#else
static complex_t __attribute__((aligned(16))) buf[128];
#endif
/* Twiddle factor LUT: w[m] holds the 2^m twiddles for FFT merge stage m.
   NOTE(review): these arrays are written by an init routine not visible in
   this chunk — confirm they are populated before the imdct_do_* calls. */
static complex_t __attribute__((aligned(16))) w_1[1];
static complex_t __attribute__((aligned(16))) w_2[2];
static complex_t __attribute__((aligned(16))) w_4[4];
static complex_t __attribute__((aligned(16))) w_8[8];
static complex_t __attribute__((aligned(16))) w_16[16];
static complex_t __attribute__((aligned(16))) w_32[32];
static complex_t __attribute__((aligned(16))) w_64[64];
static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};

/* Twiddle factors for IMDCT pre/post multiply: xcos1/xsin1 for the 512
   transform (128 entries), xcos2/xsin2 for the 256 transform (64 entries). */
static sample_t __attribute__((aligned(16))) xcos1[128];
static sample_t __attribute__((aligned(16))) xsin1[128];
static sample_t __attribute__((aligned(16))) xcos2[64];
static sample_t __attribute__((aligned(16))) xsin2[64];
/* Windowing function for Modified DCT - Thank you acroread */
/* 256 monotonically increasing coefficients in [0,1]; the windowing loops
   below walk it forward for the leading edge and backward (via --window_ptr)
   for the trailing edge that feeds the delay line. */
sample_t imdct_window[] = {
    0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
    0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
    0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
    0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
    0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
    0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
    0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
    0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
    0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
    0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
    0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
    0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
    0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
    0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
    0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
    0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
    0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
    0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
    0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
    0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
    0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
    0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
    0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
    0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
    0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
    0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
    0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
    0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
    0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
    0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
    0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
    1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 };
177 static inline void swap_cmplx(complex_t *a, complex_t *b)
179 complex_t tmp;
181 tmp = *a;
182 *a = *b;
183 *b = tmp;
188 static inline complex_t cmplx_mult(complex_t a, complex_t b)
190 complex_t ret;
192 ret.real = a.real * b.real - a.imag * b.imag;
193 ret.imag = a.real * b.imag + a.imag * b.real;
195 return ret;
/* 512-sample IMDCT (C reference implementation).
 *
 * data[]  in:  256 frequency coefficients
 *         out: 256 windowed time-domain samples, each offset by 'bias'
 * delay[] in:  second half of the previous block's windowed signal
 *         out: second half of this block's windowed signal (for the next call)
 *
 * The transform is computed through a 128-point complex IFFT:
 *   1. pre-twiddle (multiply by xcos1/xsin1) with input reordering
 *   2. 128-point IFFT: fft_128p() (USE_AC3_C) or the inline radix-2 merge
 *   3. post-twiddle and conjugation
 *   4. windowing + overlap-add with the delay buffer
 */
void
imdct_do_512(sample_t data[],sample_t delay[], sample_t bias)
{
    int i;
#ifndef USE_AC3_C
    int k;
    int p,q;
    int m;
    int two_m;
    int two_m_plus_one;

    sample_t tmp_b_i;
    sample_t tmp_b_r;
#endif
    sample_t tmp_a_i;
    sample_t tmp_a_r;

    sample_t *data_ptr;
    sample_t *delay_ptr;
    sample_t *window_ptr;

    /* 512 IMDCT with source and dest data in 'data' */

    /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
    for( i=0; i < 128; i++) {
	/* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
#ifdef USE_AC3_C
	int j= pm128[i];          /* permuted order expected by fft_128p() */
#else
	int j= bit_reverse_512[i];   /* bit-reversed order for the inline FFT */
#endif
	buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
	buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
    }

    /* FFT Merge */
    /* unoptimized variant
       for (m=1; m < 7; m++) {
	   if(m)
	       two_m = (1 << m);
	   else
	       two_m = 1;

	   two_m_plus_one = (1 << (m+1));

	   for(i = 0; i < 128; i += two_m_plus_one) {
	       for(k = 0; k < two_m; k++) {
		   p = k + i;
		   q = p + two_m;
		   tmp_a_r = buf[p].real;
		   tmp_a_i = buf[p].imag;
		   tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
		   tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
		   buf[p].real = tmp_a_r + tmp_b_r;
		   buf[p].imag = tmp_a_i + tmp_b_i;
		   buf[q].real = tmp_a_r - tmp_b_r;
		   buf[q].imag = tmp_a_i - tmp_b_i;
	       }
	   }
       }
    */
#ifdef USE_AC3_C
    fft_128p (&buf[0]);
#else

    /* 1. iteration */
    /* Stage-1 butterflies: twiddle is {1,0}, so plain add/subtract. */
    for(i = 0; i < 128; i += 2) {
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+1].real;
	tmp_b_i = buf[i+1].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+1].real = tmp_a_r - tmp_b_r;
	buf[i+1].imag = tmp_a_i - tmp_b_i;
    }

    /* 2. iteration */
    // Note w[1]={{1,0}, {0,-1}}
    for(i = 0; i < 128; i += 4) {
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+2].real;
	tmp_b_i = buf[i+2].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+2].real = tmp_a_r - tmp_b_r;
	buf[i+2].imag = tmp_a_i - tmp_b_i;
	/* {0,-1} twiddle: re/im swapped, signs folded into the butterfly */
	tmp_a_r = buf[i+1].real;
	tmp_a_i = buf[i+1].imag;
	tmp_b_r = buf[i+3].imag;
	tmp_b_i = buf[i+3].real;
	buf[i+1].real = tmp_a_r + tmp_b_r;
	buf[i+1].imag = tmp_a_i - tmp_b_i;
	buf[i+3].real = tmp_a_r - tmp_b_r;
	buf[i+3].imag = tmp_a_i + tmp_b_i;
    }

    /* 3. iteration */
    /* Stage-3 butterflies with the four w[2][k] twiddles unrolled; the
       k=1 and k=3 products use the (re±im)*c short form, k=2 is a pure
       re/im swap with negation (no multiplies needed). */
    for(i = 0; i < 128; i += 8) {
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+4].real;
	tmp_b_i = buf[i+4].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+4].real = tmp_a_r - tmp_b_r;
	buf[i+4].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[1+i].real;
	tmp_a_i = buf[1+i].imag;
	tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
	tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
	buf[1+i].real = tmp_a_r + tmp_b_r;
	buf[1+i].imag = tmp_a_i + tmp_b_i;
	buf[i+5].real = tmp_a_r - tmp_b_r;
	buf[i+5].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[i+2].real;
	tmp_a_i = buf[i+2].imag;
	tmp_b_r = buf[i+6].imag;
	tmp_b_i = - buf[i+6].real;
	buf[i+2].real = tmp_a_r + tmp_b_r;
	buf[i+2].imag = tmp_a_i + tmp_b_i;
	buf[i+6].real = tmp_a_r - tmp_b_r;
	buf[i+6].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[i+3].real;
	tmp_a_i = buf[i+3].imag;
	tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
	tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
	buf[i+3].real = tmp_a_r + tmp_b_r;
	buf[i+3].imag = tmp_a_i + tmp_b_i;
	buf[i+7].real = tmp_a_r - tmp_b_r;
	buf[i+7].imag = tmp_a_i - tmp_b_i;
    }

    /* 4-7. iterations */
    /* Remaining merge stages with full complex-multiply butterflies. */
    for (m=3; m < 7; m++) {
	two_m = (1 << m);

	two_m_plus_one = two_m<<1;

	for(i = 0; i < 128; i += two_m_plus_one) {
	    for(k = 0; k < two_m; k++) {
		int p = k + i;
		int q = p + two_m;
		tmp_a_r = buf[p].real;
		tmp_a_i = buf[p].imag;
		tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
		tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
		buf[p].real = tmp_a_r + tmp_b_r;
		buf[p].imag = tmp_a_i + tmp_b_i;
		buf[q].real = tmp_a_r - tmp_b_r;
		buf[q].imag = tmp_a_i - tmp_b_i;
	    }
	}
    }
#endif
    /* Post IFFT complex multiply plus IFFT complex conjugate*/
    for( i=0; i < 128; i++) {
	/* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
	tmp_a_r = buf[i].real;
	tmp_a_i = -1.0 * buf[i].imag;    /* conjugate before the twiddle */
	buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]);
	buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]);
    }

    data_ptr = data;
    delay_ptr = delay;
    window_ptr = imdct_window;

    /* Window and convert to real valued signal */
    /* First half: overlap-add new samples with the saved delay, add bias. */
    for(i=0; i< 64; i++) {
	*data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
	*data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
    }

    for(i=0; i< 64; i++) {
	*data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
	*data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
    }

    /* The trailing edge of the window goes into the delay line */
    /* window_ptr now points one past the window end; walk it backwards. */
    delay_ptr = delay;

    for(i=0; i< 64; i++) {
	*delay_ptr++ = -buf[64+i].real * *--window_ptr;
	*delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
    }

    for(i=0; i<64; i++) {
	*delay_ptr++ = buf[i].imag * *--window_ptr;
	*delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
    }
}
#ifdef HAVE_ALTIVEC

#ifndef SYS_DARWIN
#include <altivec.h>
#endif

// used to build registers permutation vectors (vcprm)
// the 's' are for words in the _s_econd vector
#define WORD_0 0x00,0x01,0x02,0x03
#define WORD_1 0x04,0x05,0x06,0x07
#define WORD_2 0x08,0x09,0x0a,0x0b
#define WORD_3 0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f

/* Darwin's old AltiVec dialect writes vector literals with parentheses,
   everything else uses braces — hence the twin definitions below. */
#ifdef SYS_DARWIN
#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
#else
#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
#endif

// vcprmle is used to keep the same index as in the SSE version.
// it's the same as vcprm, with the index inversed
// ('le' is Little Endian)
#define vcprmle(a,b,c,d) vcprm(d,c,b,a)

// used to build inverse/identity vectors (vcii)
// n is _n_egative, p is _p_ositive
#define FLOAT_n -1.
#define FLOAT_p 1.

#ifdef SYS_DARWIN
#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
#else
#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
#endif

/* FOUROF(a): 4-way splat literal, same Darwin/non-Darwin split as above. */
#ifdef SYS_DARWIN
#define FOUROF(a) (a)
#else
#define FOUROF(a) {a,a,a,a}
#endif
/* AltiVec version of imdct_do_512(): identical algorithm and buffer layout,
 * with the FFT butterflies and post-twiddle vectorized two complex values
 * (one 16-byte vector) at a time.  The #if 0 sections keep the scalar
 * reference code that each vector sequence replaces.
 * Interface and results match imdct_do_512(). */
void
imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
{
    int i;
    int k;
    int p,q;
    int m;
    long two_m;
    long two_m_plus_one;

    sample_t tmp_b_i;
    sample_t tmp_b_r;
    sample_t tmp_a_i;
    sample_t tmp_a_r;

    sample_t *data_ptr;
    sample_t *delay_ptr;
    sample_t *window_ptr;

    /* 512 IMDCT with source and dest data in 'data' */

    /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
    for( i=0; i < 128; i++) {
	/* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
	int j= bit_reverse_512[i];
	buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
	buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
    }

    /* 1. iteration */
    for(i = 0; i < 128; i += 2) {
#if 0
	/* scalar reference */
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+1].real;
	tmp_b_i = buf[i+1].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+1].real = tmp_a_r - tmp_b_r;
	buf[i+1].imag = tmp_a_i - tmp_b_i;
#else
	/* one vector = buf[i] and buf[i+1]; add/sub both in one madd */
	vector float temp, bufv;

	bufv = vec_ld(i << 3, (float*)buf);
	temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
	bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
	vec_st(bufv, i << 3, (float*)buf);
#endif
    }

    /* 2. iteration */
    // Note w[1]={{1,0}, {0,-1}}
    for(i = 0; i < 128; i += 4) {
#if 0
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+2].real;
	tmp_b_i = buf[i+2].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+2].real = tmp_a_r - tmp_b_r;
	buf[i+2].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[i+1].real;
	tmp_a_i = buf[i+1].imag;
	/* WARNING: im <-> re here ! */
	tmp_b_r = buf[i+3].imag;
	tmp_b_i = buf[i+3].real;
	buf[i+1].real = tmp_a_r + tmp_b_r;
	buf[i+1].imag = tmp_a_i - tmp_b_i;
	buf[i+3].real = tmp_a_r - tmp_b_r;
	buf[i+3].imag = tmp_a_i + tmp_b_i;
#else
	vector float buf01, buf23, temp1, temp2;

	buf01 = vec_ld((i + 0) << 3, (float*)buf);
	buf23 = vec_ld((i + 2) << 3, (float*)buf);
	/* swap re/im of buf[i+3] to apply the {0,-1} twiddle */
	buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));

	temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
	temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);

	vec_st(temp1, (i + 0) << 3, (float*)buf);
	vec_st(temp2, (i + 2) << 3, (float*)buf);
#endif
    }

    /* 3. iteration */
    for(i = 0; i < 128; i += 8) {
#if 0
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+4].real;
	tmp_b_i = buf[i+4].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+4].real = tmp_a_r - tmp_b_r;
	buf[i+4].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[1+i].real;
	tmp_a_i = buf[1+i].imag;
	tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
	tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
	buf[1+i].real = tmp_a_r + tmp_b_r;
	buf[1+i].imag = tmp_a_i + tmp_b_i;
	buf[i+5].real = tmp_a_r - tmp_b_r;
	buf[i+5].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[i+2].real;
	tmp_a_i = buf[i+2].imag;
	/* WARNING re <-> im & sign */
	tmp_b_r = buf[i+6].imag;
	tmp_b_i = - buf[i+6].real;
	buf[i+2].real = tmp_a_r + tmp_b_r;
	buf[i+2].imag = tmp_a_i + tmp_b_i;
	buf[i+6].real = tmp_a_r - tmp_b_r;
	buf[i+6].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[i+3].real;
	tmp_a_i = buf[i+3].imag;
	tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
	tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
	buf[i+3].real = tmp_a_r + tmp_b_r;
	buf[i+3].imag = tmp_a_i + tmp_b_i;
	buf[i+7].real = tmp_a_r - tmp_b_r;
	buf[i+7].imag = tmp_a_i - tmp_b_i;
#else
	vector float buf01, buf23, buf45, buf67;

	buf01 = vec_ld((i + 0) << 3, (float*)buf);
	buf23 = vec_ld((i + 2) << 3, (float*)buf);

	/* apply the odd twiddles (k=1, k=3) scalar, writing back to buf */
	tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
	tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
	buf[i+5].real = tmp_b_r;
	buf[i+5].imag = tmp_b_i;
	tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
	tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
	buf[i+7].real = tmp_b_r;
	buf[i+7].imag = tmp_b_i;

	/* reload after the scalar stores above */
	buf23 = vec_ld((i + 2) << 3, (float*)buf);
	buf45 = vec_ld((i + 4) << 3, (float*)buf);
	buf67 = vec_ld((i + 6) << 3, (float*)buf);
	buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));

	vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
	vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
	vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
	vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
#endif
    }

    /* 4-7. iterations */
    for (m=3; m < 7; m++) {
	two_m = (1 << m);

	two_m_plus_one = two_m<<1;

	for(i = 0; i < 128; i += two_m_plus_one) {
	    /* two butterflies (k and k+1) per pass */
	    for(k = 0; k < two_m; k+=2) {
#if 0
		int p = k + i;
		int q = p + two_m;
		tmp_a_r = buf[p].real;
		tmp_a_i = buf[p].imag;
		tmp_b_r =
		    buf[q].real * w[m][k].real -
		    buf[q].imag * w[m][k].imag;
		tmp_b_i =
		    buf[q].imag * w[m][k].real +
		    buf[q].real * w[m][k].imag;
		buf[p].real = tmp_a_r + tmp_b_r;
		buf[p].imag = tmp_a_i + tmp_b_i;
		buf[q].real = tmp_a_r - tmp_b_r;
		buf[q].imag = tmp_a_i - tmp_b_i;

		tmp_a_r = buf[(p + 1)].real;
		tmp_a_i = buf[(p + 1)].imag;
		tmp_b_r =
		    buf[(q + 1)].real * w[m][(k + 1)].real -
		    buf[(q + 1)].imag * w[m][(k + 1)].imag;
		tmp_b_i =
		    buf[(q + 1)].imag * w[m][(k + 1)].real +
		    buf[(q + 1)].real * w[m][(k + 1)].imag;
		buf[(p + 1)].real = tmp_a_r + tmp_b_r;
		buf[(p + 1)].imag = tmp_a_i + tmp_b_i;
		buf[(q + 1)].real = tmp_a_r - tmp_b_r;
		buf[(q + 1)].imag = tmp_a_i - tmp_b_i;
#else
		int p = k + i;
		int q = p + two_m;
		vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
		const vector float vczero = (const vector float)FOUROF(0.);
		// first compute buf[q] and buf[q+1]
		vecq = vec_ld(q << 3, (float*)buf);
		vecw = vec_ld(0, (float*)&(w[m][k]));
		temp1 = vec_madd(vecq, vecw, vczero);
		temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
		temp2 = vec_madd(temp2, vecw, vczero);
		temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
		temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
		vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
		// then butterfly with buf[p] and buf[p+1]
		vecp = vec_ld(p << 3, (float*)buf);

		temp1 = vec_add(vecp, vecq);
		temp2 = vec_sub(vecp, vecq);

		vec_st(temp1, p << 3, (float*)buf);
		vec_st(temp2, q << 3, (float*)buf);
#endif
	    }
	}
    }

    /* Post IFFT complex multiply plus IFFT complex conjugate*/
    for( i=0; i < 128; i+=4) {
	/* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
#if 0
	tmp_a_r = buf[(i + 0)].real;
	tmp_a_i = -1.0 * buf[(i + 0)].imag;
	buf[(i + 0)].real =
	    (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]);
	buf[(i + 0)].imag =
	    (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]);

	tmp_a_r = buf[(i + 1)].real;
	tmp_a_i = -1.0 * buf[(i + 1)].imag;
	buf[(i + 1)].real =
	    (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]);
	buf[(i + 1)].imag =
	    (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]);

	tmp_a_r = buf[(i + 2)].real;
	tmp_a_i = -1.0 * buf[(i + 2)].imag;
	buf[(i + 2)].real =
	    (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]);
	buf[(i + 2)].imag =
	    (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]);

	tmp_a_r = buf[(i + 3)].real;
	tmp_a_i = -1.0 * buf[(i + 3)].imag;
	buf[(i + 3)].real =
	    (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]);
	buf[(i + 3)].imag =
	    (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]);
#else
	vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
	vector float temp0022, temp1133, tempCS01;
	const vector float vczero = (const vector float)FOUROF(0.);

	bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
	bufv_2 = vec_ld((i + 2) << 3, (float*)buf);

	cosv = vec_ld(i << 2, xcos1);
	sinv = vec_ld(i << 2, xsin1);

	temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
	temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
	tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
	temp1 = vec_madd(temp0022, tempCS01, vczero);
	tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
	temp2 = vec_madd(temp1133, tempCS01, vczero);
	bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);

	vec_st(bufv_0, (i + 0) << 3, (float*)buf);

	/* idem with bufv_2 and high-order cosv/sinv */

	temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
	temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
	tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
	temp1 = vec_madd(temp0022, tempCS01, vczero);
	tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
	temp2 = vec_madd(temp1133, tempCS01, vczero);
	bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);

	vec_st(bufv_2, (i + 2) << 3, (float*)buf);
#endif
    }

    data_ptr = data;
    delay_ptr = delay;
    window_ptr = imdct_window;

    /* Window and convert to real valued signal */
    for(i=0; i< 64; i++) {
	*data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
	*data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
    }

    for(i=0; i< 64; i++) {
	*data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
	*data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
    }

    /* The trailing edge of the window goes into the delay line */
    delay_ptr = delay;

    for(i=0; i< 64; i++) {
	*delay_ptr++ = -buf[64+i].real * *--window_ptr;
	*delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
    }

    for(i=0; i<64; i++) {
	*delay_ptr++ = buf[i].imag * *--window_ptr;
	*delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
    }
}
#endif
// Stuff below this line is borrowed from libac3
#include "srfftp.h"
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#ifndef HAVE_3DNOW
#define HAVE_3DNOW 1
#endif
#include "srfftp_3dnow.h"

/* Constant operands for the 3DNow! code: 0x80000000 is the IEEE-754 float
   sign bit, so these masks flip the sign of one half of a packed complex
   value; HSQRT2 is sqrt(2)/2 splatted across real and imag. */
const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };

/* imdct_3dnow.h is included twice with HAVE_3DNOWEX toggled — presumably it
   emits both a plain 3DNow! and a 3DNow!Ex variant of its functions; confirm
   against the header before renaming anything here. */
#undef HAVE_3DNOWEX
#include "imdct_3dnow.h"
#define HAVE_3DNOWEX
#include "imdct_3dnow.h"
765 void
766 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
768 /* int i,k;
769 int p,q;*/
770 int m;
771 long two_m;
772 long two_m_plus_one;
773 long two_m_plus_one_shl3;
774 complex_t *buf_offset;
776 /* sample_t tmp_a_i;
777 sample_t tmp_a_r;
778 sample_t tmp_b_i;
779 sample_t tmp_b_r;*/
781 sample_t *data_ptr;
782 sample_t *delay_ptr;
783 sample_t *window_ptr;
785 /* 512 IMDCT with source and dest data in 'data' */
786 /* see the c version (dct_do_512()), its allmost identical, just in C */
788 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
789 /* Bit reversed shuffling */
790 asm volatile(
791 "xor %%"REG_S", %%"REG_S" \n\t"
792 "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t"
793 "mov $1008, %%"REG_D" \n\t"
794 "push %%"REG_BP" \n\t" //use ebp without telling gcc
795 ".balign 16 \n\t"
796 "1: \n\t"
797 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI
798 "movhps 8(%0, %%"REG_D"), %%xmm0 \n\t" // RXXI
799 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // XXXi
800 "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi
801 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR
802 "movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t"
803 "mulps %%xmm0, %%xmm2 \n\t"
804 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI
805 "mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
806 "subps %%xmm0, %%xmm2 \n\t"
807 "movzb (%%"REG_a"), %%"REG_d" \n\t"
808 "movzb 1(%%"REG_a"), %%"REG_BP" \n\t"
809 "movlps %%xmm2, (%1, %%"REG_d", 8) \n\t"
810 "movhps %%xmm2, (%1, %%"REG_BP", 8) \n\t"
811 "add $16, %%"REG_S" \n\t"
812 "add $2, %%"REG_a" \n\t" // avoid complex addressing for P4 crap
813 "sub $16, %%"REG_D" \n\t"
814 "jnc 1b \n\t"
815 "pop %%"REG_BP" \n\t"//no we didnt touch ebp *g*
816 :: "r" (data), "r" (buf)
817 : "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d
821 /* FFT Merge */
822 /* unoptimized variant
823 for (m=1; m < 7; m++) {
824 if(m)
825 two_m = (1 << m);
826 else
827 two_m = 1;
829 two_m_plus_one = (1 << (m+1));
831 for(i = 0; i < 128; i += two_m_plus_one) {
832 for(k = 0; k < two_m; k++) {
833 p = k + i;
834 q = p + two_m;
835 tmp_a_r = buf[p].real;
836 tmp_a_i = buf[p].imag;
837 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
838 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
839 buf[p].real = tmp_a_r + tmp_b_r;
840 buf[p].imag = tmp_a_i + tmp_b_i;
841 buf[q].real = tmp_a_r - tmp_b_r;
842 buf[q].imag = tmp_a_i - tmp_b_i;
848 /* 1. iteration */
849 // Note w[0][0]={1,0}
850 asm volatile(
851 "xorps %%xmm1, %%xmm1 \n\t"
852 "xorps %%xmm2, %%xmm2 \n\t"
853 "mov %0, %%"REG_S" \n\t"
854 ".balign 16 \n\t"
855 "1: \n\t"
856 "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p]
857 "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q]
858 "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p]
859 "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q]
860 "addps %%xmm1, %%xmm0 \n\t"
861 "subps %%xmm2, %%xmm0 \n\t"
862 "movaps %%xmm0, (%%"REG_S")\n\t"
863 "add $16, %%"REG_S" \n\t"
864 "cmp %1, %%"REG_S" \n\t"
865 " jb 1b \n\t"
866 :: "g" (buf), "r" (buf + 128)
867 : "%"REG_S
870 /* 2. iteration */
871 // Note w[1]={{1,0}, {0,-1}}
872 asm volatile(
873 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
874 "mov %0, %%"REG_S" \n\t"
875 ".balign 16 \n\t"
876 "1: \n\t"
877 "movaps 16(%%"REG_S"), %%xmm2 \n\t" //r2,i2,r3,i3
878 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3
879 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3
880 "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1
881 "movaps (%%"REG_S"), %%xmm1 \n\t" //r0,i0,r1,i1
882 "addps %%xmm2, %%xmm0 \n\t"
883 "subps %%xmm2, %%xmm1 \n\t"
884 "movaps %%xmm0, (%%"REG_S") \n\t"
885 "movaps %%xmm1, 16(%%"REG_S") \n\t"
886 "add $32, %%"REG_S" \n\t"
887 "cmp %1, %%"REG_S" \n\t"
888 " jb 1b \n\t"
889 :: "g" (buf), "r" (buf + 128)
890 : "%"REG_S
893 /* 3. iteration */
895 Note sseW2+0={1,1,sqrt(2),sqrt(2))
896 Note sseW2+16={0,0,sqrt(2),-sqrt(2))
897 Note sseW2+32={0,0,-sqrt(2),-sqrt(2))
898 Note sseW2+48={1,-1,sqrt(2),-sqrt(2))
900 asm volatile(
901 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
902 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
903 "xorps %%xmm5, %%xmm5 \n\t"
904 "xorps %%xmm2, %%xmm2 \n\t"
905 "mov %0, %%"REG_S" \n\t"
906 ".balign 16 \n\t"
907 "1: \n\t"
908 "movaps 32(%%"REG_S"), %%xmm2 \n\t" //r4,i4,r5,i5
909 "movaps 48(%%"REG_S"), %%xmm3 \n\t" //r6,i6,r7,i7
910 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
911 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
912 "mulps %%xmm2, %%xmm4 \n\t"
913 "mulps %%xmm3, %%xmm5 \n\t"
914 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5
915 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7
916 "mulps %%xmm6, %%xmm3 \n\t"
917 "mulps %%xmm7, %%xmm2 \n\t"
918 "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1
919 "movaps 16(%%"REG_S"), %%xmm1 \n\t" //r2,i2,r3,i3
920 "addps %%xmm4, %%xmm2 \n\t"
921 "addps %%xmm5, %%xmm3 \n\t"
922 "movaps %%xmm2, %%xmm4 \n\t"
923 "movaps %%xmm3, %%xmm5 \n\t"
924 "addps %%xmm0, %%xmm2 \n\t"
925 "addps %%xmm1, %%xmm3 \n\t"
926 "subps %%xmm4, %%xmm0 \n\t"
927 "subps %%xmm5, %%xmm1 \n\t"
928 "movaps %%xmm2, (%%"REG_S") \n\t"
929 "movaps %%xmm3, 16(%%"REG_S") \n\t"
930 "movaps %%xmm0, 32(%%"REG_S") \n\t"
931 "movaps %%xmm1, 48(%%"REG_S") \n\t"
932 "add $64, %%"REG_S" \n\t"
933 "cmp %1, %%"REG_S" \n\t"
934 " jb 1b \n\t"
935 :: "g" (buf), "r" (buf + 128)
936 : "%"REG_S
939 /* 4-7. iterations */
940 for (m=3; m < 7; m++) {
941 two_m = (1 << m);
942 two_m_plus_one = two_m<<1;
943 two_m_plus_one_shl3 = (two_m_plus_one<<3);
944 buf_offset = buf+128;
945 asm volatile(
946 "mov %0, %%"REG_S" \n\t"
947 ".balign 16 \n\t"
948 "1: \n\t"
949 "xor %%"REG_D", %%"REG_D" \n\t" // k
950 "lea (%%"REG_S", %3), %%"REG_d" \n\t"
951 "2: \n\t"
952 "movaps (%%"REG_d", %%"REG_D"), %%xmm1 \n\t"
953 "movaps (%4, %%"REG_D", 2), %%xmm2 \n\t"
954 "mulps %%xmm1, %%xmm2 \n\t"
955 "shufps $0xB1, %%xmm1, %%xmm1 \n\t"
956 "mulps 16(%4, %%"REG_D", 2), %%xmm1 \n\t"
957 "movaps (%%"REG_S", %%"REG_D"), %%xmm0 \n\t"
958 "addps %%xmm2, %%xmm1 \n\t"
959 "movaps %%xmm1, %%xmm2 \n\t"
960 "addps %%xmm0, %%xmm1 \n\t"
961 "subps %%xmm2, %%xmm0 \n\t"
962 "movaps %%xmm1, (%%"REG_S", %%"REG_D") \n\t"
963 "movaps %%xmm0, (%%"REG_d", %%"REG_D") \n\t"
964 "add $16, %%"REG_D" \n\t"
965 "cmp %3, %%"REG_D" \n\t" //FIXME (opt) count against 0
966 "jb 2b \n\t"
967 "add %2, %%"REG_S" \n\t"
968 "cmp %1, %%"REG_S" \n\t"
969 " jb 1b \n\t"
970 :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3),
971 "r" (sseW[m])
972 : "%"REG_S, "%"REG_D, "%"REG_d
976 /* Post IFFT complex multiply plus IFFT complex conjugate*/
977 asm volatile(
978 "mov $-1024, %%"REG_S" \n\t"
979 ".balign 16 \n\t"
980 "1: \n\t"
981 "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
982 "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
983 "shufps $0xB1, %%xmm0, %%xmm0 \n\t"
984 "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t"
985 "mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
986 "addps %%xmm1, %%xmm0 \n\t"
987 "movaps %%xmm0, (%0, %%"REG_S") \n\t"
988 "add $16, %%"REG_S" \n\t"
989 " jnz 1b \n\t"
990 :: "r" (buf+128)
991 : "%"REG_S
995 data_ptr = data;
996 delay_ptr = delay;
997 window_ptr = imdct_window;
999 /* Window and convert to real valued signal */
1000 asm volatile(
1001 "xor %%"REG_D", %%"REG_D" \n\t" // 0
1002 "xor %%"REG_S", %%"REG_S" \n\t" // 0
1003 "movss %3, %%xmm2 \n\t" // bias
1004 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
1005 ".balign 16 \n\t"
1006 "1: \n\t"
1007 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
1008 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ?
1009 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
1010 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
1011 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1012 "mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1013 "addps (%2, %%"REG_S"), %%xmm0 \n\t"
1014 "addps %%xmm2, %%xmm0 \n\t"
1015 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
1016 "add $16, %%"REG_S" \n\t"
1017 "sub $16, %%"REG_D" \n\t"
1018 "cmp $512, %%"REG_S" \n\t"
1019 " jb 1b \n\t"
1020 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1021 : "%"REG_S, "%"REG_D
1023 data_ptr+=128;
1024 delay_ptr+=128;
1025 // window_ptr+=128;
1027 asm volatile(
1028 "mov $1024, %%"REG_D" \n\t" // 512
1029 "xor %%"REG_S", %%"REG_S" \n\t" // 0
1030 "movss %3, %%xmm2 \n\t" // bias
1031 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
1032 ".balign 16 \n\t"
1033 "1: \n\t"
1034 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A
1035 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C
1036 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
1037 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
1038 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1039 "mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1040 "addps (%2, %%"REG_S"), %%xmm0 \n\t"
1041 "addps %%xmm2, %%xmm0 \n\t"
1042 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
1043 "add $16, %%"REG_S" \n\t"
1044 "sub $16, %%"REG_D" \n\t"
1045 "cmp $512, %%"REG_S" \n\t"
1046 " jb 1b \n\t"
1047 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1048 : "%"REG_S, "%"REG_D
1050 data_ptr+=128;
1051 // window_ptr+=128;
1053 /* The trailing edge of the window goes into the delay line */
1054 delay_ptr = delay;
1056 asm volatile(
1057 "xor %%"REG_D", %%"REG_D" \n\t" // 0
1058 "xor %%"REG_S", %%"REG_S" \n\t" // 0
1059 ".balign 16 \n\t"
1060 "1: \n\t"
1061 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A
1062 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C
1063 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
1064 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
1065 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1066 "mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1067 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
1068 "add $16, %%"REG_S" \n\t"
1069 "sub $16, %%"REG_D" \n\t"
1070 "cmp $512, %%"REG_S" \n\t"
1071 " jb 1b \n\t"
1072 :: "r" (buf+64), "r" (delay_ptr)
1073 : "%"REG_S, "%"REG_D
1075 delay_ptr+=128;
1076 // window_ptr-=128;
1078 asm volatile(
1079 "mov $1024, %%"REG_D" \n\t" // 1024
1080 "xor %%"REG_S", %%"REG_S" \n\t" // 0
1081 ".balign 16 \n\t"
1082 "1: \n\t"
1083 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
1084 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ?
1085 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
1086 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
1087 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1088 "mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1089 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
1090 "add $16, %%"REG_S" \n\t"
1091 "sub $16, %%"REG_D" \n\t"
1092 "cmp $512, %%"REG_S" \n\t"
1093 " jb 1b \n\t"
1094 :: "r" (buf), "r" (delay_ptr)
1095 : "%"REG_S, "%"REG_D
1098 #endif // ARCH_X86 || ARCH_X86_64
1100 void
1101 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias)
1103 int i,k;
1104 int p,q;
1105 int m;
1106 int two_m;
1107 int two_m_plus_one;
1109 sample_t tmp_a_i;
1110 sample_t tmp_a_r;
1111 sample_t tmp_b_i;
1112 sample_t tmp_b_r;
1114 sample_t *data_ptr;
1115 sample_t *delay_ptr;
1116 sample_t *window_ptr;
1118 complex_t *buf_1, *buf_2;
1120 buf_1 = &buf[0];
1121 buf_2 = &buf[64];
1123 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
1124 for(k=0; k<64; k++) {
1125 /* X1[k] = X[2*k] */
1126 /* X2[k] = X[2*k+1] */
1128 p = 2 * (128-2*k-1);
1129 q = 2 * (2 * k);
1131 /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
1132 buf_1[k].real = data[p] * xcos2[k] - data[q] * xsin2[k];
1133 buf_1[k].imag = -1.0f * (data[q] * xcos2[k] + data[p] * xsin2[k]);
1134 /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
1135 buf_2[k].real = data[p + 1] * xcos2[k] - data[q + 1] * xsin2[k];
1136 buf_2[k].imag = -1.0f * ( data[q + 1] * xcos2[k] + data[p + 1] * xsin2[k]);
1139 /* IFFT Bit reversed shuffling */
1140 for(i=0; i<64; i++) {
1141 k = bit_reverse_256[i];
1142 if (k < i) {
1143 swap_cmplx(&buf_1[i],&buf_1[k]);
1144 swap_cmplx(&buf_2[i],&buf_2[k]);
1148 /* FFT Merge */
1149 for (m=0; m < 6; m++) {
1150 two_m = (1 << m);
1151 two_m_plus_one = (1 << (m+1));
1153 /* FIXME */
1154 if(m)
1155 two_m = (1 << m);
1156 else
1157 two_m = 1;
1159 for(k = 0; k < two_m; k++) {
1160 for(i = 0; i < 64; i += two_m_plus_one) {
1161 p = k + i;
1162 q = p + two_m;
1163 /* Do block 1 */
1164 tmp_a_r = buf_1[p].real;
1165 tmp_a_i = buf_1[p].imag;
1166 tmp_b_r = buf_1[q].real * w[m][k].real - buf_1[q].imag * w[m][k].imag;
1167 tmp_b_i = buf_1[q].imag * w[m][k].real + buf_1[q].real * w[m][k].imag;
1168 buf_1[p].real = tmp_a_r + tmp_b_r;
1169 buf_1[p].imag = tmp_a_i + tmp_b_i;
1170 buf_1[q].real = tmp_a_r - tmp_b_r;
1171 buf_1[q].imag = tmp_a_i - tmp_b_i;
1173 /* Do block 2 */
1174 tmp_a_r = buf_2[p].real;
1175 tmp_a_i = buf_2[p].imag;
1176 tmp_b_r = buf_2[q].real * w[m][k].real - buf_2[q].imag * w[m][k].imag;
1177 tmp_b_i = buf_2[q].imag * w[m][k].real + buf_2[q].real * w[m][k].imag;
1178 buf_2[p].real = tmp_a_r + tmp_b_r;
1179 buf_2[p].imag = tmp_a_i + tmp_b_i;
1180 buf_2[q].real = tmp_a_r - tmp_b_r;
1181 buf_2[q].imag = tmp_a_i - tmp_b_i;
1186 /* Post IFFT complex multiply */
1187 for( i=0; i < 64; i++) {
1188 /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
1189 tmp_a_r = buf_1[i].real;
1190 tmp_a_i = -buf_1[i].imag;
1191 buf_1[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
1192 buf_1[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
1193 /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */
1194 tmp_a_r = buf_2[i].real;
1195 tmp_a_i = -buf_2[i].imag;
1196 buf_2[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
1197 buf_2[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
1200 data_ptr = data;
1201 delay_ptr = delay;
1202 window_ptr = imdct_window;
1204 /* Window and convert to real valued signal */
1205 for(i=0; i< 64; i++) {
1206 *data_ptr++ = -buf_1[i].imag * *window_ptr++ + *delay_ptr++ + bias;
1207 *data_ptr++ = buf_1[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
1210 for(i=0; i< 64; i++) {
1211 *data_ptr++ = -buf_1[i].real * *window_ptr++ + *delay_ptr++ + bias;
1212 *data_ptr++ = buf_1[64-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
1215 delay_ptr = delay;
1217 for(i=0; i< 64; i++) {
1218 *delay_ptr++ = -buf_2[i].real * *--window_ptr;
1219 *delay_ptr++ = buf_2[64-i-1].imag * *--window_ptr;
1222 for(i=0; i< 64; i++) {
1223 *delay_ptr++ = buf_2[i].imag * *--window_ptr;
1224 *delay_ptr++ = -buf_2[64-i-1].real * *--window_ptr;
/* One-time initialization: fills the twiddle-factor and window tables used
 * by the IMDCT implementations, then selects the fastest imdct_512/imdct_256
 * routine available for the accelerations requested in mm_accel. */
void imdct_init (uint32_t mm_accel)
{
#ifdef LIBA52_MLIB
    /* Sun mediaLib provides its own IMDCT; prefer it when requested. */
    if (mm_accel & MM_ACCEL_MLIB) {
	fprintf (stderr, "Using mlib for IMDCT transform\n");
	imdct_512 = imdct_do_512_mlib;
	imdct_256 = imdct_do_256_mlib;
    } else
#endif
    {
	int i, j, k;

	/* Twiddle factors to turn IFFT into IMDCT (512-point transform) */
	for (i = 0; i < 128; i++) {
	    xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
	    xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
	}
#if defined(ARCH_X86) || defined(ARCH_X86_64)
	/* Interleaved copies of the 512-point twiddles with the sign
	 * pattern the SSE post-multiply loop expects. */
	for (i = 0; i < 128; i++) {
	    sseSinCos1c[2*i+0]= xcos1[i];
	    sseSinCos1c[2*i+1]= -xcos1[i];
	    sseSinCos1d[2*i+0]= xsin1[i];
	    sseSinCos1d[2*i+1]= xsin1[i];
	}
#endif

	/* More twiddle factors to turn IFFT into IMDCT (256-point transform) */
	for (i = 0; i < 64; i++) {
	    xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1));
	    xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1));
	}

	/* FFT-merge butterfly coefficients: w[m][k] = e^{-j*pi*k/2^m} */
	for (i = 0; i < 7; i++) {
	    j = 1 << i;
	    for (k = 0; k < j; k++) {
		w[i][k].real = cos (-M_PI * k / j);
		w[i][k].imag = sin (-M_PI * k / j);
	    }
	}
#if defined(ARCH_X86) || defined(ARCH_X86_64)
	/* SSE layout of w[][]: two complex coefficients per group of 8
	 * floats — real parts duplicated, imaginary parts with alternating
	 * sign, matching the shufps/mulps pattern of the SSE FFT passes. */
	for (i = 1; i < 7; i++) {
	    j = 1 << i;
	    for (k = 0; k < j; k+=2) {

		sseW[i][4*k + 0] = w[i][k+0].real;
		sseW[i][4*k + 1] = w[i][k+0].real;
		sseW[i][4*k + 2] = w[i][k+1].real;
		sseW[i][4*k + 3] = w[i][k+1].real;

		sseW[i][4*k + 4] = -w[i][k+0].imag;
		sseW[i][4*k + 5] = w[i][k+0].imag;
		sseW[i][4*k + 6] = -w[i][k+1].imag;
		sseW[i][4*k + 7] = w[i][k+1].imag;

		//we multiply more or less uninitalized numbers so we need to use exactly 0.0
		if(k==0)
		{
		    // sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
		    sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
		}

		if(2*k == j)
		{
		    sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
		    // sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
		}
	    }
	}

	/* Pre-signed/reordered copies of the window for the SSE
	 * windowing and delay-line loops. */
	for(i=0; i<128; i++)
	{
	    sseWindow[2*i+0]= -imdct_window[2*i+0];
	    sseWindow[2*i+1]= imdct_window[2*i+1];
	}

	for(i=0; i<64; i++)
	{
	    sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1];
	    sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0];
	    sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1];
	    sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
	}
#endif // ARCH_X86 || ARCH_X86_64

	/* Runtime dispatch: start with the plain C transform, then upgrade
	 * to the best SIMD variant the caller enabled. */
	imdct_512 = imdct_do_512;
#if defined(ARCH_X86) || defined(ARCH_X86_64)
	if(mm_accel & MM_ACCEL_X86_SSE)
	{
	    fprintf (stderr, "Using SSE optimized IMDCT transform\n");
	    imdct_512 = imdct_do_512_sse;
	}
	else if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
	{
	    fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
	    imdct_512 = imdct_do_512_3dnowex;
	}
	else if(mm_accel & MM_ACCEL_X86_3DNOW)
	{
	    fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
	    imdct_512 = imdct_do_512_3dnow;
	}
	else
#endif // ARCH_X86 || ARCH_X86_64
#ifdef HAVE_ALTIVEC
	if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
	{
	    fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
	    imdct_512 = imdct_do_512_altivec;
	}
	else
#endif
	/* NOTE(review): when no SIMD path is compiled in, no `else` guards
	 * this fprintf and it always runs — this matches upstream behavior. */
	fprintf (stderr, "No accelerated IMDCT transform found\n");
	imdct_256 = imdct_do_256;
    }
}
/* Merge step of the split-radix FFT: combines the four sub-transforms stored
 * at x, x+2k, x+4k and x+6k (2k complex values each) into one larger
 * transform, using the twiddle tables d and d_3 and the workspace wTB.
 * NOTE(review): the TRANS/TRANSZERO macros come from a header outside this
 * view; the float temporaries below appear to be scratch registers those
 * macros expand into — confirm against srfftp.h before touching them.
 * Assumes k is even and >= 2 (callers in this file pass 4, 8 or 16). */
static void fft_asmb(int k, complex_t *x, complex_t *wTB,
	     const complex_t *d, const complex_t *d_3)
{
    register complex_t *x2k, *x3k, *x4k, *wB;
    /* scratch used inside the TRANS/TRANSZERO macro expansions */
    register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;

    x2k = x + 2 * k;     /* second quarter */
    x3k = x2k + 2 * k;   /* third quarter */
    x4k = x3k + 2 * k;   /* fourth quarter */
    wB = wTB + 2 * k;

    /* index 0 needs no twiddle; index 1 uses the first table entries */
    TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
    TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);

    /* remaining indices, two butterflies per iteration: the pointers advance
     * by 2 so the body always operates at offsets 2 and 3 */
    --k;
    for(;;) {
	TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
	TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
	if (!--k) break;
	x += 2;
	x2k += 2;
	x3k += 2;
	x4k += 2;
	d += 2;
	d_3 += 2;
	wTB += 2;
	wB += 2;
    }
}
1377 static void fft_asmb16(complex_t *x, complex_t *wTB)
1379 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
1380 int k = 2;
1382 /* transform x[0], x[8], x[4], x[12] */
1383 TRANSZERO(x[0],x[4],x[8],x[12]);
1385 /* transform x[1], x[9], x[5], x[13] */
1386 TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]);
1388 /* transform x[2], x[10], x[6], x[14] */
1389 TRANSHALF_16(x[2],x[6],x[10],x[14]);
1391 /* transform x[3], x[11], x[7], x[15] */
1392 TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]);
1396 static void fft_4(complex_t *x)
1398 /* delta_p = 1 here */
1399 /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
1402 register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
1404 yt_r = x[0].real;
1405 yb_r = yt_r - x[2].real;
1406 yt_r += x[2].real;
1408 u_r = x[1].real;
1409 vi_i = x[3].real - u_r;
1410 u_r += x[3].real;
1412 u_i = x[1].imag;
1413 vi_r = u_i - x[3].imag;
1414 u_i += x[3].imag;
1416 yt_i = yt_r;
1417 yt_i += u_r;
1418 x[0].real = yt_i;
1419 yt_r -= u_r;
1420 x[2].real = yt_r;
1421 yt_i = yb_r;
1422 yt_i += vi_r;
1423 x[1].real = yt_i;
1424 yb_r -= vi_r;
1425 x[3].real = yb_r;
1427 yt_i = x[0].imag;
1428 yb_i = yt_i - x[2].imag;
1429 yt_i += x[2].imag;
1431 yt_r = yt_i;
1432 yt_r += u_i;
1433 x[0].imag = yt_r;
1434 yt_i -= u_i;
1435 x[2].imag = yt_i;
1436 yt_r = yb_i;
1437 yt_r += vi_i;
1438 x[1].imag = yt_r;
1439 yb_i -= vi_i;
1440 x[3].imag = yb_i;
/* In-place 8-point DIT FFT.
 * Strategy: the odd inputs x[1] and x[3] are saved into wT1/wB1, the even
 * inputs x[0,2,4,6] are packed down into x[0..3] and transformed with fft_4
 * (x[5] and x[7] keep their original odd inputs and are read below), then
 * the halves are recombined with +-1, +-j and +-sqrt(1/2) twiddles.
 * delta_p = diag{1, sqrt(i)} here */
/* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
 */
static void fft_8(complex_t *x)
{
    register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;

    /* save odd inputs x[1], x[3] before the even samples overwrite them */
    wT1_r = x[1].real;
    wT1_i = x[1].imag;
    wB1_r = x[3].real;
    wB1_i = x[3].imag;

    /* pack even inputs into x[0..3] and 4-point transform them in place */
    x[1] = x[2];
    x[2] = x[4];
    x[3] = x[6];
    fft_4(&x[0]);

    /* x[0] x[4] : even DC term +- sum of all odd inputs (real, then imag) */
    wT2_r = x[5].real;
    wT2_r += x[7].real;
    wT2_r += wT1_r;
    wT2_r += wB1_r;
    wT2_i = wT2_r;
    wT2_r += x[0].real;
    wT2_i = x[0].real - wT2_i;
    x[0].real = wT2_r;
    x[4].real = wT2_i;

    wT2_i = x[5].imag;
    wT2_i += x[7].imag;
    wT2_i += wT1_i;
    wT2_i += wB1_i;
    wT2_r = wT2_i;
    wT2_r += x[0].imag;
    wT2_i = x[0].imag - wT2_i;
    x[0].imag = wT2_r;
    x[4].imag = wT2_i;

    /* x[2] x[6] : odd inputs combined with +-j twiddles */
    wT2_r = x[5].imag;
    wT2_r -= x[7].imag;
    wT2_r += wT1_i;
    wT2_r -= wB1_i;
    wT2_i = wT2_r;
    wT2_r += x[2].real;
    wT2_i = x[2].real - wT2_i;
    x[2].real = wT2_r;
    x[6].real = wT2_i;

    wT2_i = x[5].real;
    wT2_i -= x[7].real;
    wT2_i += wT1_r;
    wT2_i -= wB1_r;
    wT2_r = wT2_i;
    wT2_r += x[2].imag;
    wT2_i = x[2].imag - wT2_i;
    x[2].imag = wT2_i;
    x[6].imag = wT2_r;


    /* x[1] x[5] : diagonal twiddle, scaled by HSQRT2 = sqrt(1/2) */
    wT2_r = wT1_r;
    wT2_r += wB1_i;
    wT2_r -= x[5].real;
    wT2_r -= x[7].imag;
    wT2_i = wT1_i;
    wT2_i -= wB1_r;
    wT2_i -= x[5].imag;
    wT2_i += x[7].real;

    wB2_r = wT2_r;
    wB2_r += wT2_i;
    wT2_i -= wT2_r;
    wB2_r *= HSQRT2;
    wT2_i *= HSQRT2;
    wT2_r = wB2_r;
    wB2_r += x[1].real;
    wT2_r = x[1].real - wT2_r;

    /* save the old x[5].real before it is overwritten: still needed for
     * the x[3]/x[7] butterflies below */
    wB2_i = x[5].real;
    x[1].real = wB2_r;
    x[5].real = wT2_r;

    wT2_r = wT2_i;
    wT2_r += x[1].imag;
    wT2_i = x[1].imag - wT2_i;
    wB2_r = x[5].imag;   /* likewise save the old x[5].imag */
    x[1].imag = wT2_r;
    x[5].imag = wT2_i;

    /* x[3] x[7] : uses the saved x[5] values (wB2_i/wB2_r) and HSQRT2 */
    wT1_r -= wB1_i;
    wT1_i += wB1_r;
    wB1_r = wB2_i - x[7].imag;
    wB1_i = wB2_r + x[7].real;
    wT1_r -= wB1_r;
    wT1_i -= wB1_i;
    wB1_r = wT1_r + wT1_i;
    wB1_r *= HSQRT2;
    wT1_i -= wT1_r;
    wT1_i *= HSQRT2;
    wB2_r = x[3].real;
    wB2_i = wB2_r + wT1_i;
    wB2_r -= wT1_i;
    x[3].real = wB2_i;
    x[7].real = wB2_r;
    wB2_i = x[3].imag;
    wB2_r = wB2_i + wB1_r;
    wB2_i -= wB1_r;
    x[3].imag = wB2_i;
    x[7].imag = wB2_r;
}
1559 static void fft_128p(complex_t *a)
1561 fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]);
1562 fft_asmb16(&a[0], &a[8]);
1564 fft_8(&a[16]), fft_8(&a[24]);
1565 fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
1567 fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]);
1568 fft_asmb16(&a[32], &a[40]);
1570 fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]);
1571 fft_asmb16(&a[48], &a[56]);
1573 fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
1575 fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]);
1576 /* fft_16(&a[64]); */
1577 fft_asmb16(&a[64], &a[72]);
1579 fft_8(&a[80]); fft_8(&a[88]);
1581 /* fft_32(&a[64]); */
1582 fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]);
1584 fft_8(&a[96]); fft_4(&a[104]), fft_4(&a[108]);
1585 /* fft_16(&a[96]); */
1586 fft_asmb16(&a[96], &a[104]);
1588 fft_8(&a[112]), fft_8(&a[120]);
1589 /* fft_32(&a[96]); */
1590 fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]);
1592 /* fft_128(&a[0]); */
1593 fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);