Fix vf_tcdump's compilation
[mplayer/kovensky.git] / liba52 / imdct.c
blob766cacb422de2d4014061fd7ebe7272eb9440ab1
1 /*
2 * imdct.c
3 * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
6 * The ifft algorithms in this file have been largely inspired by Dan
7 * Bernstein's work, djbfft, available at http://cr.yp.to/djbfft.html
9 * This file is part of a52dec, a free ATSC A-52 stream decoder.
10 * See http://liba52.sourceforge.net/ for updates.
12 * Modified for use with MPlayer, changes contained in liba52_changes.diff.
13 * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/
14 * $Id$
16 * a52dec is free software; you can redistribute it and/or modify
17 * it under the terms of the GNU General Public License as published by
18 * the Free Software Foundation; either version 2 of the License, or
19 * (at your option) any later version.
21 * a52dec is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU General Public License for more details.
26 * You should have received a copy of the GNU General Public License
27 * along with this program; if not, write to the Free Software
28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
31 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
32 * michael did port them from libac3 (untested, perhaps totally broken)
33 * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
36 #include "config.h"
38 #include <math.h>
39 #include <stdio.h>
40 #ifdef LIBA52_DJBFFT
41 #include <fftc4.h>
42 #endif
43 #ifndef M_PI
44 #define M_PI 3.1415926535897932384626433832795029
45 #endif
46 #include <inttypes.h>
48 #include "a52.h"
49 #include "a52_internal.h"
50 #include "mm_accel.h"
51 #include "mangle.h"
53 void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias);
55 #if CONFIG_RUNTIME_CPUDETECT
56 #undef HAVE_AMD3DNOWEXT
57 #define HAVE_AMD3DNOWEXT 0
58 #endif
60 typedef struct complex_s {
61 sample_t real;
62 sample_t imag;
63 } complex_t;
65 static const int pm128[128] attribute_used __attribute__((aligned(16))) =
67 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
68 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
69 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
70 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
71 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
72 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
73 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
74 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
77 static uint8_t attribute_used bit_reverse_512[] = {
78 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
79 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
80 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
81 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
82 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
83 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
84 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
85 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
86 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
87 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
88 0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
89 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
90 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
91 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
92 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
93 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
95 static uint8_t fftorder[] = {
96 0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176,
97 8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88,
98 4,132, 68,196, 36,164,228,100, 20,148, 84,212,244,116, 52,180,
99 252,124, 60,188, 28,156,220, 92, 12,140, 76,204,236,108, 44,172,
100 2,130, 66,194, 34,162,226, 98, 18,146, 82,210,242,114, 50,178,
101 10,138, 74,202, 42,170,234,106,250,122, 58,186, 26,154,218, 90,
102 254,126, 62,190, 30,158,222, 94, 14,142, 78,206,238,110, 46,174,
103 6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86
106 static complex_t __attribute__((aligned(16))) buf[128];
108 /* Twiddle factor LUT */
109 static complex_t __attribute__((aligned(16))) w_1[1];
110 static complex_t __attribute__((aligned(16))) w_2[2];
111 static complex_t __attribute__((aligned(16))) w_4[4];
112 static complex_t __attribute__((aligned(16))) w_8[8];
113 static complex_t __attribute__((aligned(16))) w_16[16];
114 static complex_t __attribute__((aligned(16))) w_32[32];
115 static complex_t __attribute__((aligned(16))) w_64[64];
116 static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
118 /* Twiddle factors for IMDCT */
119 static sample_t __attribute__((aligned(16))) xcos1[128];
120 static sample_t __attribute__((aligned(16))) xsin1[128];
122 #if ARCH_X86 || ARCH_X86_64
123 // NOTE: SSE needs 16byte alignment or it will segfault
125 static float __attribute__((aligned(16))) sseSinCos1c[256];
126 static float __attribute__((aligned(16))) sseSinCos1d[256];
127 static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
128 //static float __attribute__((aligned(16))) sseW0[4];
129 static float __attribute__((aligned(16))) sseW1[8];
130 static float __attribute__((aligned(16))) sseW2[16];
131 static float __attribute__((aligned(16))) sseW3[32];
132 static float __attribute__((aligned(16))) sseW4[64];
133 static float __attribute__((aligned(16))) sseW5[128];
134 static float __attribute__((aligned(16))) sseW6[256];
135 static float __attribute__((aligned(16))) *sseW[7]=
136 {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
137 static float __attribute__((aligned(16))) sseWindow[512];
138 #endif
140 /* Root values for IFFT */
141 static sample_t roots16[3];
142 static sample_t roots32[7];
143 static sample_t roots64[15];
144 static sample_t roots128[31];
146 /* Twiddle factors for IMDCT */
147 static complex_t pre1[128];
148 static complex_t post1[64];
149 static complex_t pre2[64];
150 static complex_t post2[32];
152 static sample_t a52_imdct_window[256];
154 static void (* ifft128) (complex_t * buf);
155 static void (* ifft64) (complex_t * buf);
157 static inline void ifft2 (complex_t * buf)
159 double r, i;
161 r = buf[0].real;
162 i = buf[0].imag;
163 buf[0].real += buf[1].real;
164 buf[0].imag += buf[1].imag;
165 buf[1].real = r - buf[1].real;
166 buf[1].imag = i - buf[1].imag;
169 static inline void ifft4 (complex_t * buf)
171 double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
173 tmp1 = buf[0].real + buf[1].real;
174 tmp2 = buf[3].real + buf[2].real;
175 tmp3 = buf[0].imag + buf[1].imag;
176 tmp4 = buf[2].imag + buf[3].imag;
177 tmp5 = buf[0].real - buf[1].real;
178 tmp6 = buf[0].imag - buf[1].imag;
179 tmp7 = buf[2].imag - buf[3].imag;
180 tmp8 = buf[3].real - buf[2].real;
182 buf[0].real = tmp1 + tmp2;
183 buf[0].imag = tmp3 + tmp4;
184 buf[2].real = tmp1 - tmp2;
185 buf[2].imag = tmp3 - tmp4;
186 buf[1].real = tmp5 + tmp7;
187 buf[1].imag = tmp6 + tmp8;
188 buf[3].real = tmp5 - tmp7;
189 buf[3].imag = tmp6 - tmp8;
192 /* the basic split-radix ifft butterfly */
194 #define BUTTERFLY(a0,a1,a2,a3,wr,wi) do { \
195 tmp5 = a2.real * wr + a2.imag * wi; \
196 tmp6 = a2.imag * wr - a2.real * wi; \
197 tmp7 = a3.real * wr - a3.imag * wi; \
198 tmp8 = a3.imag * wr + a3.real * wi; \
199 tmp1 = tmp5 + tmp7; \
200 tmp2 = tmp6 + tmp8; \
201 tmp3 = tmp6 - tmp8; \
202 tmp4 = tmp7 - tmp5; \
203 a2.real = a0.real - tmp1; \
204 a2.imag = a0.imag - tmp2; \
205 a3.real = a1.real - tmp3; \
206 a3.imag = a1.imag - tmp4; \
207 a0.real += tmp1; \
208 a0.imag += tmp2; \
209 a1.real += tmp3; \
210 a1.imag += tmp4; \
211 } while (0)
213 /* split-radix ifft butterfly, specialized for wr=1 wi=0 */
215 #define BUTTERFLY_ZERO(a0,a1,a2,a3) do { \
216 tmp1 = a2.real + a3.real; \
217 tmp2 = a2.imag + a3.imag; \
218 tmp3 = a2.imag - a3.imag; \
219 tmp4 = a3.real - a2.real; \
220 a2.real = a0.real - tmp1; \
221 a2.imag = a0.imag - tmp2; \
222 a3.real = a1.real - tmp3; \
223 a3.imag = a1.imag - tmp4; \
224 a0.real += tmp1; \
225 a0.imag += tmp2; \
226 a1.real += tmp3; \
227 a1.imag += tmp4; \
228 } while (0)
230 /* split-radix ifft butterfly, specialized for wr=wi */
232 #define BUTTERFLY_HALF(a0,a1,a2,a3,w) do { \
233 tmp5 = (a2.real + a2.imag) * w; \
234 tmp6 = (a2.imag - a2.real) * w; \
235 tmp7 = (a3.real - a3.imag) * w; \
236 tmp8 = (a3.imag + a3.real) * w; \
237 tmp1 = tmp5 + tmp7; \
238 tmp2 = tmp6 + tmp8; \
239 tmp3 = tmp6 - tmp8; \
240 tmp4 = tmp7 - tmp5; \
241 a2.real = a0.real - tmp1; \
242 a2.imag = a0.imag - tmp2; \
243 a3.real = a1.real - tmp3; \
244 a3.imag = a1.imag - tmp4; \
245 a0.real += tmp1; \
246 a0.imag += tmp2; \
247 a1.real += tmp3; \
248 a1.imag += tmp4; \
249 } while (0)
251 static inline void ifft8 (complex_t * buf)
253 double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
255 ifft4 (buf);
256 ifft2 (buf + 4);
257 ifft2 (buf + 6);
258 BUTTERFLY_ZERO (buf[0], buf[2], buf[4], buf[6]);
259 BUTTERFLY_HALF (buf[1], buf[3], buf[5], buf[7], roots16[1]);
262 static void ifft_pass (complex_t * buf, sample_t * weight, int n)
264 complex_t * buf1;
265 complex_t * buf2;
266 complex_t * buf3;
267 double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
268 int i;
270 buf++;
271 buf1 = buf + n;
272 buf2 = buf + 2 * n;
273 buf3 = buf + 3 * n;
275 BUTTERFLY_ZERO (buf[-1], buf1[-1], buf2[-1], buf3[-1]);
277 i = n - 1;
279 do {
280 BUTTERFLY (buf[0], buf1[0], buf2[0], buf3[0], weight[n], weight[2*i]);
281 buf++;
282 buf1++;
283 buf2++;
284 buf3++;
285 weight++;
286 } while (--i);
289 static void ifft16 (complex_t * buf)
291 ifft8 (buf);
292 ifft4 (buf + 8);
293 ifft4 (buf + 12);
294 ifft_pass (buf, roots16 - 4, 4);
297 static void ifft32 (complex_t * buf)
299 ifft16 (buf);
300 ifft8 (buf + 16);
301 ifft8 (buf + 24);
302 ifft_pass (buf, roots32 - 8, 8);
305 static void ifft64_c (complex_t * buf)
307 ifft32 (buf);
308 ifft16 (buf + 32);
309 ifft16 (buf + 48);
310 ifft_pass (buf, roots64 - 16, 16);
313 static void ifft128_c (complex_t * buf)
315 ifft32 (buf);
316 ifft16 (buf + 32);
317 ifft16 (buf + 48);
318 ifft_pass (buf, roots64 - 16, 16);
320 ifft32 (buf + 64);
321 ifft32 (buf + 96);
322 ifft_pass (buf, roots128 - 32, 32);
325 void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias)
327 int i, k;
328 sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2;
329 const sample_t * window = a52_imdct_window;
330 complex_t buf[128];
332 for (i = 0; i < 128; i++) {
333 k = fftorder[i];
334 t_r = pre1[i].real;
335 t_i = pre1[i].imag;
337 buf[i].real = t_i * data[255-k] + t_r * data[k];
338 buf[i].imag = t_r * data[255-k] - t_i * data[k];
341 ifft128 (buf);
343 /* Post IFFT complex multiply plus IFFT complex conjugate*/
344 /* Window and convert to real valued signal */
345 for (i = 0; i < 64; i++) {
346 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
347 t_r = post1[i].real;
348 t_i = post1[i].imag;
350 a_r = t_r * buf[i].real + t_i * buf[i].imag;
351 a_i = t_i * buf[i].real - t_r * buf[i].imag;
352 b_r = t_i * buf[127-i].real + t_r * buf[127-i].imag;
353 b_i = t_r * buf[127-i].real - t_i * buf[127-i].imag;
355 w_1 = window[2*i];
356 w_2 = window[255-2*i];
357 data[2*i] = delay[2*i] * w_2 - a_r * w_1 + bias;
358 data[255-2*i] = delay[2*i] * w_1 + a_r * w_2 + bias;
359 delay[2*i] = a_i;
361 w_1 = window[2*i+1];
362 w_2 = window[254-2*i];
363 data[2*i+1] = delay[2*i+1] * w_2 + b_r * w_1 + bias;
364 data[254-2*i] = delay[2*i+1] * w_1 - b_r * w_2 + bias;
365 delay[2*i+1] = b_i;
369 #if HAVE_ALTIVEC
371 #ifdef HAVE_ALTIVEC_H
372 #include <altivec.h>
373 #endif
375 // used to build registers permutation vectors (vcprm)
376 // the 's' are for words in the _s_econd vector
377 #define WORD_0 0x00,0x01,0x02,0x03
378 #define WORD_1 0x04,0x05,0x06,0x07
379 #define WORD_2 0x08,0x09,0x0a,0x0b
380 #define WORD_3 0x0c,0x0d,0x0e,0x0f
381 #define WORD_s0 0x10,0x11,0x12,0x13
382 #define WORD_s1 0x14,0x15,0x16,0x17
383 #define WORD_s2 0x18,0x19,0x1a,0x1b
384 #define WORD_s3 0x1c,0x1d,0x1e,0x1f
386 #define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
387 #define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
389 #define FOUROF(a) {a,a,a,a}
391 // vcprmle is used to keep the same index as in the SSE version.
392 // it's the same as vcprm, with the index inversed
393 // ('le' is Little Endian)
394 #define vcprmle(a,b,c,d) vcprm(d,c,b,a)
396 // used to build inverse/identity vectors (vcii)
397 // n is _n_egative, p is _p_ositive
398 #define FLOAT_n -1.
399 #define FLOAT_p 1.
402 void
403 imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
405 int i;
406 int k;
407 int p,q;
408 int m;
409 long two_m;
410 long two_m_plus_one;
412 sample_t tmp_b_i;
413 sample_t tmp_b_r;
414 sample_t tmp_a_i;
415 sample_t tmp_a_r;
417 sample_t *data_ptr;
418 sample_t *delay_ptr;
419 sample_t *window_ptr;
421 /* 512 IMDCT with source and dest data in 'data' */
423 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
424 for( i=0; i < 128; i++) {
425 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
426 int j= bit_reverse_512[i];
427 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
428 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
431 /* 1. iteration */
432 for(i = 0; i < 128; i += 2) {
433 #if 0
434 tmp_a_r = buf[i].real;
435 tmp_a_i = buf[i].imag;
436 tmp_b_r = buf[i+1].real;
437 tmp_b_i = buf[i+1].imag;
438 buf[i].real = tmp_a_r + tmp_b_r;
439 buf[i].imag = tmp_a_i + tmp_b_i;
440 buf[i+1].real = tmp_a_r - tmp_b_r;
441 buf[i+1].imag = tmp_a_i - tmp_b_i;
442 #else
443 vector float temp, bufv;
445 bufv = vec_ld(i << 3, (float*)buf);
446 temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
447 bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
448 vec_st(bufv, i << 3, (float*)buf);
449 #endif
452 /* 2. iteration */
453 // Note w[1]={{1,0}, {0,-1}}
454 for(i = 0; i < 128; i += 4) {
455 #if 0
456 tmp_a_r = buf[i].real;
457 tmp_a_i = buf[i].imag;
458 tmp_b_r = buf[i+2].real;
459 tmp_b_i = buf[i+2].imag;
460 buf[i].real = tmp_a_r + tmp_b_r;
461 buf[i].imag = tmp_a_i + tmp_b_i;
462 buf[i+2].real = tmp_a_r - tmp_b_r;
463 buf[i+2].imag = tmp_a_i - tmp_b_i;
464 tmp_a_r = buf[i+1].real;
465 tmp_a_i = buf[i+1].imag;
466 /* WARNING: im <-> re here ! */
467 tmp_b_r = buf[i+3].imag;
468 tmp_b_i = buf[i+3].real;
469 buf[i+1].real = tmp_a_r + tmp_b_r;
470 buf[i+1].imag = tmp_a_i - tmp_b_i;
471 buf[i+3].real = tmp_a_r - tmp_b_r;
472 buf[i+3].imag = tmp_a_i + tmp_b_i;
473 #else
474 vector float buf01, buf23, temp1, temp2;
476 buf01 = vec_ld((i + 0) << 3, (float*)buf);
477 buf23 = vec_ld((i + 2) << 3, (float*)buf);
478 buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));
480 temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
481 temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);
483 vec_st(temp1, (i + 0) << 3, (float*)buf);
484 vec_st(temp2, (i + 2) << 3, (float*)buf);
485 #endif
488 /* 3. iteration */
489 for(i = 0; i < 128; i += 8) {
490 #if 0
491 tmp_a_r = buf[i].real;
492 tmp_a_i = buf[i].imag;
493 tmp_b_r = buf[i+4].real;
494 tmp_b_i = buf[i+4].imag;
495 buf[i].real = tmp_a_r + tmp_b_r;
496 buf[i].imag = tmp_a_i + tmp_b_i;
497 buf[i+4].real = tmp_a_r - tmp_b_r;
498 buf[i+4].imag = tmp_a_i - tmp_b_i;
499 tmp_a_r = buf[1+i].real;
500 tmp_a_i = buf[1+i].imag;
501 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
502 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
503 buf[1+i].real = tmp_a_r + tmp_b_r;
504 buf[1+i].imag = tmp_a_i + tmp_b_i;
505 buf[i+5].real = tmp_a_r - tmp_b_r;
506 buf[i+5].imag = tmp_a_i - tmp_b_i;
507 tmp_a_r = buf[i+2].real;
508 tmp_a_i = buf[i+2].imag;
509 /* WARNING re <-> im & sign */
510 tmp_b_r = buf[i+6].imag;
511 tmp_b_i = - buf[i+6].real;
512 buf[i+2].real = tmp_a_r + tmp_b_r;
513 buf[i+2].imag = tmp_a_i + tmp_b_i;
514 buf[i+6].real = tmp_a_r - tmp_b_r;
515 buf[i+6].imag = tmp_a_i - tmp_b_i;
516 tmp_a_r = buf[i+3].real;
517 tmp_a_i = buf[i+3].imag;
518 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
519 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
520 buf[i+3].real = tmp_a_r + tmp_b_r;
521 buf[i+3].imag = tmp_a_i + tmp_b_i;
522 buf[i+7].real = tmp_a_r - tmp_b_r;
523 buf[i+7].imag = tmp_a_i - tmp_b_i;
524 #else
525 vector float buf01, buf23, buf45, buf67;
527 buf01 = vec_ld((i + 0) << 3, (float*)buf);
528 buf23 = vec_ld((i + 2) << 3, (float*)buf);
530 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
531 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
532 buf[i+5].real = tmp_b_r;
533 buf[i+5].imag = tmp_b_i;
534 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
535 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
536 buf[i+7].real = tmp_b_r;
537 buf[i+7].imag = tmp_b_i;
539 buf23 = vec_ld((i + 2) << 3, (float*)buf);
540 buf45 = vec_ld((i + 4) << 3, (float*)buf);
541 buf67 = vec_ld((i + 6) << 3, (float*)buf);
542 buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));
544 vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
545 vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
546 vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
547 vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
548 #endif
551 /* 4-7. iterations */
552 for (m=3; m < 7; m++) {
553 two_m = (1 << m);
555 two_m_plus_one = two_m<<1;
557 for(i = 0; i < 128; i += two_m_plus_one) {
558 for(k = 0; k < two_m; k+=2) {
559 #if 0
560 int p = k + i;
561 int q = p + two_m;
562 tmp_a_r = buf[p].real;
563 tmp_a_i = buf[p].imag;
564 tmp_b_r =
565 buf[q].real * w[m][k].real -
566 buf[q].imag * w[m][k].imag;
567 tmp_b_i =
568 buf[q].imag * w[m][k].real +
569 buf[q].real * w[m][k].imag;
570 buf[p].real = tmp_a_r + tmp_b_r;
571 buf[p].imag = tmp_a_i + tmp_b_i;
572 buf[q].real = tmp_a_r - tmp_b_r;
573 buf[q].imag = tmp_a_i - tmp_b_i;
575 tmp_a_r = buf[(p + 1)].real;
576 tmp_a_i = buf[(p + 1)].imag;
577 tmp_b_r =
578 buf[(q + 1)].real * w[m][(k + 1)].real -
579 buf[(q + 1)].imag * w[m][(k + 1)].imag;
580 tmp_b_i =
581 buf[(q + 1)].imag * w[m][(k + 1)].real +
582 buf[(q + 1)].real * w[m][(k + 1)].imag;
583 buf[(p + 1)].real = tmp_a_r + tmp_b_r;
584 buf[(p + 1)].imag = tmp_a_i + tmp_b_i;
585 buf[(q + 1)].real = tmp_a_r - tmp_b_r;
586 buf[(q + 1)].imag = tmp_a_i - tmp_b_i;
587 #else
588 int p = k + i;
589 int q = p + two_m;
590 vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
591 const vector float vczero = (const vector float)FOUROF(0.);
592 // first compute buf[q] and buf[q+1]
593 vecq = vec_ld(q << 3, (float*)buf);
594 vecw = vec_ld(0, (float*)&(w[m][k]));
595 temp1 = vec_madd(vecq, vecw, vczero);
596 temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
597 temp2 = vec_madd(temp2, vecw, vczero);
598 temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
599 temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
600 vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
601 // then butterfly with buf[p] and buf[p+1]
602 vecp = vec_ld(p << 3, (float*)buf);
604 temp1 = vec_add(vecp, vecq);
605 temp2 = vec_sub(vecp, vecq);
607 vec_st(temp1, p << 3, (float*)buf);
608 vec_st(temp2, q << 3, (float*)buf);
609 #endif
614 /* Post IFFT complex multiply plus IFFT complex conjugate*/
615 for( i=0; i < 128; i+=4) {
616 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
617 #if 0
618 tmp_a_r = buf[(i + 0)].real;
619 tmp_a_i = -1.0 * buf[(i + 0)].imag;
620 buf[(i + 0)].real =
621 (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]);
622 buf[(i + 0)].imag =
623 (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]);
625 tmp_a_r = buf[(i + 1)].real;
626 tmp_a_i = -1.0 * buf[(i + 1)].imag;
627 buf[(i + 1)].real =
628 (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]);
629 buf[(i + 1)].imag =
630 (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]);
632 tmp_a_r = buf[(i + 2)].real;
633 tmp_a_i = -1.0 * buf[(i + 2)].imag;
634 buf[(i + 2)].real =
635 (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]);
636 buf[(i + 2)].imag =
637 (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]);
639 tmp_a_r = buf[(i + 3)].real;
640 tmp_a_i = -1.0 * buf[(i + 3)].imag;
641 buf[(i + 3)].real =
642 (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]);
643 buf[(i + 3)].imag =
644 (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]);
645 #else
646 vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
647 vector float temp0022, temp1133, tempCS01;
648 const vector float vczero = (const vector float)FOUROF(0.);
650 bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
651 bufv_2 = vec_ld((i + 2) << 3, (float*)buf);
653 cosv = vec_ld(i << 2, xcos1);
654 sinv = vec_ld(i << 2, xsin1);
656 temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
657 temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
658 tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
659 temp1 = vec_madd(temp0022, tempCS01, vczero);
660 tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
661 temp2 = vec_madd(temp1133, tempCS01, vczero);
662 bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);
664 vec_st(bufv_0, (i + 0) << 3, (float*)buf);
666 /* idem with bufv_2 and high-order cosv/sinv */
668 temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
669 temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
670 tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
671 temp1 = vec_madd(temp0022, tempCS01, vczero);
672 tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
673 temp2 = vec_madd(temp1133, tempCS01, vczero);
674 bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);
676 vec_st(bufv_2, (i + 2) << 3, (float*)buf);
678 #endif
681 data_ptr = data;
682 delay_ptr = delay;
683 window_ptr = a52_imdct_window;
685 /* Window and convert to real valued signal */
686 for(i=0; i< 64; i++) {
687 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
688 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
691 for(i=0; i< 64; i++) {
692 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
693 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
696 /* The trailing edge of the window goes into the delay line */
697 delay_ptr = delay;
699 for(i=0; i< 64; i++) {
700 *delay_ptr++ = -buf[64+i].real * *--window_ptr;
701 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
704 for(i=0; i<64; i++) {
705 *delay_ptr++ = buf[i].imag * *--window_ptr;
706 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
709 #endif
712 // Stuff below this line is borrowed from libac3
713 #include "srfftp.h"
714 #if ARCH_X86 || ARCH_X86_64
715 #undef HAVE_AMD3DNOW
716 #define HAVE_AMD3DNOW 1
717 #include "srfftp_3dnow.h"
719 const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
720 const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
721 const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
723 #undef HAVE_AMD3DNOWEXT
724 #define HAVE_AMD3DNOWEXT 0
725 #include "imdct_3dnow.h"
726 #undef HAVE_AMD3DNOWEXT
727 #define HAVE_AMD3DNOWEXT 1
728 #include "imdct_3dnow.h"
730 #if !ARCH_X86_64 || !defined(PIC)
731 void
732 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
734 /* int i,k;
735 int p,q;*/
736 int m;
737 long two_m;
738 long two_m_plus_one;
739 long two_m_plus_one_shl3;
740 complex_t *buf_offset;
742 /* sample_t tmp_a_i;
743 sample_t tmp_a_r;
744 sample_t tmp_b_i;
745 sample_t tmp_b_r;*/
747 sample_t *data_ptr;
748 sample_t *delay_ptr;
749 sample_t *window_ptr;
751 /* 512 IMDCT with source and dest data in 'data' */
752 /* see the c version (dct_do_512()), its allmost identical, just in C */
754 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
755 /* Bit reversed shuffling */
756 __asm__ volatile(
757 "xor %%"REG_S", %%"REG_S" \n\t"
758 "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t"
759 "mov $1008, %%"REG_D" \n\t"
760 "push %%"REG_BP" \n\t" //use ebp without telling gcc
761 ASMALIGN(4)
762 "1: \n\t"
763 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI
764 "movhps 8(%0, %%"REG_D"), %%xmm0 \n\t" // RXXI
765 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // XXXi
766 "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi
767 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR
768 "movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t"
769 "mulps %%xmm0, %%xmm2 \n\t"
770 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI
771 "mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
772 "subps %%xmm0, %%xmm2 \n\t"
773 "movzb (%%"REG_a"), %%"REG_d" \n\t"
774 "movzb 1(%%"REG_a"), %%"REG_BP" \n\t"
775 "movlps %%xmm2, (%1, %%"REG_d", 8) \n\t"
776 "movhps %%xmm2, (%1, %%"REG_BP", 8) \n\t"
777 "add $16, %%"REG_S" \n\t"
778 "add $2, %%"REG_a" \n\t" // avoid complex addressing for P4 crap
779 "sub $16, %%"REG_D" \n\t"
780 "jnc 1b \n\t"
781 "pop %%"REG_BP" \n\t"//no we didnt touch ebp *g*
782 :: "b" (data), "c" (buf)
783 : "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d
787 /* FFT Merge */
788 /* unoptimized variant
789 for (m=1; m < 7; m++) {
790 if(m)
791 two_m = (1 << m);
792 else
793 two_m = 1;
795 two_m_plus_one = (1 << (m+1));
797 for(i = 0; i < 128; i += two_m_plus_one) {
798 for(k = 0; k < two_m; k++) {
799 p = k + i;
800 q = p + two_m;
801 tmp_a_r = buf[p].real;
802 tmp_a_i = buf[p].imag;
803 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
804 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
805 buf[p].real = tmp_a_r + tmp_b_r;
806 buf[p].imag = tmp_a_i + tmp_b_i;
807 buf[q].real = tmp_a_r - tmp_b_r;
808 buf[q].imag = tmp_a_i - tmp_b_i;
814 /* 1. iteration */
815 // Note w[0][0]={1,0}
816 __asm__ volatile(
817 "xorps %%xmm1, %%xmm1 \n\t"
818 "xorps %%xmm2, %%xmm2 \n\t"
819 "mov %0, %%"REG_S" \n\t"
820 ASMALIGN(4)
821 "1: \n\t"
822 "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p]
823 "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q]
824 "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p]
825 "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q]
826 "addps %%xmm1, %%xmm0 \n\t"
827 "subps %%xmm2, %%xmm0 \n\t"
828 "movaps %%xmm0, (%%"REG_S")\n\t"
829 "add $16, %%"REG_S" \n\t"
830 "cmp %1, %%"REG_S" \n\t"
831 " jb 1b \n\t"
832 :: "g" (buf), "r" (buf + 128)
833 : "%"REG_S
836 /* 2. iteration */
837 // Note w[1]={{1,0}, {0,-1}}
838 __asm__ volatile(
839 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
840 "mov %0, %%"REG_S" \n\t"
841 ASMALIGN(4)
842 "1: \n\t"
843 "movaps 16(%%"REG_S"), %%xmm2 \n\t" //r2,i2,r3,i3
844 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3
845 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3
846 "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1
847 "movaps (%%"REG_S"), %%xmm1 \n\t" //r0,i0,r1,i1
848 "addps %%xmm2, %%xmm0 \n\t"
849 "subps %%xmm2, %%xmm1 \n\t"
850 "movaps %%xmm0, (%%"REG_S") \n\t"
851 "movaps %%xmm1, 16(%%"REG_S") \n\t"
852 "add $32, %%"REG_S" \n\t"
853 "cmp %1, %%"REG_S" \n\t"
854 " jb 1b \n\t"
855 :: "g" (buf), "r" (buf + 128)
856 : "%"REG_S
859 /* 3. iteration */
861 Note sseW2+0={1,1,sqrt(2),sqrt(2))
862 Note sseW2+16={0,0,sqrt(2),-sqrt(2))
863 Note sseW2+32={0,0,-sqrt(2),-sqrt(2))
864 Note sseW2+48={1,-1,sqrt(2),-sqrt(2))
866 __asm__ volatile(
867 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
868 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
869 "xorps %%xmm5, %%xmm5 \n\t"
870 "xorps %%xmm2, %%xmm2 \n\t"
871 "mov %0, %%"REG_S" \n\t"
872 ASMALIGN(4)
873 "1: \n\t"
874 "movaps 32(%%"REG_S"), %%xmm2 \n\t" //r4,i4,r5,i5
875 "movaps 48(%%"REG_S"), %%xmm3 \n\t" //r6,i6,r7,i7
876 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
877 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
878 "mulps %%xmm2, %%xmm4 \n\t"
879 "mulps %%xmm3, %%xmm5 \n\t"
880 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5
881 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7
882 "mulps %%xmm6, %%xmm3 \n\t"
883 "mulps %%xmm7, %%xmm2 \n\t"
884 "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1
885 "movaps 16(%%"REG_S"), %%xmm1 \n\t" //r2,i2,r3,i3
886 "addps %%xmm4, %%xmm2 \n\t"
887 "addps %%xmm5, %%xmm3 \n\t"
888 "movaps %%xmm2, %%xmm4 \n\t"
889 "movaps %%xmm3, %%xmm5 \n\t"
890 "addps %%xmm0, %%xmm2 \n\t"
891 "addps %%xmm1, %%xmm3 \n\t"
892 "subps %%xmm4, %%xmm0 \n\t"
893 "subps %%xmm5, %%xmm1 \n\t"
894 "movaps %%xmm2, (%%"REG_S") \n\t"
895 "movaps %%xmm3, 16(%%"REG_S") \n\t"
896 "movaps %%xmm0, 32(%%"REG_S") \n\t"
897 "movaps %%xmm1, 48(%%"REG_S") \n\t"
898 "add $64, %%"REG_S" \n\t"
899 "cmp %1, %%"REG_S" \n\t"
900 " jb 1b \n\t"
901 :: "g" (buf), "r" (buf + 128)
902 : "%"REG_S
905 /* 4-7. iterations */
906 for (m=3; m < 7; m++) {
907 two_m = (1 << m);
908 two_m_plus_one = two_m<<1;
909 two_m_plus_one_shl3 = (two_m_plus_one<<3);
910 buf_offset = buf+128;
911 __asm__ volatile(
912 "mov %0, %%"REG_S" \n\t"
913 ASMALIGN(4)
914 "1: \n\t"
915 "xor %%"REG_D", %%"REG_D" \n\t" // k
916 "lea (%%"REG_S", %3), %%"REG_d" \n\t"
917 "2: \n\t"
918 "movaps (%%"REG_d", %%"REG_D"), %%xmm1 \n\t"
919 "movaps (%4, %%"REG_D", 2), %%xmm2 \n\t"
920 "mulps %%xmm1, %%xmm2 \n\t"
921 "shufps $0xB1, %%xmm1, %%xmm1 \n\t"
922 "mulps 16(%4, %%"REG_D", 2), %%xmm1 \n\t"
923 "movaps (%%"REG_S", %%"REG_D"), %%xmm0 \n\t"
924 "addps %%xmm2, %%xmm1 \n\t"
925 "movaps %%xmm1, %%xmm2 \n\t"
926 "addps %%xmm0, %%xmm1 \n\t"
927 "subps %%xmm2, %%xmm0 \n\t"
928 "movaps %%xmm1, (%%"REG_S", %%"REG_D") \n\t"
929 "movaps %%xmm0, (%%"REG_d", %%"REG_D") \n\t"
930 "add $16, %%"REG_D" \n\t"
931 "cmp %3, %%"REG_D" \n\t" //FIXME (opt) count against 0
932 "jb 2b \n\t"
933 "add %2, %%"REG_S" \n\t"
934 "cmp %1, %%"REG_S" \n\t"
935 " jb 1b \n\t"
936 :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3),
937 "r" (sseW[m])
938 : "%"REG_S, "%"REG_D, "%"REG_d
942 /* Post IFFT complex multiply plus IFFT complex conjugate*/
943 __asm__ volatile(
944 "mov $-1024, %%"REG_S" \n\t"
945 ASMALIGN(4)
946 "1: \n\t"
947 "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
948 "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
949 "shufps $0xB1, %%xmm0, %%xmm0 \n\t"
950 "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t"
951 "mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
952 "addps %%xmm1, %%xmm0 \n\t"
953 "movaps %%xmm0, (%0, %%"REG_S") \n\t"
954 "add $16, %%"REG_S" \n\t"
955 " jnz 1b \n\t"
956 :: "r" (buf+128)
957 : "%"REG_S
961 data_ptr = data;
962 delay_ptr = delay;
963 window_ptr = a52_imdct_window;
965 /* Window and convert to real valued signal */
966 __asm__ volatile(
967 "xor %%"REG_D", %%"REG_D" \n\t" // 0
968 "xor %%"REG_S", %%"REG_S" \n\t" // 0
969 "movss %3, %%xmm2 \n\t" // bias
970 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
971 ASMALIGN(4)
972 "1: \n\t"
973 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
974 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ?
975 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
976 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
977 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
978 "mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
979 "addps (%2, %%"REG_S"), %%xmm0 \n\t"
980 "addps %%xmm2, %%xmm0 \n\t"
981 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
982 "add $16, %%"REG_S" \n\t"
983 "sub $16, %%"REG_D" \n\t"
984 "cmp $512, %%"REG_S" \n\t"
985 " jb 1b \n\t"
986 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
987 : "%"REG_S, "%"REG_D
989 data_ptr+=128;
990 delay_ptr+=128;
991 // window_ptr+=128;
993 __asm__ volatile(
994 "mov $1024, %%"REG_D" \n\t" // 512
995 "xor %%"REG_S", %%"REG_S" \n\t" // 0
996 "movss %3, %%xmm2 \n\t" // bias
997 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
998 ASMALIGN(4)
999 "1: \n\t"
1000 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A
1001 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C
1002 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
1003 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
1004 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1005 "mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1006 "addps (%2, %%"REG_S"), %%xmm0 \n\t"
1007 "addps %%xmm2, %%xmm0 \n\t"
1008 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
1009 "add $16, %%"REG_S" \n\t"
1010 "sub $16, %%"REG_D" \n\t"
1011 "cmp $512, %%"REG_S" \n\t"
1012 " jb 1b \n\t"
1013 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1014 : "%"REG_S, "%"REG_D
1016 data_ptr+=128;
1017 // window_ptr+=128;
1019 /* The trailing edge of the window goes into the delay line */
1020 delay_ptr = delay;
1022 __asm__ volatile(
1023 "xor %%"REG_D", %%"REG_D" \n\t" // 0
1024 "xor %%"REG_S", %%"REG_S" \n\t" // 0
1025 ASMALIGN(4)
1026 "1: \n\t"
1027 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A
1028 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C
1029 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
1030 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
1031 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1032 "mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1033 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
1034 "add $16, %%"REG_S" \n\t"
1035 "sub $16, %%"REG_D" \n\t"
1036 "cmp $512, %%"REG_S" \n\t"
1037 " jb 1b \n\t"
1038 :: "r" (buf+64), "r" (delay_ptr)
1039 : "%"REG_S, "%"REG_D
1041 delay_ptr+=128;
1042 // window_ptr-=128;
1044 __asm__ volatile(
1045 "mov $1024, %%"REG_D" \n\t" // 1024
1046 "xor %%"REG_S", %%"REG_S" \n\t" // 0
1047 ASMALIGN(4)
1048 "1: \n\t"
1049 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
1050 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ?
1051 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
1052 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
1053 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1054 "mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1055 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
1056 "add $16, %%"REG_S" \n\t"
1057 "sub $16, %%"REG_D" \n\t"
1058 "cmp $512, %%"REG_S" \n\t"
1059 " jb 1b \n\t"
1060 :: "r" (buf), "r" (delay_ptr)
1061 : "%"REG_S, "%"REG_D
1064 #endif
1065 #endif // ARCH_X86 || ARCH_X86_64
1067 void a52_imdct_256(sample_t * data, sample_t * delay, sample_t bias)
1069 int i, k;
1070 sample_t t_r, t_i, a_r, a_i, b_r, b_i, c_r, c_i, d_r, d_i, w_1, w_2;
1071 const sample_t * window = a52_imdct_window;
1072 complex_t buf1[64], buf2[64];
1074 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
1075 for (i = 0; i < 64; i++) {
1076 k = fftorder[i];
1077 t_r = pre2[i].real;
1078 t_i = pre2[i].imag;
1080 buf1[i].real = t_i * data[254-k] + t_r * data[k];
1081 buf1[i].imag = t_r * data[254-k] - t_i * data[k];
1083 buf2[i].real = t_i * data[255-k] + t_r * data[k+1];
1084 buf2[i].imag = t_r * data[255-k] - t_i * data[k+1];
1087 ifft64 (buf1);
1088 ifft64 (buf2);
1090 /* Post IFFT complex multiply */
1091 /* Window and convert to real valued signal */
1092 for (i = 0; i < 32; i++) {
1093 /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
1094 t_r = post2[i].real;
1095 t_i = post2[i].imag;
1097 a_r = t_r * buf1[i].real + t_i * buf1[i].imag;
1098 a_i = t_i * buf1[i].real - t_r * buf1[i].imag;
1099 b_r = t_i * buf1[63-i].real + t_r * buf1[63-i].imag;
1100 b_i = t_r * buf1[63-i].real - t_i * buf1[63-i].imag;
1102 c_r = t_r * buf2[i].real + t_i * buf2[i].imag;
1103 c_i = t_i * buf2[i].real - t_r * buf2[i].imag;
1104 d_r = t_i * buf2[63-i].real + t_r * buf2[63-i].imag;
1105 d_i = t_r * buf2[63-i].real - t_i * buf2[63-i].imag;
1107 w_1 = window[2*i];
1108 w_2 = window[255-2*i];
1109 data[2*i] = delay[2*i] * w_2 - a_r * w_1 + bias;
1110 data[255-2*i] = delay[2*i] * w_1 + a_r * w_2 + bias;
1111 delay[2*i] = c_i;
1113 w_1 = window[128+2*i];
1114 w_2 = window[127-2*i];
1115 data[128+2*i] = delay[127-2*i] * w_2 + a_i * w_1 + bias;
1116 data[127-2*i] = delay[127-2*i] * w_1 - a_i * w_2 + bias;
1117 delay[127-2*i] = c_r;
1119 w_1 = window[2*i+1];
1120 w_2 = window[254-2*i];
1121 data[2*i+1] = delay[2*i+1] * w_2 - b_i * w_1 + bias;
1122 data[254-2*i] = delay[2*i+1] * w_1 + b_i * w_2 + bias;
1123 delay[2*i+1] = d_r;
1125 w_1 = window[129+2*i];
1126 w_2 = window[126-2*i];
1127 data[129+2*i] = delay[126-2*i] * w_2 + b_r * w_1 + bias;
1128 data[126-2*i] = delay[126-2*i] * w_1 - b_r * w_2 + bias;
1129 delay[126-2*i] = d_i;
1133 static double besselI0 (double x)
1135 double bessel = 1;
1136 int i = 100;
1139 bessel = bessel * x / (i * i) + 1;
1140 while (--i);
1141 return bessel;
1144 void a52_imdct_init (uint32_t mm_accel)
1146 int i, j, k;
1147 double sum;
1149 /* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */
1150 sum = 0;
1151 for (i = 0; i < 256; i++) {
1152 sum += besselI0 (i * (256 - i) * (5 * M_PI / 256) * (5 * M_PI / 256));
1153 a52_imdct_window[i] = sum;
1155 sum++;
1156 for (i = 0; i < 256; i++)
1157 a52_imdct_window[i] = sqrt (a52_imdct_window[i] / sum);
1159 for (i = 0; i < 3; i++)
1160 roots16[i] = cos ((M_PI / 8) * (i + 1));
1162 for (i = 0; i < 7; i++)
1163 roots32[i] = cos ((M_PI / 16) * (i + 1));
1165 for (i = 0; i < 15; i++)
1166 roots64[i] = cos ((M_PI / 32) * (i + 1));
1168 for (i = 0; i < 31; i++)
1169 roots128[i] = cos ((M_PI / 64) * (i + 1));
1171 for (i = 0; i < 64; i++) {
1172 k = fftorder[i] / 2 + 64;
1173 pre1[i].real = cos ((M_PI / 256) * (k - 0.25));
1174 pre1[i].imag = sin ((M_PI / 256) * (k - 0.25));
1177 for (i = 64; i < 128; i++) {
1178 k = fftorder[i] / 2 + 64;
1179 pre1[i].real = -cos ((M_PI / 256) * (k - 0.25));
1180 pre1[i].imag = -sin ((M_PI / 256) * (k - 0.25));
1183 for (i = 0; i < 64; i++) {
1184 post1[i].real = cos ((M_PI / 256) * (i + 0.5));
1185 post1[i].imag = sin ((M_PI / 256) * (i + 0.5));
1188 for (i = 0; i < 64; i++) {
1189 k = fftorder[i] / 4;
1190 pre2[i].real = cos ((M_PI / 128) * (k - 0.25));
1191 pre2[i].imag = sin ((M_PI / 128) * (k - 0.25));
1194 for (i = 0; i < 32; i++) {
1195 post2[i].real = cos ((M_PI / 128) * (i + 0.5));
1196 post2[i].imag = sin ((M_PI / 128) * (i + 0.5));
1198 for (i = 0; i < 128; i++) {
1199 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
1200 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
1202 for (i = 0; i < 7; i++) {
1203 j = 1 << i;
1204 for (k = 0; k < j; k++) {
1205 w[i][k].real = cos (-M_PI * k / j);
1206 w[i][k].imag = sin (-M_PI * k / j);
1209 #if ARCH_X86 || ARCH_X86_64
1210 for (i = 0; i < 128; i++) {
1211 sseSinCos1c[2*i+0]= xcos1[i];
1212 sseSinCos1c[2*i+1]= -xcos1[i];
1213 sseSinCos1d[2*i+0]= xsin1[i];
1214 sseSinCos1d[2*i+1]= xsin1[i];
1216 for (i = 1; i < 7; i++) {
1217 j = 1 << i;
1218 for (k = 0; k < j; k+=2) {
1220 sseW[i][4*k + 0] = w[i][k+0].real;
1221 sseW[i][4*k + 1] = w[i][k+0].real;
1222 sseW[i][4*k + 2] = w[i][k+1].real;
1223 sseW[i][4*k + 3] = w[i][k+1].real;
1225 sseW[i][4*k + 4] = -w[i][k+0].imag;
1226 sseW[i][4*k + 5] = w[i][k+0].imag;
1227 sseW[i][4*k + 6] = -w[i][k+1].imag;
1228 sseW[i][4*k + 7] = w[i][k+1].imag;
1230 //we multiply more or less uninitalized numbers so we need to use exactly 0.0
1231 if(k==0)
1233 // sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
1234 sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
1237 if(2*k == j)
1239 sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
1240 // sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
1245 for(i=0; i<128; i++)
1247 sseWindow[2*i+0]= -a52_imdct_window[2*i+0];
1248 sseWindow[2*i+1]= a52_imdct_window[2*i+1];
1251 for(i=0; i<64; i++)
1253 sseWindow[256 + 2*i+0]= -a52_imdct_window[254 - 2*i+1];
1254 sseWindow[256 + 2*i+1]= a52_imdct_window[254 - 2*i+0];
1255 sseWindow[384 + 2*i+0]= a52_imdct_window[126 - 2*i+1];
1256 sseWindow[384 + 2*i+1]= -a52_imdct_window[126 - 2*i+0];
1258 #endif
1259 a52_imdct_512 = imdct_do_512;
1260 ifft128 = ifft128_c;
1261 ifft64 = ifft64_c;
1263 #if !ARCH_X86_64 || !defined(PIC)
1264 /* if(mm_accel & MM_ACCEL_X86_SSE)
1266 fprintf (stderr, "Using SSE optimized IMDCT transform\n");
1267 a52_imdct_512 = imdct_do_512_sse;
1269 else*/
1270 if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
1272 fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
1273 a52_imdct_512 = imdct_do_512_3dnowex;
1275 else
1276 if(mm_accel & MM_ACCEL_X86_3DNOW)
1278 fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
1279 a52_imdct_512 = imdct_do_512_3dnow;
1281 else
1282 #endif // ARCH_X86 || ARCH_X86_64
1283 #if HAVE_ALTIVEC
1284 if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
1286 fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
1287 a52_imdct_512 = imdct_do_512_altivec;
1289 else
1290 #endif
1292 #ifdef LIBA52_DJBFFT
1293 if (mm_accel & MM_ACCEL_DJBFFT) {
1294 fprintf (stderr, "Using djbfft for IMDCT transform\n");
1295 ifft128 = (void (*) (complex_t *)) fftc4_un128;
1296 ifft64 = (void (*) (complex_t *)) fftc4_un64;
1297 } else
1298 #endif
1300 fprintf (stderr, "No accelerated IMDCT transform found\n");