Remove the grayscale and 4:2:2 RTjpeg code, it is neither used nor is there
[mplayer/glamo.git] / liba52 / imdct.c
blobe16038855e577d338b00e7fcb3c388789bac664a
1 /*
2 * imdct.c
3 * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
6 * The ifft algorithms in this file have been largely inspired by Dan
7 * Bernstein's work, djbfft, available at http://cr.yp.to/djbfft.html
9 * This file is part of a52dec, a free ATSC A-52 stream decoder.
10 * See http://liba52.sourceforge.net/ for updates.
12 * Modified for use with MPlayer, changes contained in liba52_changes.diff.
13 * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/
14 * $Id$
16 * a52dec is free software; you can redistribute it and/or modify
17 * it under the terms of the GNU General Public License as published by
18 * the Free Software Foundation; either version 2 of the License, or
19 * (at your option) any later version.
21 * a52dec is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU General Public License for more details.
26 * You should have received a copy of the GNU General Public License
27 * along with this program; if not, write to the Free Software
28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
31 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
32 * michael did port them from libac3 (untested, perhaps totally broken)
33 * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
36 #include "config.h"
38 #include <math.h>
39 #include <stdio.h>
40 #ifdef LIBA52_DJBFFT
41 #include <fftc4.h>
42 #endif
43 #ifndef M_PI
44 #define M_PI 3.1415926535897932384626433832795029
45 #endif
46 #include <inttypes.h>
48 #include "a52.h"
49 #include "a52_internal.h"
50 #include "mm_accel.h"
51 #include "mangle.h"
53 void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias);
55 #ifdef RUNTIME_CPUDETECT
56 #undef HAVE_AMD3DNOWEXT
57 #define HAVE_AMD3DNOWEXT 0
58 #endif
60 typedef struct complex_s {
61 sample_t real;
62 sample_t imag;
63 } complex_t;
65 static const int pm128[128] attribute_used __attribute__((aligned(16))) =
67 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
68 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
69 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
70 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
71 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
72 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
73 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
74 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
75 };
77 static uint8_t attribute_used bit_reverse_512[] = {
78 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
79 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
80 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
81 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
82 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
83 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
84 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
85 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
86 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
87 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
88 0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
89 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
90 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
91 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
92 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
93 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
95 static uint8_t fftorder[] = {
96 0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176,
97 8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88,
98 4,132, 68,196, 36,164,228,100, 20,148, 84,212,244,116, 52,180,
99 252,124, 60,188, 28,156,220, 92, 12,140, 76,204,236,108, 44,172,
100 2,130, 66,194, 34,162,226, 98, 18,146, 82,210,242,114, 50,178,
101 10,138, 74,202, 42,170,234,106,250,122, 58,186, 26,154,218, 90,
102 254,126, 62,190, 30,158,222, 94, 14,142, 78,206,238,110, 46,174,
103 6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86
106 static complex_t __attribute__((aligned(16))) buf[128];
108 /* Twiddle factor LUT */
109 static complex_t __attribute__((aligned(16))) w_1[1];
110 static complex_t __attribute__((aligned(16))) w_2[2];
111 static complex_t __attribute__((aligned(16))) w_4[4];
112 static complex_t __attribute__((aligned(16))) w_8[8];
113 static complex_t __attribute__((aligned(16))) w_16[16];
114 static complex_t __attribute__((aligned(16))) w_32[32];
115 static complex_t __attribute__((aligned(16))) w_64[64];
116 static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
118 /* Twiddle factors for IMDCT */
119 static sample_t __attribute__((aligned(16))) xcos1[128];
120 static sample_t __attribute__((aligned(16))) xsin1[128];
122 #if ARCH_X86 || ARCH_X86_64
123 // NOTE: SSE needs 16byte alignment or it will segfault
125 static float __attribute__((aligned(16))) sseSinCos1c[256];
126 static float __attribute__((aligned(16))) sseSinCos1d[256];
127 static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
128 //static float __attribute__((aligned(16))) sseW0[4];
129 static float __attribute__((aligned(16))) sseW1[8];
130 static float __attribute__((aligned(16))) sseW2[16];
131 static float __attribute__((aligned(16))) sseW3[32];
132 static float __attribute__((aligned(16))) sseW4[64];
133 static float __attribute__((aligned(16))) sseW5[128];
134 static float __attribute__((aligned(16))) sseW6[256];
135 static float __attribute__((aligned(16))) *sseW[7]=
136 {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
137 static float __attribute__((aligned(16))) sseWindow[512];
138 #endif
140 /* Root values for IFFT */
141 static sample_t roots16[3];
142 static sample_t roots32[7];
143 static sample_t roots64[15];
144 static sample_t roots128[31];
146 /* Twiddle factors for IMDCT */
147 static complex_t pre1[128];
148 static complex_t post1[64];
149 static complex_t pre2[64];
150 static complex_t post2[32];
152 static sample_t a52_imdct_window[256];
154 static void (* ifft128) (complex_t * buf);
155 static void (* ifft64) (complex_t * buf);
157 static inline void ifft2 (complex_t * buf)
159 double r, i;
161 r = buf[0].real;
162 i = buf[0].imag;
163 buf[0].real += buf[1].real;
164 buf[0].imag += buf[1].imag;
165 buf[1].real = r - buf[1].real;
166 buf[1].imag = i - buf[1].imag;
169 static inline void ifft4 (complex_t * buf)
171 double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
173 tmp1 = buf[0].real + buf[1].real;
174 tmp2 = buf[3].real + buf[2].real;
175 tmp3 = buf[0].imag + buf[1].imag;
176 tmp4 = buf[2].imag + buf[3].imag;
177 tmp5 = buf[0].real - buf[1].real;
178 tmp6 = buf[0].imag - buf[1].imag;
179 tmp7 = buf[2].imag - buf[3].imag;
180 tmp8 = buf[3].real - buf[2].real;
182 buf[0].real = tmp1 + tmp2;
183 buf[0].imag = tmp3 + tmp4;
184 buf[2].real = tmp1 - tmp2;
185 buf[2].imag = tmp3 - tmp4;
186 buf[1].real = tmp5 + tmp7;
187 buf[1].imag = tmp6 + tmp8;
188 buf[3].real = tmp5 - tmp7;
189 buf[3].imag = tmp6 - tmp8;
192 /* the basic split-radix ifft butterfly */
194 #define BUTTERFLY(a0,a1,a2,a3,wr,wi) do { \
195 tmp5 = a2.real * wr + a2.imag * wi; \
196 tmp6 = a2.imag * wr - a2.real * wi; \
197 tmp7 = a3.real * wr - a3.imag * wi; \
198 tmp8 = a3.imag * wr + a3.real * wi; \
199 tmp1 = tmp5 + tmp7; \
200 tmp2 = tmp6 + tmp8; \
201 tmp3 = tmp6 - tmp8; \
202 tmp4 = tmp7 - tmp5; \
203 a2.real = a0.real - tmp1; \
204 a2.imag = a0.imag - tmp2; \
205 a3.real = a1.real - tmp3; \
206 a3.imag = a1.imag - tmp4; \
207 a0.real += tmp1; \
208 a0.imag += tmp2; \
209 a1.real += tmp3; \
210 a1.imag += tmp4; \
211 } while (0)
213 /* split-radix ifft butterfly, specialized for wr=1 wi=0 */
215 #define BUTTERFLY_ZERO(a0,a1,a2,a3) do { \
216 tmp1 = a2.real + a3.real; \
217 tmp2 = a2.imag + a3.imag; \
218 tmp3 = a2.imag - a3.imag; \
219 tmp4 = a3.real - a2.real; \
220 a2.real = a0.real - tmp1; \
221 a2.imag = a0.imag - tmp2; \
222 a3.real = a1.real - tmp3; \
223 a3.imag = a1.imag - tmp4; \
224 a0.real += tmp1; \
225 a0.imag += tmp2; \
226 a1.real += tmp3; \
227 a1.imag += tmp4; \
228 } while (0)
230 /* split-radix ifft butterfly, specialized for wr=wi */
232 #define BUTTERFLY_HALF(a0,a1,a2,a3,w) do { \
233 tmp5 = (a2.real + a2.imag) * w; \
234 tmp6 = (a2.imag - a2.real) * w; \
235 tmp7 = (a3.real - a3.imag) * w; \
236 tmp8 = (a3.imag + a3.real) * w; \
237 tmp1 = tmp5 + tmp7; \
238 tmp2 = tmp6 + tmp8; \
239 tmp3 = tmp6 - tmp8; \
240 tmp4 = tmp7 - tmp5; \
241 a2.real = a0.real - tmp1; \
242 a2.imag = a0.imag - tmp2; \
243 a3.real = a1.real - tmp3; \
244 a3.imag = a1.imag - tmp4; \
245 a0.real += tmp1; \
246 a0.imag += tmp2; \
247 a1.real += tmp3; \
248 a1.imag += tmp4; \
249 } while (0)
251 static inline void ifft8 (complex_t * buf)
253 double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
255 ifft4 (buf);
256 ifft2 (buf + 4);
257 ifft2 (buf + 6);
258 BUTTERFLY_ZERO (buf[0], buf[2], buf[4], buf[6]);
259 BUTTERFLY_HALF (buf[1], buf[3], buf[5], buf[7], roots16[1]);
262 static void ifft_pass (complex_t * buf, sample_t * weight, int n)
264 complex_t * buf1;
265 complex_t * buf2;
266 complex_t * buf3;
267 double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
268 int i;
270 buf++;
271 buf1 = buf + n;
272 buf2 = buf + 2 * n;
273 buf3 = buf + 3 * n;
275 BUTTERFLY_ZERO (buf[-1], buf1[-1], buf2[-1], buf3[-1]);
277 i = n - 1;
279 do {
280 BUTTERFLY (buf[0], buf1[0], buf2[0], buf3[0], weight[n], weight[2*i]);
281 buf++;
282 buf1++;
283 buf2++;
284 buf3++;
285 weight++;
286 } while (--i);
289 static void ifft16 (complex_t * buf)
291 ifft8 (buf);
292 ifft4 (buf + 8);
293 ifft4 (buf + 12);
294 ifft_pass (buf, roots16 - 4, 4);
297 static void ifft32 (complex_t * buf)
299 ifft16 (buf);
300 ifft8 (buf + 16);
301 ifft8 (buf + 24);
302 ifft_pass (buf, roots32 - 8, 8);
305 static void ifft64_c (complex_t * buf)
307 ifft32 (buf);
308 ifft16 (buf + 32);
309 ifft16 (buf + 48);
310 ifft_pass (buf, roots64 - 16, 16);
313 static void ifft128_c (complex_t * buf)
315 ifft32 (buf);
316 ifft16 (buf + 32);
317 ifft16 (buf + 48);
318 ifft_pass (buf, roots64 - 16, 16);
320 ifft32 (buf + 64);
321 ifft32 (buf + 96);
322 ifft_pass (buf, roots128 - 32, 32);
325 void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias)
327 int i, k;
328 sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2;
329 const sample_t * window = a52_imdct_window;
330 complex_t buf[128];
332 for (i = 0; i < 128; i++) {
333 k = fftorder[i];
334 t_r = pre1[i].real;
335 t_i = pre1[i].imag;
337 buf[i].real = t_i * data[255-k] + t_r * data[k];
338 buf[i].imag = t_r * data[255-k] - t_i * data[k];
341 ifft128 (buf);
343 /* Post IFFT complex multiply plus IFFT complex conjugate*/
344 /* Window and convert to real valued signal */
345 for (i = 0; i < 64; i++) {
346 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
347 t_r = post1[i].real;
348 t_i = post1[i].imag;
350 a_r = t_r * buf[i].real + t_i * buf[i].imag;
351 a_i = t_i * buf[i].real - t_r * buf[i].imag;
352 b_r = t_i * buf[127-i].real + t_r * buf[127-i].imag;
353 b_i = t_r * buf[127-i].real - t_i * buf[127-i].imag;
355 w_1 = window[2*i];
356 w_2 = window[255-2*i];
357 data[2*i] = delay[2*i] * w_2 - a_r * w_1 + bias;
358 data[255-2*i] = delay[2*i] * w_1 + a_r * w_2 + bias;
359 delay[2*i] = a_i;
361 w_1 = window[2*i+1];
362 w_2 = window[254-2*i];
363 data[2*i+1] = delay[2*i+1] * w_2 + b_r * w_1 + bias;
364 data[254-2*i] = delay[2*i+1] * w_1 - b_r * w_2 + bias;
365 delay[2*i+1] = b_i;
369 #if HAVE_ALTIVEC
371 #ifdef HAVE_ALTIVEC_H
372 #include <altivec.h>
373 #endif
375 // used to build registers permutation vectors (vcprm)
376 // the 's' are for words in the _s_econd vector
377 #define WORD_0 0x00,0x01,0x02,0x03
378 #define WORD_1 0x04,0x05,0x06,0x07
379 #define WORD_2 0x08,0x09,0x0a,0x0b
380 #define WORD_3 0x0c,0x0d,0x0e,0x0f
381 #define WORD_s0 0x10,0x11,0x12,0x13
382 #define WORD_s1 0x14,0x15,0x16,0x17
383 #define WORD_s2 0x18,0x19,0x1a,0x1b
384 #define WORD_s3 0x1c,0x1d,0x1e,0x1f
386 #define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
387 #define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
389 #define FOUROF(a) {a,a,a,a}
391 // vcprmle is used to keep the same index as in the SSE version.
392 // it's the same as vcprm, with the index inversed
393 // ('le' is Little Endian)
394 #define vcprmle(a,b,c,d) vcprm(d,c,b,a)
396 // used to build inverse/identity vectors (vcii)
397 // n is _n_egative, p is _p_ositive
398 #define FLOAT_n -1.
399 #define FLOAT_p 1.
402 void
403 imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
405 int i;
406 int k;
407 int p,q;
408 int m;
409 long two_m;
410 long two_m_plus_one;
412 sample_t tmp_b_i;
413 sample_t tmp_b_r;
414 sample_t tmp_a_i;
415 sample_t tmp_a_r;
417 sample_t *data_ptr;
418 sample_t *delay_ptr;
419 sample_t *window_ptr;
421 /* 512 IMDCT with source and dest data in 'data' */
423 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
424 for( i=0; i < 128; i++) {
425 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
426 int j= bit_reverse_512[i];
427 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
428 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
431 /* 1. iteration */
432 for(i = 0; i < 128; i += 2) {
433 #if 0
434 tmp_a_r = buf[i].real;
435 tmp_a_i = buf[i].imag;
436 tmp_b_r = buf[i+1].real;
437 tmp_b_i = buf[i+1].imag;
438 buf[i].real = tmp_a_r + tmp_b_r;
439 buf[i].imag = tmp_a_i + tmp_b_i;
440 buf[i+1].real = tmp_a_r - tmp_b_r;
441 buf[i+1].imag = tmp_a_i - tmp_b_i;
442 #else
443 vector float temp, bufv;
445 bufv = vec_ld(i << 3, (float*)buf);
446 temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
447 bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
448 vec_st(bufv, i << 3, (float*)buf);
449 #endif
452 /* 2. iteration */
453 // Note w[1]={{1,0}, {0,-1}}
454 for(i = 0; i < 128; i += 4) {
455 #if 0
456 tmp_a_r = buf[i].real;
457 tmp_a_i = buf[i].imag;
458 tmp_b_r = buf[i+2].real;
459 tmp_b_i = buf[i+2].imag;
460 buf[i].real = tmp_a_r + tmp_b_r;
461 buf[i].imag = tmp_a_i + tmp_b_i;
462 buf[i+2].real = tmp_a_r - tmp_b_r;
463 buf[i+2].imag = tmp_a_i - tmp_b_i;
464 tmp_a_r = buf[i+1].real;
465 tmp_a_i = buf[i+1].imag;
466 /* WARNING: im <-> re here ! */
467 tmp_b_r = buf[i+3].imag;
468 tmp_b_i = buf[i+3].real;
469 buf[i+1].real = tmp_a_r + tmp_b_r;
470 buf[i+1].imag = tmp_a_i - tmp_b_i;
471 buf[i+3].real = tmp_a_r - tmp_b_r;
472 buf[i+3].imag = tmp_a_i + tmp_b_i;
473 #else
474 vector float buf01, buf23, temp1, temp2;
476 buf01 = vec_ld((i + 0) << 3, (float*)buf);
477 buf23 = vec_ld((i + 2) << 3, (float*)buf);
478 buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));
480 temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
481 temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);
483 vec_st(temp1, (i + 0) << 3, (float*)buf);
484 vec_st(temp2, (i + 2) << 3, (float*)buf);
485 #endif
488 /* 3. iteration */
489 for(i = 0; i < 128; i += 8) {
490 #if 0
491 tmp_a_r = buf[i].real;
492 tmp_a_i = buf[i].imag;
493 tmp_b_r = buf[i+4].real;
494 tmp_b_i = buf[i+4].imag;
495 buf[i].real = tmp_a_r + tmp_b_r;
496 buf[i].imag = tmp_a_i + tmp_b_i;
497 buf[i+4].real = tmp_a_r - tmp_b_r;
498 buf[i+4].imag = tmp_a_i - tmp_b_i;
499 tmp_a_r = buf[1+i].real;
500 tmp_a_i = buf[1+i].imag;
501 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
502 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
503 buf[1+i].real = tmp_a_r + tmp_b_r;
504 buf[1+i].imag = tmp_a_i + tmp_b_i;
505 buf[i+5].real = tmp_a_r - tmp_b_r;
506 buf[i+5].imag = tmp_a_i - tmp_b_i;
507 tmp_a_r = buf[i+2].real;
508 tmp_a_i = buf[i+2].imag;
509 /* WARNING re <-> im & sign */
510 tmp_b_r = buf[i+6].imag;
511 tmp_b_i = - buf[i+6].real;
512 buf[i+2].real = tmp_a_r + tmp_b_r;
513 buf[i+2].imag = tmp_a_i + tmp_b_i;
514 buf[i+6].real = tmp_a_r - tmp_b_r;
515 buf[i+6].imag = tmp_a_i - tmp_b_i;
516 tmp_a_r = buf[i+3].real;
517 tmp_a_i = buf[i+3].imag;
518 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
519 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
520 buf[i+3].real = tmp_a_r + tmp_b_r;
521 buf[i+3].imag = tmp_a_i + tmp_b_i;
522 buf[i+7].real = tmp_a_r - tmp_b_r;
523 buf[i+7].imag = tmp_a_i - tmp_b_i;
524 #else
525 vector float buf01, buf23, buf45, buf67;
527 buf01 = vec_ld((i + 0) << 3, (float*)buf);
528 buf23 = vec_ld((i + 2) << 3, (float*)buf);
530 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
531 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
532 buf[i+5].real = tmp_b_r;
533 buf[i+5].imag = tmp_b_i;
534 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
535 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
536 buf[i+7].real = tmp_b_r;
537 buf[i+7].imag = tmp_b_i;
539 buf23 = vec_ld((i + 2) << 3, (float*)buf);
540 buf45 = vec_ld((i + 4) << 3, (float*)buf);
541 buf67 = vec_ld((i + 6) << 3, (float*)buf);
542 buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));
544 vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
545 vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
546 vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
547 vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
548 #endif
551 /* 4-7. iterations */
552 for (m=3; m < 7; m++) {
553 two_m = (1 << m);
555 two_m_plus_one = two_m<<1;
557 for(i = 0; i < 128; i += two_m_plus_one) {
558 for(k = 0; k < two_m; k+=2) {
559 #if 0
560 int p = k + i;
561 int q = p + two_m;
562 tmp_a_r = buf[p].real;
563 tmp_a_i = buf[p].imag;
564 tmp_b_r =
565 buf[q].real * w[m][k].real -
566 buf[q].imag * w[m][k].imag;
567 tmp_b_i =
568 buf[q].imag * w[m][k].real +
569 buf[q].real * w[m][k].imag;
570 buf[p].real = tmp_a_r + tmp_b_r;
571 buf[p].imag = tmp_a_i + tmp_b_i;
572 buf[q].real = tmp_a_r - tmp_b_r;
573 buf[q].imag = tmp_a_i - tmp_b_i;
575 tmp_a_r = buf[(p + 1)].real;
576 tmp_a_i = buf[(p + 1)].imag;
577 tmp_b_r =
578 buf[(q + 1)].real * w[m][(k + 1)].real -
579 buf[(q + 1)].imag * w[m][(k + 1)].imag;
580 tmp_b_i =
581 buf[(q + 1)].imag * w[m][(k + 1)].real +
582 buf[(q + 1)].real * w[m][(k + 1)].imag;
583 buf[(p + 1)].real = tmp_a_r + tmp_b_r;
584 buf[(p + 1)].imag = tmp_a_i + tmp_b_i;
585 buf[(q + 1)].real = tmp_a_r - tmp_b_r;
586 buf[(q + 1)].imag = tmp_a_i - tmp_b_i;
587 #else
588 int p = k + i;
589 int q = p + two_m;
590 vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
591 const vector float vczero = (const vector float)FOUROF(0.);
592 // first compute buf[q] and buf[q+1]
593 vecq = vec_ld(q << 3, (float*)buf);
594 vecw = vec_ld(0, (float*)&(w[m][k]));
595 temp1 = vec_madd(vecq, vecw, vczero);
596 temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
597 temp2 = vec_madd(temp2, vecw, vczero);
598 temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
599 temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
600 vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
601 // then butterfly with buf[p] and buf[p+1]
602 vecp = vec_ld(p << 3, (float*)buf);
604 temp1 = vec_add(vecp, vecq);
605 temp2 = vec_sub(vecp, vecq);
607 vec_st(temp1, p << 3, (float*)buf);
608 vec_st(temp2, q << 3, (float*)buf);
609 #endif
614 /* Post IFFT complex multiply plus IFFT complex conjugate*/
615 for( i=0; i < 128; i+=4) {
616 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
617 #if 0
618 tmp_a_r = buf[(i + 0)].real;
619 tmp_a_i = -1.0 * buf[(i + 0)].imag;
620 buf[(i + 0)].real =
621 (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]);
622 buf[(i + 0)].imag =
623 (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]);
625 tmp_a_r = buf[(i + 1)].real;
626 tmp_a_i = -1.0 * buf[(i + 1)].imag;
627 buf[(i + 1)].real =
628 (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]);
629 buf[(i + 1)].imag =
630 (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]);
632 tmp_a_r = buf[(i + 2)].real;
633 tmp_a_i = -1.0 * buf[(i + 2)].imag;
634 buf[(i + 2)].real =
635 (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]);
636 buf[(i + 2)].imag =
637 (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]);
639 tmp_a_r = buf[(i + 3)].real;
640 tmp_a_i = -1.0 * buf[(i + 3)].imag;
641 buf[(i + 3)].real =
642 (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]);
643 buf[(i + 3)].imag =
644 (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]);
645 #else
646 vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
647 vector float temp0022, temp1133, tempCS01;
648 const vector float vczero = (const vector float)FOUROF(0.);
650 bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
651 bufv_2 = vec_ld((i + 2) << 3, (float*)buf);
653 cosv = vec_ld(i << 2, xcos1);
654 sinv = vec_ld(i << 2, xsin1);
656 temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
657 temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
658 tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
659 temp1 = vec_madd(temp0022, tempCS01, vczero);
660 tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
661 temp2 = vec_madd(temp1133, tempCS01, vczero);
662 bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);
664 vec_st(bufv_0, (i + 0) << 3, (float*)buf);
666 /* idem with bufv_2 and high-order cosv/sinv */
668 temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
669 temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
670 tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
671 temp1 = vec_madd(temp0022, tempCS01, vczero);
672 tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
673 temp2 = vec_madd(temp1133, tempCS01, vczero);
674 bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);
676 vec_st(bufv_2, (i + 2) << 3, (float*)buf);
678 #endif
681 data_ptr = data;
682 delay_ptr = delay;
683 window_ptr = a52_imdct_window;
685 /* Window and convert to real valued signal */
686 for(i=0; i< 64; i++) {
687 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
688 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
691 for(i=0; i< 64; i++) {
692 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
693 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
696 /* The trailing edge of the window goes into the delay line */
697 delay_ptr = delay;
699 for(i=0; i< 64; i++) {
700 *delay_ptr++ = -buf[64+i].real * *--window_ptr;
701 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
704 for(i=0; i<64; i++) {
705 *delay_ptr++ = buf[i].imag * *--window_ptr;
706 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
709 #endif
712 // Stuff below this line is borrowed from libac3
713 #include "srfftp.h"
714 #if ARCH_X86 || ARCH_X86_64
715 #undef HAVE_AMD3DNOW
716 #define HAVE_AMD3DNOW 1
717 #include "srfftp_3dnow.h"
719 const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
720 const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
721 const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
723 #undef HAVE_AMD3DNOWEXT
724 #define HAVE_AMD3DNOWEXT 0
725 #include "imdct_3dnow.h"
726 #undef HAVE_AMD3DNOWEXT
727 #define HAVE_AMD3DNOWEXT 1
728 #include "imdct_3dnow.h"
730 void
731 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
733 /* int i,k;
734 int p,q;*/
735 int m;
736 long two_m;
737 long two_m_plus_one;
738 long two_m_plus_one_shl3;
739 complex_t *buf_offset;
741 /* sample_t tmp_a_i;
742 sample_t tmp_a_r;
743 sample_t tmp_b_i;
744 sample_t tmp_b_r;*/
746 sample_t *data_ptr;
747 sample_t *delay_ptr;
748 sample_t *window_ptr;
750 /* 512 IMDCT with source and dest data in 'data' */
751 /* see the c version (dct_do_512()), its allmost identical, just in C */
753 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
754 /* Bit reversed shuffling */
755 __asm__ volatile(
756 "xor %%"REG_S", %%"REG_S" \n\t"
757 "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t"
758 "mov $1008, %%"REG_D" \n\t"
759 "push %%"REG_BP" \n\t" //use ebp without telling gcc
760 ASMALIGN(4)
761 "1: \n\t"
762 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI
763 "movhps 8(%0, %%"REG_D"), %%xmm0 \n\t" // RXXI
764 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // XXXi
765 "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi
766 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR
767 "movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t"
768 "mulps %%xmm0, %%xmm2 \n\t"
769 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI
770 "mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
771 "subps %%xmm0, %%xmm2 \n\t"
772 "movzb (%%"REG_a"), %%"REG_d" \n\t"
773 "movzb 1(%%"REG_a"), %%"REG_BP" \n\t"
774 "movlps %%xmm2, (%1, %%"REG_d", 8) \n\t"
775 "movhps %%xmm2, (%1, %%"REG_BP", 8) \n\t"
776 "add $16, %%"REG_S" \n\t"
777 "add $2, %%"REG_a" \n\t" // avoid complex addressing for P4 crap
778 "sub $16, %%"REG_D" \n\t"
779 "jnc 1b \n\t"
780 "pop %%"REG_BP" \n\t"//no we didnt touch ebp *g*
781 :: "b" (data), "c" (buf)
782 : "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d
786 /* FFT Merge */
787 /* unoptimized variant
788 for (m=1; m < 7; m++) {
789 if(m)
790 two_m = (1 << m);
791 else
792 two_m = 1;
794 two_m_plus_one = (1 << (m+1));
796 for(i = 0; i < 128; i += two_m_plus_one) {
797 for(k = 0; k < two_m; k++) {
798 p = k + i;
799 q = p + two_m;
800 tmp_a_r = buf[p].real;
801 tmp_a_i = buf[p].imag;
802 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
803 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
804 buf[p].real = tmp_a_r + tmp_b_r;
805 buf[p].imag = tmp_a_i + tmp_b_i;
806 buf[q].real = tmp_a_r - tmp_b_r;
807 buf[q].imag = tmp_a_i - tmp_b_i;
813 /* 1. iteration */
814 // Note w[0][0]={1,0}
815 __asm__ volatile(
816 "xorps %%xmm1, %%xmm1 \n\t"
817 "xorps %%xmm2, %%xmm2 \n\t"
818 "mov %0, %%"REG_S" \n\t"
819 ASMALIGN(4)
820 "1: \n\t"
821 "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p]
822 "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q]
823 "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p]
824 "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q]
825 "addps %%xmm1, %%xmm0 \n\t"
826 "subps %%xmm2, %%xmm0 \n\t"
827 "movaps %%xmm0, (%%"REG_S")\n\t"
828 "add $16, %%"REG_S" \n\t"
829 "cmp %1, %%"REG_S" \n\t"
830 " jb 1b \n\t"
831 :: "g" (buf), "r" (buf + 128)
832 : "%"REG_S
835 /* 2. iteration */
836 // Note w[1]={{1,0}, {0,-1}}
837 __asm__ volatile(
838 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
839 "mov %0, %%"REG_S" \n\t"
840 ASMALIGN(4)
841 "1: \n\t"
842 "movaps 16(%%"REG_S"), %%xmm2 \n\t" //r2,i2,r3,i3
843 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3
844 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3
845 "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1
846 "movaps (%%"REG_S"), %%xmm1 \n\t" //r0,i0,r1,i1
847 "addps %%xmm2, %%xmm0 \n\t"
848 "subps %%xmm2, %%xmm1 \n\t"
849 "movaps %%xmm0, (%%"REG_S") \n\t"
850 "movaps %%xmm1, 16(%%"REG_S") \n\t"
851 "add $32, %%"REG_S" \n\t"
852 "cmp %1, %%"REG_S" \n\t"
853 " jb 1b \n\t"
854 :: "g" (buf), "r" (buf + 128)
855 : "%"REG_S
858 /* 3. iteration */
860 Note sseW2+0={1,1,sqrt(2),sqrt(2))
861 Note sseW2+16={0,0,sqrt(2),-sqrt(2))
862 Note sseW2+32={0,0,-sqrt(2),-sqrt(2))
863 Note sseW2+48={1,-1,sqrt(2),-sqrt(2))
865 __asm__ volatile(
866 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
867 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
868 "xorps %%xmm5, %%xmm5 \n\t"
869 "xorps %%xmm2, %%xmm2 \n\t"
870 "mov %0, %%"REG_S" \n\t"
871 ASMALIGN(4)
872 "1: \n\t"
873 "movaps 32(%%"REG_S"), %%xmm2 \n\t" //r4,i4,r5,i5
874 "movaps 48(%%"REG_S"), %%xmm3 \n\t" //r6,i6,r7,i7
875 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
876 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
877 "mulps %%xmm2, %%xmm4 \n\t"
878 "mulps %%xmm3, %%xmm5 \n\t"
879 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5
880 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7
881 "mulps %%xmm6, %%xmm3 \n\t"
882 "mulps %%xmm7, %%xmm2 \n\t"
883 "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1
884 "movaps 16(%%"REG_S"), %%xmm1 \n\t" //r2,i2,r3,i3
885 "addps %%xmm4, %%xmm2 \n\t"
886 "addps %%xmm5, %%xmm3 \n\t"
887 "movaps %%xmm2, %%xmm4 \n\t"
888 "movaps %%xmm3, %%xmm5 \n\t"
889 "addps %%xmm0, %%xmm2 \n\t"
890 "addps %%xmm1, %%xmm3 \n\t"
891 "subps %%xmm4, %%xmm0 \n\t"
892 "subps %%xmm5, %%xmm1 \n\t"
893 "movaps %%xmm2, (%%"REG_S") \n\t"
894 "movaps %%xmm3, 16(%%"REG_S") \n\t"
895 "movaps %%xmm0, 32(%%"REG_S") \n\t"
896 "movaps %%xmm1, 48(%%"REG_S") \n\t"
897 "add $64, %%"REG_S" \n\t"
898 "cmp %1, %%"REG_S" \n\t"
899 " jb 1b \n\t"
900 :: "g" (buf), "r" (buf + 128)
901 : "%"REG_S
904 /* 4-7. iterations */
905 for (m=3; m < 7; m++) {
906 two_m = (1 << m);
907 two_m_plus_one = two_m<<1;
908 two_m_plus_one_shl3 = (two_m_plus_one<<3);
909 buf_offset = buf+128;
910 __asm__ volatile(
911 "mov %0, %%"REG_S" \n\t"
912 ASMALIGN(4)
913 "1: \n\t"
914 "xor %%"REG_D", %%"REG_D" \n\t" // k
915 "lea (%%"REG_S", %3), %%"REG_d" \n\t"
916 "2: \n\t"
917 "movaps (%%"REG_d", %%"REG_D"), %%xmm1 \n\t"
918 "movaps (%4, %%"REG_D", 2), %%xmm2 \n\t"
919 "mulps %%xmm1, %%xmm2 \n\t"
920 "shufps $0xB1, %%xmm1, %%xmm1 \n\t"
921 "mulps 16(%4, %%"REG_D", 2), %%xmm1 \n\t"
922 "movaps (%%"REG_S", %%"REG_D"), %%xmm0 \n\t"
923 "addps %%xmm2, %%xmm1 \n\t"
924 "movaps %%xmm1, %%xmm2 \n\t"
925 "addps %%xmm0, %%xmm1 \n\t"
926 "subps %%xmm2, %%xmm0 \n\t"
927 "movaps %%xmm1, (%%"REG_S", %%"REG_D") \n\t"
928 "movaps %%xmm0, (%%"REG_d", %%"REG_D") \n\t"
929 "add $16, %%"REG_D" \n\t"
930 "cmp %3, %%"REG_D" \n\t" //FIXME (opt) count against 0
931 "jb 2b \n\t"
932 "add %2, %%"REG_S" \n\t"
933 "cmp %1, %%"REG_S" \n\t"
934 " jb 1b \n\t"
935 :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3),
936 "r" (sseW[m])
937 : "%"REG_S, "%"REG_D, "%"REG_d
941 /* Post IFFT complex multiply plus IFFT complex conjugate*/
942 __asm__ volatile(
943 "mov $-1024, %%"REG_S" \n\t"
944 ASMALIGN(4)
945 "1: \n\t"
946 "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
947 "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
948 "shufps $0xB1, %%xmm0, %%xmm0 \n\t"
949 "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t"
950 "mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
951 "addps %%xmm1, %%xmm0 \n\t"
952 "movaps %%xmm0, (%0, %%"REG_S") \n\t"
953 "add $16, %%"REG_S" \n\t"
954 " jnz 1b \n\t"
955 :: "r" (buf+128)
956 : "%"REG_S
960 data_ptr = data;
961 delay_ptr = delay;
962 window_ptr = a52_imdct_window;
964 /* Window and convert to real valued signal */
965 __asm__ volatile(
966 "xor %%"REG_D", %%"REG_D" \n\t" // 0
967 "xor %%"REG_S", %%"REG_S" \n\t" // 0
968 "movss %3, %%xmm2 \n\t" // bias
969 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
970 ASMALIGN(4)
971 "1: \n\t"
972 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
973 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ?
974 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
975 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
976 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
977 "mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
978 "addps (%2, %%"REG_S"), %%xmm0 \n\t"
979 "addps %%xmm2, %%xmm0 \n\t"
980 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
981 "add $16, %%"REG_S" \n\t"
982 "sub $16, %%"REG_D" \n\t"
983 "cmp $512, %%"REG_S" \n\t"
984 " jb 1b \n\t"
985 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
986 : "%"REG_S, "%"REG_D
988 data_ptr+=128;
989 delay_ptr+=128;
990 // window_ptr+=128;
992 __asm__ volatile(
993 "mov $1024, %%"REG_D" \n\t" // 512
994 "xor %%"REG_S", %%"REG_S" \n\t" // 0
995 "movss %3, %%xmm2 \n\t" // bias
996 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
997 ASMALIGN(4)
998 "1: \n\t"
999 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A
1000 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C
1001 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
1002 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
1003 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1004 "mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1005 "addps (%2, %%"REG_S"), %%xmm0 \n\t"
1006 "addps %%xmm2, %%xmm0 \n\t"
1007 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
1008 "add $16, %%"REG_S" \n\t"
1009 "sub $16, %%"REG_D" \n\t"
1010 "cmp $512, %%"REG_S" \n\t"
1011 " jb 1b \n\t"
1012 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1013 : "%"REG_S, "%"REG_D
1015 data_ptr+=128;
1016 // window_ptr+=128;
1018 /* The trailing edge of the window goes into the delay line */
1019 delay_ptr = delay;
1021 __asm__ volatile(
1022 "xor %%"REG_D", %%"REG_D" \n\t" // 0
1023 "xor %%"REG_S", %%"REG_S" \n\t" // 0
1024 ASMALIGN(4)
1025 "1: \n\t"
1026 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A
1027 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C
1028 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
1029 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
1030 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1031 "mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1032 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
1033 "add $16, %%"REG_S" \n\t"
1034 "sub $16, %%"REG_D" \n\t"
1035 "cmp $512, %%"REG_S" \n\t"
1036 " jb 1b \n\t"
1037 :: "r" (buf+64), "r" (delay_ptr)
1038 : "%"REG_S, "%"REG_D
1040 delay_ptr+=128;
1041 // window_ptr-=128;
1043 __asm__ volatile(
1044 "mov $1024, %%"REG_D" \n\t" // 1024
1045 "xor %%"REG_S", %%"REG_S" \n\t" // 0
1046 ASMALIGN(4)
1047 "1: \n\t"
1048 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
1049 "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ?
1050 "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
1051 "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
1052 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1053 "mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1054 "movaps %%xmm0, (%1, %%"REG_S") \n\t"
1055 "add $16, %%"REG_S" \n\t"
1056 "sub $16, %%"REG_D" \n\t"
1057 "cmp $512, %%"REG_S" \n\t"
1058 " jb 1b \n\t"
1059 :: "r" (buf), "r" (delay_ptr)
1060 : "%"REG_S, "%"REG_D
1063 #endif // ARCH_X86 || ARCH_X86_64
1065 void a52_imdct_256(sample_t * data, sample_t * delay, sample_t bias)
1067 int i, k;
1068 sample_t t_r, t_i, a_r, a_i, b_r, b_i, c_r, c_i, d_r, d_i, w_1, w_2;
1069 const sample_t * window = a52_imdct_window;
1070 complex_t buf1[64], buf2[64];
1072 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
1073 for (i = 0; i < 64; i++) {
1074 k = fftorder[i];
1075 t_r = pre2[i].real;
1076 t_i = pre2[i].imag;
1078 buf1[i].real = t_i * data[254-k] + t_r * data[k];
1079 buf1[i].imag = t_r * data[254-k] - t_i * data[k];
1081 buf2[i].real = t_i * data[255-k] + t_r * data[k+1];
1082 buf2[i].imag = t_r * data[255-k] - t_i * data[k+1];
1085 ifft64 (buf1);
1086 ifft64 (buf2);
1088 /* Post IFFT complex multiply */
1089 /* Window and convert to real valued signal */
1090 for (i = 0; i < 32; i++) {
1091 /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
1092 t_r = post2[i].real;
1093 t_i = post2[i].imag;
1095 a_r = t_r * buf1[i].real + t_i * buf1[i].imag;
1096 a_i = t_i * buf1[i].real - t_r * buf1[i].imag;
1097 b_r = t_i * buf1[63-i].real + t_r * buf1[63-i].imag;
1098 b_i = t_r * buf1[63-i].real - t_i * buf1[63-i].imag;
1100 c_r = t_r * buf2[i].real + t_i * buf2[i].imag;
1101 c_i = t_i * buf2[i].real - t_r * buf2[i].imag;
1102 d_r = t_i * buf2[63-i].real + t_r * buf2[63-i].imag;
1103 d_i = t_r * buf2[63-i].real - t_i * buf2[63-i].imag;
1105 w_1 = window[2*i];
1106 w_2 = window[255-2*i];
1107 data[2*i] = delay[2*i] * w_2 - a_r * w_1 + bias;
1108 data[255-2*i] = delay[2*i] * w_1 + a_r * w_2 + bias;
1109 delay[2*i] = c_i;
1111 w_1 = window[128+2*i];
1112 w_2 = window[127-2*i];
1113 data[128+2*i] = delay[127-2*i] * w_2 + a_i * w_1 + bias;
1114 data[127-2*i] = delay[127-2*i] * w_1 - a_i * w_2 + bias;
1115 delay[127-2*i] = c_r;
1117 w_1 = window[2*i+1];
1118 w_2 = window[254-2*i];
1119 data[2*i+1] = delay[2*i+1] * w_2 - b_i * w_1 + bias;
1120 data[254-2*i] = delay[2*i+1] * w_1 + b_i * w_2 + bias;
1121 delay[2*i+1] = d_r;
1123 w_1 = window[129+2*i];
1124 w_2 = window[126-2*i];
1125 data[129+2*i] = delay[126-2*i] * w_2 + b_r * w_1 + bias;
1126 data[126-2*i] = delay[126-2*i] * w_1 - b_r * w_2 + bias;
1127 delay[126-2*i] = d_i;
1131 static double besselI0 (double x)
1133 double bessel = 1;
1134 int i = 100;
1137 bessel = bessel * x / (i * i) + 1;
1138 while (--i);
1139 return bessel;
1142 void a52_imdct_init (uint32_t mm_accel)
1144 int i, j, k;
1145 double sum;
1147 /* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */
1148 sum = 0;
1149 for (i = 0; i < 256; i++) {
1150 sum += besselI0 (i * (256 - i) * (5 * M_PI / 256) * (5 * M_PI / 256));
1151 a52_imdct_window[i] = sum;
1153 sum++;
1154 for (i = 0; i < 256; i++)
1155 a52_imdct_window[i] = sqrt (a52_imdct_window[i] / sum);
1157 for (i = 0; i < 3; i++)
1158 roots16[i] = cos ((M_PI / 8) * (i + 1));
1160 for (i = 0; i < 7; i++)
1161 roots32[i] = cos ((M_PI / 16) * (i + 1));
1163 for (i = 0; i < 15; i++)
1164 roots64[i] = cos ((M_PI / 32) * (i + 1));
1166 for (i = 0; i < 31; i++)
1167 roots128[i] = cos ((M_PI / 64) * (i + 1));
1169 for (i = 0; i < 64; i++) {
1170 k = fftorder[i] / 2 + 64;
1171 pre1[i].real = cos ((M_PI / 256) * (k - 0.25));
1172 pre1[i].imag = sin ((M_PI / 256) * (k - 0.25));
1175 for (i = 64; i < 128; i++) {
1176 k = fftorder[i] / 2 + 64;
1177 pre1[i].real = -cos ((M_PI / 256) * (k - 0.25));
1178 pre1[i].imag = -sin ((M_PI / 256) * (k - 0.25));
1181 for (i = 0; i < 64; i++) {
1182 post1[i].real = cos ((M_PI / 256) * (i + 0.5));
1183 post1[i].imag = sin ((M_PI / 256) * (i + 0.5));
1186 for (i = 0; i < 64; i++) {
1187 k = fftorder[i] / 4;
1188 pre2[i].real = cos ((M_PI / 128) * (k - 0.25));
1189 pre2[i].imag = sin ((M_PI / 128) * (k - 0.25));
1192 for (i = 0; i < 32; i++) {
1193 post2[i].real = cos ((M_PI / 128) * (i + 0.5));
1194 post2[i].imag = sin ((M_PI / 128) * (i + 0.5));
1196 for (i = 0; i < 128; i++) {
1197 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
1198 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
1200 for (i = 0; i < 7; i++) {
1201 j = 1 << i;
1202 for (k = 0; k < j; k++) {
1203 w[i][k].real = cos (-M_PI * k / j);
1204 w[i][k].imag = sin (-M_PI * k / j);
1207 #if ARCH_X86 || ARCH_X86_64
1208 for (i = 0; i < 128; i++) {
1209 sseSinCos1c[2*i+0]= xcos1[i];
1210 sseSinCos1c[2*i+1]= -xcos1[i];
1211 sseSinCos1d[2*i+0]= xsin1[i];
1212 sseSinCos1d[2*i+1]= xsin1[i];
1214 for (i = 1; i < 7; i++) {
1215 j = 1 << i;
1216 for (k = 0; k < j; k+=2) {
1218 sseW[i][4*k + 0] = w[i][k+0].real;
1219 sseW[i][4*k + 1] = w[i][k+0].real;
1220 sseW[i][4*k + 2] = w[i][k+1].real;
1221 sseW[i][4*k + 3] = w[i][k+1].real;
1223 sseW[i][4*k + 4] = -w[i][k+0].imag;
1224 sseW[i][4*k + 5] = w[i][k+0].imag;
1225 sseW[i][4*k + 6] = -w[i][k+1].imag;
1226 sseW[i][4*k + 7] = w[i][k+1].imag;
1228 //we multiply more or less uninitalized numbers so we need to use exactly 0.0
1229 if(k==0)
1231 // sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
1232 sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
1235 if(2*k == j)
1237 sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
1238 // sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
1243 for(i=0; i<128; i++)
1245 sseWindow[2*i+0]= -a52_imdct_window[2*i+0];
1246 sseWindow[2*i+1]= a52_imdct_window[2*i+1];
1249 for(i=0; i<64; i++)
1251 sseWindow[256 + 2*i+0]= -a52_imdct_window[254 - 2*i+1];
1252 sseWindow[256 + 2*i+1]= a52_imdct_window[254 - 2*i+0];
1253 sseWindow[384 + 2*i+0]= a52_imdct_window[126 - 2*i+1];
1254 sseWindow[384 + 2*i+1]= -a52_imdct_window[126 - 2*i+0];
1256 #endif
1257 a52_imdct_512 = imdct_do_512;
1258 ifft128 = ifft128_c;
1259 ifft64 = ifft64_c;
1261 #if ARCH_X86 || ARCH_X86_64
1262 if(mm_accel & MM_ACCEL_X86_SSE)
1264 fprintf (stderr, "Using SSE optimized IMDCT transform\n");
1265 a52_imdct_512 = imdct_do_512_sse;
1267 else
1268 if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
1270 fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
1271 a52_imdct_512 = imdct_do_512_3dnowex;
1273 else
1274 if(mm_accel & MM_ACCEL_X86_3DNOW)
1276 fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
1277 a52_imdct_512 = imdct_do_512_3dnow;
1279 else
1280 #endif // ARCH_X86 || ARCH_X86_64
1281 #if HAVE_ALTIVEC
1282 if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
1284 fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
1285 a52_imdct_512 = imdct_do_512_altivec;
1287 else
1288 #endif
1290 #ifdef LIBA52_DJBFFT
1291 if (mm_accel & MM_ACCEL_DJBFFT) {
1292 fprintf (stderr, "Using djbfft for IMDCT transform\n");
1293 ifft128 = (void (*) (complex_t *)) fftc4_un128;
1294 ifft64 = (void (*) (complex_t *)) fftc4_un64;
1295 } else
1296 #endif
1298 fprintf (stderr, "No accelerated IMDCT transform found\n");