3 * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
6 * The ifft algorithms in this file have been largely inspired by Dan
7 * Bernstein's work, djbfft, available at http://cr.yp.to/djbfft.html
9 * This file is part of a52dec, a free ATSC A-52 stream decoder.
10 * See http://liba52.sourceforge.net/ for updates.
12 * Modified for use with MPlayer, changes contained in liba52_changes.diff.
13 * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/
16 * a52dec is free software; you can redistribute it and/or modify
17 * it under the terms of the GNU General Public License as published by
18 * the Free Software Foundation; either version 2 of the License, or
19 * (at your option) any later version.
21 * a52dec is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU General Public License for more details.
26 * You should have received a copy of the GNU General Public License
27 * along with this program; if not, write to the Free Software
28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
31 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
32 * michael did port them from libac3 (untested, perhaps totally broken)
33 * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
44 #define M_PI 3.1415926535897932384626433832795029
49 #include "a52_internal.h"
53 void (*a52_imdct_512
) (sample_t
* data
, sample_t
* delay
, sample_t bias
);
55 #ifdef RUNTIME_CPUDETECT
56 #undef HAVE_AMD3DNOWEXT
57 #define HAVE_AMD3DNOWEXT 0
60 typedef struct complex_s
{
65 static const int pm128
[128] attribute_used
__attribute__((aligned(16))) =
67 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
68 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
69 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
70 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
71 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
72 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
73 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
74 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
77 static uint8_t attribute_used bit_reverse_512
[] = {
78 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
79 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
80 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
81 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
82 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
83 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
84 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
85 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
86 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
87 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
88 0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
89 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
90 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
91 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
92 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
93 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
95 static uint8_t fftorder
[] = {
96 0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176,
97 8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88,
98 4,132, 68,196, 36,164,228,100, 20,148, 84,212,244,116, 52,180,
99 252,124, 60,188, 28,156,220, 92, 12,140, 76,204,236,108, 44,172,
100 2,130, 66,194, 34,162,226, 98, 18,146, 82,210,242,114, 50,178,
101 10,138, 74,202, 42,170,234,106,250,122, 58,186, 26,154,218, 90,
102 254,126, 62,190, 30,158,222, 94, 14,142, 78,206,238,110, 46,174,
103 6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86
106 static complex_t
__attribute__((aligned(16))) buf
[128];
108 /* Twiddle factor LUT */
109 static complex_t
__attribute__((aligned(16))) w_1
[1];
110 static complex_t
__attribute__((aligned(16))) w_2
[2];
111 static complex_t
__attribute__((aligned(16))) w_4
[4];
112 static complex_t
__attribute__((aligned(16))) w_8
[8];
113 static complex_t
__attribute__((aligned(16))) w_16
[16];
114 static complex_t
__attribute__((aligned(16))) w_32
[32];
115 static complex_t
__attribute__((aligned(16))) w_64
[64];
116 static complex_t
__attribute__((aligned(16))) * w
[7] = {w_1
, w_2
, w_4
, w_8
, w_16
, w_32
, w_64
};
118 /* Twiddle factors for IMDCT */
119 static sample_t
__attribute__((aligned(16))) xcos1
[128];
120 static sample_t
__attribute__((aligned(16))) xsin1
[128];
122 #if ARCH_X86 || ARCH_X86_64
123 // NOTE: SSE needs 16byte alignment or it will segfault
125 static float __attribute__((aligned(16))) sseSinCos1c
[256];
126 static float __attribute__((aligned(16))) sseSinCos1d
[256];
127 static float attribute_used
__attribute__((aligned(16))) ps111_1
[4]={1,1,1,-1};
128 //static float __attribute__((aligned(16))) sseW0[4];
129 static float __attribute__((aligned(16))) sseW1
[8];
130 static float __attribute__((aligned(16))) sseW2
[16];
131 static float __attribute__((aligned(16))) sseW3
[32];
132 static float __attribute__((aligned(16))) sseW4
[64];
133 static float __attribute__((aligned(16))) sseW5
[128];
134 static float __attribute__((aligned(16))) sseW6
[256];
135 static float __attribute__((aligned(16))) *sseW
[7]=
136 {NULL
/*sseW0*/,sseW1
,sseW2
,sseW3
,sseW4
,sseW5
,sseW6
};
137 static float __attribute__((aligned(16))) sseWindow
[512];
140 /* Root values for IFFT */
141 static sample_t roots16
[3];
142 static sample_t roots32
[7];
143 static sample_t roots64
[15];
144 static sample_t roots128
[31];
146 /* Twiddle factors for IMDCT */
147 static complex_t pre1
[128];
148 static complex_t post1
[64];
149 static complex_t pre2
[64];
150 static complex_t post2
[32];
152 static sample_t a52_imdct_window
[256];
154 static void (* ifft128
) (complex_t
* buf
);
155 static void (* ifft64
) (complex_t
* buf
);
157 static inline void ifft2 (complex_t
* buf
)
163 buf
[0].real
+= buf
[1].real
;
164 buf
[0].imag
+= buf
[1].imag
;
165 buf
[1].real
= r
- buf
[1].real
;
166 buf
[1].imag
= i
- buf
[1].imag
;
169 static inline void ifft4 (complex_t
* buf
)
171 double tmp1
, tmp2
, tmp3
, tmp4
, tmp5
, tmp6
, tmp7
, tmp8
;
173 tmp1
= buf
[0].real
+ buf
[1].real
;
174 tmp2
= buf
[3].real
+ buf
[2].real
;
175 tmp3
= buf
[0].imag
+ buf
[1].imag
;
176 tmp4
= buf
[2].imag
+ buf
[3].imag
;
177 tmp5
= buf
[0].real
- buf
[1].real
;
178 tmp6
= buf
[0].imag
- buf
[1].imag
;
179 tmp7
= buf
[2].imag
- buf
[3].imag
;
180 tmp8
= buf
[3].real
- buf
[2].real
;
182 buf
[0].real
= tmp1
+ tmp2
;
183 buf
[0].imag
= tmp3
+ tmp4
;
184 buf
[2].real
= tmp1
- tmp2
;
185 buf
[2].imag
= tmp3
- tmp4
;
186 buf
[1].real
= tmp5
+ tmp7
;
187 buf
[1].imag
= tmp6
+ tmp8
;
188 buf
[3].real
= tmp5
- tmp7
;
189 buf
[3].imag
= tmp6
- tmp8
;
192 /* the basic split-radix ifft butterfly */
194 #define BUTTERFLY(a0,a1,a2,a3,wr,wi) do { \
195 tmp5 = a2.real * wr + a2.imag * wi; \
196 tmp6 = a2.imag * wr - a2.real * wi; \
197 tmp7 = a3.real * wr - a3.imag * wi; \
198 tmp8 = a3.imag * wr + a3.real * wi; \
199 tmp1 = tmp5 + tmp7; \
200 tmp2 = tmp6 + tmp8; \
201 tmp3 = tmp6 - tmp8; \
202 tmp4 = tmp7 - tmp5; \
203 a2.real = a0.real - tmp1; \
204 a2.imag = a0.imag - tmp2; \
205 a3.real = a1.real - tmp3; \
206 a3.imag = a1.imag - tmp4; \
213 /* split-radix ifft butterfly, specialized for wr=1 wi=0 */
215 #define BUTTERFLY_ZERO(a0,a1,a2,a3) do { \
216 tmp1 = a2.real + a3.real; \
217 tmp2 = a2.imag + a3.imag; \
218 tmp3 = a2.imag - a3.imag; \
219 tmp4 = a3.real - a2.real; \
220 a2.real = a0.real - tmp1; \
221 a2.imag = a0.imag - tmp2; \
222 a3.real = a1.real - tmp3; \
223 a3.imag = a1.imag - tmp4; \
230 /* split-radix ifft butterfly, specialized for wr=wi */
232 #define BUTTERFLY_HALF(a0,a1,a2,a3,w) do { \
233 tmp5 = (a2.real + a2.imag) * w; \
234 tmp6 = (a2.imag - a2.real) * w; \
235 tmp7 = (a3.real - a3.imag) * w; \
236 tmp8 = (a3.imag + a3.real) * w; \
237 tmp1 = tmp5 + tmp7; \
238 tmp2 = tmp6 + tmp8; \
239 tmp3 = tmp6 - tmp8; \
240 tmp4 = tmp7 - tmp5; \
241 a2.real = a0.real - tmp1; \
242 a2.imag = a0.imag - tmp2; \
243 a3.real = a1.real - tmp3; \
244 a3.imag = a1.imag - tmp4; \
251 static inline void ifft8 (complex_t
* buf
)
253 double tmp1
, tmp2
, tmp3
, tmp4
, tmp5
, tmp6
, tmp7
, tmp8
;
258 BUTTERFLY_ZERO (buf
[0], buf
[2], buf
[4], buf
[6]);
259 BUTTERFLY_HALF (buf
[1], buf
[3], buf
[5], buf
[7], roots16
[1]);
262 static void ifft_pass (complex_t
* buf
, sample_t
* weight
, int n
)
267 double tmp1
, tmp2
, tmp3
, tmp4
, tmp5
, tmp6
, tmp7
, tmp8
;
275 BUTTERFLY_ZERO (buf
[-1], buf1
[-1], buf2
[-1], buf3
[-1]);
280 BUTTERFLY (buf
[0], buf1
[0], buf2
[0], buf3
[0], weight
[n
], weight
[2*i
]);
289 static void ifft16 (complex_t
* buf
)
294 ifft_pass (buf
, roots16
- 4, 4);
297 static void ifft32 (complex_t
* buf
)
302 ifft_pass (buf
, roots32
- 8, 8);
305 static void ifft64_c (complex_t
* buf
)
310 ifft_pass (buf
, roots64
- 16, 16);
313 static void ifft128_c (complex_t
* buf
)
318 ifft_pass (buf
, roots64
- 16, 16);
322 ifft_pass (buf
, roots128
- 32, 32);
325 void imdct_do_512 (sample_t
* data
, sample_t
* delay
, sample_t bias
)
328 sample_t t_r
, t_i
, a_r
, a_i
, b_r
, b_i
, w_1
, w_2
;
329 const sample_t
* window
= a52_imdct_window
;
332 for (i
= 0; i
< 128; i
++) {
337 buf
[i
].real
= t_i
* data
[255-k
] + t_r
* data
[k
];
338 buf
[i
].imag
= t_r
* data
[255-k
] - t_i
* data
[k
];
343 /* Post IFFT complex multiply plus IFFT complex conjugate*/
344 /* Window and convert to real valued signal */
345 for (i
= 0; i
< 64; i
++) {
346 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
350 a_r
= t_r
* buf
[i
].real
+ t_i
* buf
[i
].imag
;
351 a_i
= t_i
* buf
[i
].real
- t_r
* buf
[i
].imag
;
352 b_r
= t_i
* buf
[127-i
].real
+ t_r
* buf
[127-i
].imag
;
353 b_i
= t_r
* buf
[127-i
].real
- t_i
* buf
[127-i
].imag
;
356 w_2
= window
[255-2*i
];
357 data
[2*i
] = delay
[2*i
] * w_2
- a_r
* w_1
+ bias
;
358 data
[255-2*i
] = delay
[2*i
] * w_1
+ a_r
* w_2
+ bias
;
362 w_2
= window
[254-2*i
];
363 data
[2*i
+1] = delay
[2*i
+1] * w_2
+ b_r
* w_1
+ bias
;
364 data
[254-2*i
] = delay
[2*i
+1] * w_1
- b_r
* w_2
+ bias
;
371 #ifdef HAVE_ALTIVEC_H
375 // used to build registers permutation vectors (vcprm)
376 // the 's' are for words in the _s_econd vector
377 #define WORD_0 0x00,0x01,0x02,0x03
378 #define WORD_1 0x04,0x05,0x06,0x07
379 #define WORD_2 0x08,0x09,0x0a,0x0b
380 #define WORD_3 0x0c,0x0d,0x0e,0x0f
381 #define WORD_s0 0x10,0x11,0x12,0x13
382 #define WORD_s1 0x14,0x15,0x16,0x17
383 #define WORD_s2 0x18,0x19,0x1a,0x1b
384 #define WORD_s3 0x1c,0x1d,0x1e,0x1f
386 #define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
387 #define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
389 #define FOUROF(a) {a,a,a,a}
391 // vcprmle is used to keep the same index as in the SSE version.
392 // it's the same as vcprm, with the index inversed
393 // ('le' is Little Endian)
394 #define vcprmle(a,b,c,d) vcprm(d,c,b,a)
396 // used to build inverse/identity vectors (vcii)
397 // n is _n_egative, p is _p_ositive
403 imdct_do_512_altivec(sample_t data
[],sample_t delay
[], sample_t bias
)
419 sample_t
*window_ptr
;
421 /* 512 IMDCT with source and dest data in 'data' */
423 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
424 for( i
=0; i
< 128; i
++) {
425 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
426 int j
= bit_reverse_512
[i
];
427 buf
[i
].real
= (data
[256-2*j
-1] * xcos1
[j
]) - (data
[2*j
] * xsin1
[j
]);
428 buf
[i
].imag
= -1.0 * ((data
[2*j
] * xcos1
[j
]) + (data
[256-2*j
-1] * xsin1
[j
]));
432 for(i
= 0; i
< 128; i
+= 2) {
434 tmp_a_r
= buf
[i
].real
;
435 tmp_a_i
= buf
[i
].imag
;
436 tmp_b_r
= buf
[i
+1].real
;
437 tmp_b_i
= buf
[i
+1].imag
;
438 buf
[i
].real
= tmp_a_r
+ tmp_b_r
;
439 buf
[i
].imag
= tmp_a_i
+ tmp_b_i
;
440 buf
[i
+1].real
= tmp_a_r
- tmp_b_r
;
441 buf
[i
+1].imag
= tmp_a_i
- tmp_b_i
;
443 vector
float temp
, bufv
;
445 bufv
= vec_ld(i
<< 3, (float*)buf
);
446 temp
= vec_perm(bufv
, bufv
, vcprm(2,3,0,1));
447 bufv
= vec_madd(bufv
, vcii(p
,p
,n
,n
), temp
);
448 vec_st(bufv
, i
<< 3, (float*)buf
);
453 // Note w[1]={{1,0}, {0,-1}}
454 for(i
= 0; i
< 128; i
+= 4) {
456 tmp_a_r
= buf
[i
].real
;
457 tmp_a_i
= buf
[i
].imag
;
458 tmp_b_r
= buf
[i
+2].real
;
459 tmp_b_i
= buf
[i
+2].imag
;
460 buf
[i
].real
= tmp_a_r
+ tmp_b_r
;
461 buf
[i
].imag
= tmp_a_i
+ tmp_b_i
;
462 buf
[i
+2].real
= tmp_a_r
- tmp_b_r
;
463 buf
[i
+2].imag
= tmp_a_i
- tmp_b_i
;
464 tmp_a_r
= buf
[i
+1].real
;
465 tmp_a_i
= buf
[i
+1].imag
;
466 /* WARNING: im <-> re here ! */
467 tmp_b_r
= buf
[i
+3].imag
;
468 tmp_b_i
= buf
[i
+3].real
;
469 buf
[i
+1].real
= tmp_a_r
+ tmp_b_r
;
470 buf
[i
+1].imag
= tmp_a_i
- tmp_b_i
;
471 buf
[i
+3].real
= tmp_a_r
- tmp_b_r
;
472 buf
[i
+3].imag
= tmp_a_i
+ tmp_b_i
;
474 vector
float buf01
, buf23
, temp1
, temp2
;
476 buf01
= vec_ld((i
+ 0) << 3, (float*)buf
);
477 buf23
= vec_ld((i
+ 2) << 3, (float*)buf
);
478 buf23
= vec_perm(buf23
,buf23
,vcprm(0,1,3,2));
480 temp1
= vec_madd(buf23
, vcii(p
,p
,p
,n
), buf01
);
481 temp2
= vec_madd(buf23
, vcii(n
,n
,n
,p
), buf01
);
483 vec_st(temp1
, (i
+ 0) << 3, (float*)buf
);
484 vec_st(temp2
, (i
+ 2) << 3, (float*)buf
);
489 for(i
= 0; i
< 128; i
+= 8) {
491 tmp_a_r
= buf
[i
].real
;
492 tmp_a_i
= buf
[i
].imag
;
493 tmp_b_r
= buf
[i
+4].real
;
494 tmp_b_i
= buf
[i
+4].imag
;
495 buf
[i
].real
= tmp_a_r
+ tmp_b_r
;
496 buf
[i
].imag
= tmp_a_i
+ tmp_b_i
;
497 buf
[i
+4].real
= tmp_a_r
- tmp_b_r
;
498 buf
[i
+4].imag
= tmp_a_i
- tmp_b_i
;
499 tmp_a_r
= buf
[1+i
].real
;
500 tmp_a_i
= buf
[1+i
].imag
;
501 tmp_b_r
= (buf
[i
+5].real
+ buf
[i
+5].imag
) * w
[2][1].real
;
502 tmp_b_i
= (buf
[i
+5].imag
- buf
[i
+5].real
) * w
[2][1].real
;
503 buf
[1+i
].real
= tmp_a_r
+ tmp_b_r
;
504 buf
[1+i
].imag
= tmp_a_i
+ tmp_b_i
;
505 buf
[i
+5].real
= tmp_a_r
- tmp_b_r
;
506 buf
[i
+5].imag
= tmp_a_i
- tmp_b_i
;
507 tmp_a_r
= buf
[i
+2].real
;
508 tmp_a_i
= buf
[i
+2].imag
;
509 /* WARNING re <-> im & sign */
510 tmp_b_r
= buf
[i
+6].imag
;
511 tmp_b_i
= - buf
[i
+6].real
;
512 buf
[i
+2].real
= tmp_a_r
+ tmp_b_r
;
513 buf
[i
+2].imag
= tmp_a_i
+ tmp_b_i
;
514 buf
[i
+6].real
= tmp_a_r
- tmp_b_r
;
515 buf
[i
+6].imag
= tmp_a_i
- tmp_b_i
;
516 tmp_a_r
= buf
[i
+3].real
;
517 tmp_a_i
= buf
[i
+3].imag
;
518 tmp_b_r
= (buf
[i
+7].real
- buf
[i
+7].imag
) * w
[2][3].imag
;
519 tmp_b_i
= (buf
[i
+7].imag
+ buf
[i
+7].real
) * w
[2][3].imag
;
520 buf
[i
+3].real
= tmp_a_r
+ tmp_b_r
;
521 buf
[i
+3].imag
= tmp_a_i
+ tmp_b_i
;
522 buf
[i
+7].real
= tmp_a_r
- tmp_b_r
;
523 buf
[i
+7].imag
= tmp_a_i
- tmp_b_i
;
525 vector
float buf01
, buf23
, buf45
, buf67
;
527 buf01
= vec_ld((i
+ 0) << 3, (float*)buf
);
528 buf23
= vec_ld((i
+ 2) << 3, (float*)buf
);
530 tmp_b_r
= (buf
[i
+5].real
+ buf
[i
+5].imag
) * w
[2][1].real
;
531 tmp_b_i
= (buf
[i
+5].imag
- buf
[i
+5].real
) * w
[2][1].real
;
532 buf
[i
+5].real
= tmp_b_r
;
533 buf
[i
+5].imag
= tmp_b_i
;
534 tmp_b_r
= (buf
[i
+7].real
- buf
[i
+7].imag
) * w
[2][3].imag
;
535 tmp_b_i
= (buf
[i
+7].imag
+ buf
[i
+7].real
) * w
[2][3].imag
;
536 buf
[i
+7].real
= tmp_b_r
;
537 buf
[i
+7].imag
= tmp_b_i
;
539 buf23
= vec_ld((i
+ 2) << 3, (float*)buf
);
540 buf45
= vec_ld((i
+ 4) << 3, (float*)buf
);
541 buf67
= vec_ld((i
+ 6) << 3, (float*)buf
);
542 buf67
= vec_perm(buf67
, buf67
, vcprm(1,0,2,3));
544 vec_st(vec_add(buf01
, buf45
), (i
+ 0) << 3, (float*)buf
);
545 vec_st(vec_madd(buf67
, vcii(p
,n
,p
,p
), buf23
), (i
+ 2) << 3, (float*)buf
);
546 vec_st(vec_sub(buf01
, buf45
), (i
+ 4) << 3, (float*)buf
);
547 vec_st(vec_nmsub(buf67
, vcii(p
,n
,p
,p
), buf23
), (i
+ 6) << 3, (float*)buf
);
551 /* 4-7. iterations */
552 for (m
=3; m
< 7; m
++) {
555 two_m_plus_one
= two_m
<<1;
557 for(i
= 0; i
< 128; i
+= two_m_plus_one
) {
558 for(k
= 0; k
< two_m
; k
+=2) {
562 tmp_a_r
= buf
[p
].real
;
563 tmp_a_i
= buf
[p
].imag
;
565 buf
[q
].real
* w
[m
][k
].real
-
566 buf
[q
].imag
* w
[m
][k
].imag
;
568 buf
[q
].imag
* w
[m
][k
].real
+
569 buf
[q
].real
* w
[m
][k
].imag
;
570 buf
[p
].real
= tmp_a_r
+ tmp_b_r
;
571 buf
[p
].imag
= tmp_a_i
+ tmp_b_i
;
572 buf
[q
].real
= tmp_a_r
- tmp_b_r
;
573 buf
[q
].imag
= tmp_a_i
- tmp_b_i
;
575 tmp_a_r
= buf
[(p
+ 1)].real
;
576 tmp_a_i
= buf
[(p
+ 1)].imag
;
578 buf
[(q
+ 1)].real
* w
[m
][(k
+ 1)].real
-
579 buf
[(q
+ 1)].imag
* w
[m
][(k
+ 1)].imag
;
581 buf
[(q
+ 1)].imag
* w
[m
][(k
+ 1)].real
+
582 buf
[(q
+ 1)].real
* w
[m
][(k
+ 1)].imag
;
583 buf
[(p
+ 1)].real
= tmp_a_r
+ tmp_b_r
;
584 buf
[(p
+ 1)].imag
= tmp_a_i
+ tmp_b_i
;
585 buf
[(q
+ 1)].real
= tmp_a_r
- tmp_b_r
;
586 buf
[(q
+ 1)].imag
= tmp_a_i
- tmp_b_i
;
590 vector
float vecp
, vecq
, vecw
, temp1
, temp2
, temp3
, temp4
;
591 const vector
float vczero
= (const vector
float)FOUROF(0.);
592 // first compute buf[q] and buf[q+1]
593 vecq
= vec_ld(q
<< 3, (float*)buf
);
594 vecw
= vec_ld(0, (float*)&(w
[m
][k
]));
595 temp1
= vec_madd(vecq
, vecw
, vczero
);
596 temp2
= vec_perm(vecq
, vecq
, vcprm(1,0,3,2));
597 temp2
= vec_madd(temp2
, vecw
, vczero
);
598 temp3
= vec_perm(temp1
, temp2
, vcprm(0,s0
,2,s2
));
599 temp4
= vec_perm(temp1
, temp2
, vcprm(1,s1
,3,s3
));
600 vecq
= vec_madd(temp4
, vcii(n
,p
,n
,p
), temp3
);
601 // then butterfly with buf[p] and buf[p+1]
602 vecp
= vec_ld(p
<< 3, (float*)buf
);
604 temp1
= vec_add(vecp
, vecq
);
605 temp2
= vec_sub(vecp
, vecq
);
607 vec_st(temp1
, p
<< 3, (float*)buf
);
608 vec_st(temp2
, q
<< 3, (float*)buf
);
614 /* Post IFFT complex multiply plus IFFT complex conjugate*/
615 for( i
=0; i
< 128; i
+=4) {
616 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
618 tmp_a_r
= buf
[(i
+ 0)].real
;
619 tmp_a_i
= -1.0 * buf
[(i
+ 0)].imag
;
621 (tmp_a_r
* xcos1
[(i
+ 0)]) - (tmp_a_i
* xsin1
[(i
+ 0)]);
623 (tmp_a_r
* xsin1
[(i
+ 0)]) + (tmp_a_i
* xcos1
[(i
+ 0)]);
625 tmp_a_r
= buf
[(i
+ 1)].real
;
626 tmp_a_i
= -1.0 * buf
[(i
+ 1)].imag
;
628 (tmp_a_r
* xcos1
[(i
+ 1)]) - (tmp_a_i
* xsin1
[(i
+ 1)]);
630 (tmp_a_r
* xsin1
[(i
+ 1)]) + (tmp_a_i
* xcos1
[(i
+ 1)]);
632 tmp_a_r
= buf
[(i
+ 2)].real
;
633 tmp_a_i
= -1.0 * buf
[(i
+ 2)].imag
;
635 (tmp_a_r
* xcos1
[(i
+ 2)]) - (tmp_a_i
* xsin1
[(i
+ 2)]);
637 (tmp_a_r
* xsin1
[(i
+ 2)]) + (tmp_a_i
* xcos1
[(i
+ 2)]);
639 tmp_a_r
= buf
[(i
+ 3)].real
;
640 tmp_a_i
= -1.0 * buf
[(i
+ 3)].imag
;
642 (tmp_a_r
* xcos1
[(i
+ 3)]) - (tmp_a_i
* xsin1
[(i
+ 3)]);
644 (tmp_a_r
* xsin1
[(i
+ 3)]) + (tmp_a_i
* xcos1
[(i
+ 3)]);
646 vector
float bufv_0
, bufv_2
, cosv
, sinv
, temp1
, temp2
;
647 vector
float temp0022
, temp1133
, tempCS01
;
648 const vector
float vczero
= (const vector
float)FOUROF(0.);
650 bufv_0
= vec_ld((i
+ 0) << 3, (float*)buf
);
651 bufv_2
= vec_ld((i
+ 2) << 3, (float*)buf
);
653 cosv
= vec_ld(i
<< 2, xcos1
);
654 sinv
= vec_ld(i
<< 2, xsin1
);
656 temp0022
= vec_perm(bufv_0
, bufv_0
, vcprm(0,0,2,2));
657 temp1133
= vec_perm(bufv_0
, bufv_0
, vcprm(1,1,3,3));
658 tempCS01
= vec_perm(cosv
, sinv
, vcprm(0,s0
,1,s1
));
659 temp1
= vec_madd(temp0022
, tempCS01
, vczero
);
660 tempCS01
= vec_perm(cosv
, sinv
, vcprm(s0
,0,s1
,1));
661 temp2
= vec_madd(temp1133
, tempCS01
, vczero
);
662 bufv_0
= vec_madd(temp2
, vcii(p
,n
,p
,n
), temp1
);
664 vec_st(bufv_0
, (i
+ 0) << 3, (float*)buf
);
666 /* idem with bufv_2 and high-order cosv/sinv */
668 temp0022
= vec_perm(bufv_2
, bufv_2
, vcprm(0,0,2,2));
669 temp1133
= vec_perm(bufv_2
, bufv_2
, vcprm(1,1,3,3));
670 tempCS01
= vec_perm(cosv
, sinv
, vcprm(2,s2
,3,s3
));
671 temp1
= vec_madd(temp0022
, tempCS01
, vczero
);
672 tempCS01
= vec_perm(cosv
, sinv
, vcprm(s2
,2,s3
,3));
673 temp2
= vec_madd(temp1133
, tempCS01
, vczero
);
674 bufv_2
= vec_madd(temp2
, vcii(p
,n
,p
,n
), temp1
);
676 vec_st(bufv_2
, (i
+ 2) << 3, (float*)buf
);
683 window_ptr
= a52_imdct_window
;
685 /* Window and convert to real valued signal */
686 for(i
=0; i
< 64; i
++) {
687 *data_ptr
++ = -buf
[64+i
].imag
* *window_ptr
++ + *delay_ptr
++ + bias
;
688 *data_ptr
++ = buf
[64-i
-1].real
* *window_ptr
++ + *delay_ptr
++ + bias
;
691 for(i
=0; i
< 64; i
++) {
692 *data_ptr
++ = -buf
[i
].real
* *window_ptr
++ + *delay_ptr
++ + bias
;
693 *data_ptr
++ = buf
[128-i
-1].imag
* *window_ptr
++ + *delay_ptr
++ + bias
;
696 /* The trailing edge of the window goes into the delay line */
699 for(i
=0; i
< 64; i
++) {
700 *delay_ptr
++ = -buf
[64+i
].real
* *--window_ptr
;
701 *delay_ptr
++ = buf
[64-i
-1].imag
* *--window_ptr
;
704 for(i
=0; i
<64; i
++) {
705 *delay_ptr
++ = buf
[i
].imag
* *--window_ptr
;
706 *delay_ptr
++ = -buf
[128-i
-1].real
* *--window_ptr
;
712 // Stuff below this line is borrowed from libac3
714 #if ARCH_X86 || ARCH_X86_64
716 #define HAVE_AMD3DNOW 1
717 #include "srfftp_3dnow.h"
719 const i_cmplx_t x_plus_minus_3dnow
__attribute__ ((aligned (8))) = {{ 0x00000000UL
, 0x80000000UL
}};
720 const i_cmplx_t x_minus_plus_3dnow
__attribute__ ((aligned (8))) = {{ 0x80000000UL
, 0x00000000UL
}};
721 const complex_t HSQRT2_3DNOW
__attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
723 #undef HAVE_AMD3DNOWEXT
724 #define HAVE_AMD3DNOWEXT 0
725 #include "imdct_3dnow.h"
726 #undef HAVE_AMD3DNOWEXT
727 #define HAVE_AMD3DNOWEXT 1
728 #include "imdct_3dnow.h"
731 imdct_do_512_sse(sample_t data
[],sample_t delay
[], sample_t bias
)
738 long two_m_plus_one_shl3
;
739 complex_t
*buf_offset
;
748 sample_t
*window_ptr
;
750 /* 512 IMDCT with source and dest data in 'data' */
751 /* see the c version (dct_do_512()), its allmost identical, just in C */
753 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
754 /* Bit reversed shuffling */
756 "xor %%"REG_S
", %%"REG_S
" \n\t"
757 "lea "MANGLE(bit_reverse_512
)", %%"REG_a
"\n\t"
758 "mov $1008, %%"REG_D
" \n\t"
759 "push %%"REG_BP
" \n\t" //use ebp without telling gcc
762 "movlps (%0, %%"REG_S
"), %%xmm0 \n\t" // XXXI
763 "movhps 8(%0, %%"REG_D
"), %%xmm0 \n\t" // RXXI
764 "movlps 8(%0, %%"REG_S
"), %%xmm1 \n\t" // XXXi
765 "movhps (%0, %%"REG_D
"), %%xmm1 \n\t" // rXXi
766 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR
767 "movaps "MANGLE(sseSinCos1c
)"(%%"REG_S
"), %%xmm2\n\t"
768 "mulps %%xmm0, %%xmm2 \n\t"
769 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI
770 "mulps "MANGLE(sseSinCos1d
)"(%%"REG_S
"), %%xmm0\n\t"
771 "subps %%xmm0, %%xmm2 \n\t"
772 "movzb (%%"REG_a
"), %%"REG_d
" \n\t"
773 "movzb 1(%%"REG_a
"), %%"REG_BP
" \n\t"
774 "movlps %%xmm2, (%1, %%"REG_d
", 8) \n\t"
775 "movhps %%xmm2, (%1, %%"REG_BP
", 8) \n\t"
776 "add $16, %%"REG_S
" \n\t"
777 "add $2, %%"REG_a
" \n\t" // avoid complex addressing for P4 crap
778 "sub $16, %%"REG_D
" \n\t"
780 "pop %%"REG_BP
" \n\t"//no we didnt touch ebp *g*
781 :: "b" (data
), "c" (buf
)
782 : "%"REG_S
, "%"REG_D
, "%"REG_a
, "%"REG_d
787 /* unoptimized variant
788 for (m=1; m < 7; m++) {
794 two_m_plus_one = (1 << (m+1));
796 for(i = 0; i < 128; i += two_m_plus_one) {
797 for(k = 0; k < two_m; k++) {
800 tmp_a_r = buf[p].real;
801 tmp_a_i = buf[p].imag;
802 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
803 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
804 buf[p].real = tmp_a_r + tmp_b_r;
805 buf[p].imag = tmp_a_i + tmp_b_i;
806 buf[q].real = tmp_a_r - tmp_b_r;
807 buf[q].imag = tmp_a_i - tmp_b_i;
814 // Note w[0][0]={1,0}
816 "xorps %%xmm1, %%xmm1 \n\t"
817 "xorps %%xmm2, %%xmm2 \n\t"
818 "mov %0, %%"REG_S
" \n\t"
821 "movlps (%%"REG_S
"), %%xmm0\n\t" //buf[p]
822 "movlps 8(%%"REG_S
"), %%xmm1\n\t" //buf[q]
823 "movhps (%%"REG_S
"), %%xmm0\n\t" //buf[p]
824 "movhps 8(%%"REG_S
"), %%xmm2\n\t" //buf[q]
825 "addps %%xmm1, %%xmm0 \n\t"
826 "subps %%xmm2, %%xmm0 \n\t"
827 "movaps %%xmm0, (%%"REG_S
")\n\t"
828 "add $16, %%"REG_S
" \n\t"
829 "cmp %1, %%"REG_S
" \n\t"
831 :: "g" (buf
), "r" (buf
+ 128)
836 // Note w[1]={{1,0}, {0,-1}}
838 "movaps "MANGLE(ps111_1
)", %%xmm7\n\t" // 1,1,1,-1
839 "mov %0, %%"REG_S
" \n\t"
842 "movaps 16(%%"REG_S
"), %%xmm2 \n\t" //r2,i2,r3,i3
843 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3
844 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3
845 "movaps (%%"REG_S
"), %%xmm0 \n\t" //r0,i0,r1,i1
846 "movaps (%%"REG_S
"), %%xmm1 \n\t" //r0,i0,r1,i1
847 "addps %%xmm2, %%xmm0 \n\t"
848 "subps %%xmm2, %%xmm1 \n\t"
849 "movaps %%xmm0, (%%"REG_S
") \n\t"
850 "movaps %%xmm1, 16(%%"REG_S
") \n\t"
851 "add $32, %%"REG_S
" \n\t"
852 "cmp %1, %%"REG_S
" \n\t"
854 :: "g" (buf
), "r" (buf
+ 128)
860 Note sseW2+0={1,1,sqrt(2),sqrt(2))
861 Note sseW2+16={0,0,sqrt(2),-sqrt(2))
862 Note sseW2+32={0,0,-sqrt(2),-sqrt(2))
863 Note sseW2+48={1,-1,sqrt(2),-sqrt(2))
866 "movaps 48+"MANGLE(sseW2
)", %%xmm6\n\t"
867 "movaps 16+"MANGLE(sseW2
)", %%xmm7\n\t"
868 "xorps %%xmm5, %%xmm5 \n\t"
869 "xorps %%xmm2, %%xmm2 \n\t"
870 "mov %0, %%"REG_S
" \n\t"
873 "movaps 32(%%"REG_S
"), %%xmm2 \n\t" //r4,i4,r5,i5
874 "movaps 48(%%"REG_S
"), %%xmm3 \n\t" //r6,i6,r7,i7
875 "movaps "MANGLE(sseW2
)", %%xmm4 \n\t" //r4,i4,r5,i5
876 "movaps 32+"MANGLE(sseW2
)", %%xmm5\n\t" //r6,i6,r7,i7
877 "mulps %%xmm2, %%xmm4 \n\t"
878 "mulps %%xmm3, %%xmm5 \n\t"
879 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5
880 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7
881 "mulps %%xmm6, %%xmm3 \n\t"
882 "mulps %%xmm7, %%xmm2 \n\t"
883 "movaps (%%"REG_S
"), %%xmm0 \n\t" //r0,i0,r1,i1
884 "movaps 16(%%"REG_S
"), %%xmm1 \n\t" //r2,i2,r3,i3
885 "addps %%xmm4, %%xmm2 \n\t"
886 "addps %%xmm5, %%xmm3 \n\t"
887 "movaps %%xmm2, %%xmm4 \n\t"
888 "movaps %%xmm3, %%xmm5 \n\t"
889 "addps %%xmm0, %%xmm2 \n\t"
890 "addps %%xmm1, %%xmm3 \n\t"
891 "subps %%xmm4, %%xmm0 \n\t"
892 "subps %%xmm5, %%xmm1 \n\t"
893 "movaps %%xmm2, (%%"REG_S
") \n\t"
894 "movaps %%xmm3, 16(%%"REG_S
") \n\t"
895 "movaps %%xmm0, 32(%%"REG_S
") \n\t"
896 "movaps %%xmm1, 48(%%"REG_S
") \n\t"
897 "add $64, %%"REG_S
" \n\t"
898 "cmp %1, %%"REG_S
" \n\t"
900 :: "g" (buf
), "r" (buf
+ 128)
904 /* 4-7. iterations */
905 for (m
=3; m
< 7; m
++) {
907 two_m_plus_one
= two_m
<<1;
908 two_m_plus_one_shl3
= (two_m_plus_one
<<3);
909 buf_offset
= buf
+128;
911 "mov %0, %%"REG_S
" \n\t"
914 "xor %%"REG_D
", %%"REG_D
" \n\t" // k
915 "lea (%%"REG_S
", %3), %%"REG_d
" \n\t"
917 "movaps (%%"REG_d
", %%"REG_D
"), %%xmm1 \n\t"
918 "movaps (%4, %%"REG_D
", 2), %%xmm2 \n\t"
919 "mulps %%xmm1, %%xmm2 \n\t"
920 "shufps $0xB1, %%xmm1, %%xmm1 \n\t"
921 "mulps 16(%4, %%"REG_D
", 2), %%xmm1 \n\t"
922 "movaps (%%"REG_S
", %%"REG_D
"), %%xmm0 \n\t"
923 "addps %%xmm2, %%xmm1 \n\t"
924 "movaps %%xmm1, %%xmm2 \n\t"
925 "addps %%xmm0, %%xmm1 \n\t"
926 "subps %%xmm2, %%xmm0 \n\t"
927 "movaps %%xmm1, (%%"REG_S
", %%"REG_D
") \n\t"
928 "movaps %%xmm0, (%%"REG_d
", %%"REG_D
") \n\t"
929 "add $16, %%"REG_D
" \n\t"
930 "cmp %3, %%"REG_D
" \n\t" //FIXME (opt) count against 0
932 "add %2, %%"REG_S
" \n\t"
933 "cmp %1, %%"REG_S
" \n\t"
935 :: "g" (buf
), "m" (buf_offset
), "m" (two_m_plus_one_shl3
), "r" (two_m
<<3),
937 : "%"REG_S
, "%"REG_D
, "%"REG_d
941 /* Post IFFT complex multiply plus IFFT complex conjugate*/
943 "mov $-1024, %%"REG_S
" \n\t"
946 "movaps (%0, %%"REG_S
"), %%xmm0 \n\t"
947 "movaps (%0, %%"REG_S
"), %%xmm1 \n\t"
948 "shufps $0xB1, %%xmm0, %%xmm0 \n\t"
949 "mulps 1024+"MANGLE(sseSinCos1c
)"(%%"REG_S
"), %%xmm1\n\t"
950 "mulps 1024+"MANGLE(sseSinCos1d
)"(%%"REG_S
"), %%xmm0\n\t"
951 "addps %%xmm1, %%xmm0 \n\t"
952 "movaps %%xmm0, (%0, %%"REG_S
") \n\t"
953 "add $16, %%"REG_S
" \n\t"
962 window_ptr
= a52_imdct_window
;
964 /* Window and convert to real valued signal */
966 "xor %%"REG_D
", %%"REG_D
" \n\t" // 0
967 "xor %%"REG_S
", %%"REG_S
" \n\t" // 0
968 "movss %3, %%xmm2 \n\t" // bias
969 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
972 "movlps (%0, %%"REG_S
"), %%xmm0 \n\t" // ? ? A ?
973 "movlps 8(%0, %%"REG_S
"), %%xmm1 \n\t" // ? ? C ?
974 "movhps -16(%0, %%"REG_D
"), %%xmm1 \n\t" // ? D C ?
975 "movhps -8(%0, %%"REG_D
"), %%xmm0 \n\t" // ? B A ?
976 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
977 "mulps "MANGLE(sseWindow
)"(%%"REG_S
"), %%xmm0\n\t"
978 "addps (%2, %%"REG_S
"), %%xmm0 \n\t"
979 "addps %%xmm2, %%xmm0 \n\t"
980 "movaps %%xmm0, (%1, %%"REG_S
") \n\t"
981 "add $16, %%"REG_S
" \n\t"
982 "sub $16, %%"REG_D
" \n\t"
983 "cmp $512, %%"REG_S
" \n\t"
985 :: "r" (buf
+64), "r" (data_ptr
), "r" (delay_ptr
), "m" (bias
)
993 "mov $1024, %%"REG_D
" \n\t" // 512
994 "xor %%"REG_S
", %%"REG_S
" \n\t" // 0
995 "movss %3, %%xmm2 \n\t" // bias
996 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
999 "movlps (%0, %%"REG_S
"), %%xmm0 \n\t" // ? ? ? A
1000 "movlps 8(%0, %%"REG_S
"), %%xmm1 \n\t" // ? ? ? C
1001 "movhps -16(%0, %%"REG_D
"), %%xmm1 \n\t" // D ? ? C
1002 "movhps -8(%0, %%"REG_D
"), %%xmm0 \n\t" // B ? ? A
1003 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1004 "mulps 512+"MANGLE(sseWindow
)"(%%"REG_S
"), %%xmm0\n\t"
1005 "addps (%2, %%"REG_S
"), %%xmm0 \n\t"
1006 "addps %%xmm2, %%xmm0 \n\t"
1007 "movaps %%xmm0, (%1, %%"REG_S
") \n\t"
1008 "add $16, %%"REG_S
" \n\t"
1009 "sub $16, %%"REG_D
" \n\t"
1010 "cmp $512, %%"REG_S
" \n\t"
1012 :: "r" (buf
), "r" (data_ptr
), "r" (delay_ptr
), "m" (bias
)
1013 : "%"REG_S
, "%"REG_D
1018 /* The trailing edge of the window goes into the delay line */
1022 "xor %%"REG_D
", %%"REG_D
" \n\t" // 0
1023 "xor %%"REG_S
", %%"REG_S
" \n\t" // 0
1026 "movlps (%0, %%"REG_S
"), %%xmm0 \n\t" // ? ? ? A
1027 "movlps 8(%0, %%"REG_S
"), %%xmm1 \n\t" // ? ? ? C
1028 "movhps -16(%0, %%"REG_D
"), %%xmm1 \n\t" // D ? ? C
1029 "movhps -8(%0, %%"REG_D
"), %%xmm0 \n\t" // B ? ? A
1030 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1031 "mulps 1024+"MANGLE(sseWindow
)"(%%"REG_S
"), %%xmm0\n\t"
1032 "movaps %%xmm0, (%1, %%"REG_S
") \n\t"
1033 "add $16, %%"REG_S
" \n\t"
1034 "sub $16, %%"REG_D
" \n\t"
1035 "cmp $512, %%"REG_S
" \n\t"
1037 :: "r" (buf
+64), "r" (delay_ptr
)
1038 : "%"REG_S
, "%"REG_D
1044 "mov $1024, %%"REG_D
" \n\t" // 1024
1045 "xor %%"REG_S
", %%"REG_S
" \n\t" // 0
1048 "movlps (%0, %%"REG_S
"), %%xmm0 \n\t" // ? ? A ?
1049 "movlps 8(%0, %%"REG_S
"), %%xmm1 \n\t" // ? ? C ?
1050 "movhps -16(%0, %%"REG_D
"), %%xmm1 \n\t" // ? D C ?
1051 "movhps -8(%0, %%"REG_D
"), %%xmm0 \n\t" // ? B A ?
1052 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1053 "mulps 1536+"MANGLE(sseWindow
)"(%%"REG_S
"), %%xmm0\n\t"
1054 "movaps %%xmm0, (%1, %%"REG_S
") \n\t"
1055 "add $16, %%"REG_S
" \n\t"
1056 "sub $16, %%"REG_D
" \n\t"
1057 "cmp $512, %%"REG_S
" \n\t"
1059 :: "r" (buf
), "r" (delay_ptr
)
1060 : "%"REG_S
, "%"REG_D
1063 #endif // ARCH_X86 || ARCH_X86_64
1065 void a52_imdct_256(sample_t
* data
, sample_t
* delay
, sample_t bias
)
1068 sample_t t_r
, t_i
, a_r
, a_i
, b_r
, b_i
, c_r
, c_i
, d_r
, d_i
, w_1
, w_2
;
1069 const sample_t
* window
= a52_imdct_window
;
1070 complex_t buf1
[64], buf2
[64];
1072 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
1073 for (i
= 0; i
< 64; i
++) {
1078 buf1
[i
].real
= t_i
* data
[254-k
] + t_r
* data
[k
];
1079 buf1
[i
].imag
= t_r
* data
[254-k
] - t_i
* data
[k
];
1081 buf2
[i
].real
= t_i
* data
[255-k
] + t_r
* data
[k
+1];
1082 buf2
[i
].imag
= t_r
* data
[255-k
] - t_i
* data
[k
+1];
1088 /* Post IFFT complex multiply */
1089 /* Window and convert to real valued signal */
1090 for (i
= 0; i
< 32; i
++) {
1091 /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
1092 t_r
= post2
[i
].real
;
1093 t_i
= post2
[i
].imag
;
1095 a_r
= t_r
* buf1
[i
].real
+ t_i
* buf1
[i
].imag
;
1096 a_i
= t_i
* buf1
[i
].real
- t_r
* buf1
[i
].imag
;
1097 b_r
= t_i
* buf1
[63-i
].real
+ t_r
* buf1
[63-i
].imag
;
1098 b_i
= t_r
* buf1
[63-i
].real
- t_i
* buf1
[63-i
].imag
;
1100 c_r
= t_r
* buf2
[i
].real
+ t_i
* buf2
[i
].imag
;
1101 c_i
= t_i
* buf2
[i
].real
- t_r
* buf2
[i
].imag
;
1102 d_r
= t_i
* buf2
[63-i
].real
+ t_r
* buf2
[63-i
].imag
;
1103 d_i
= t_r
* buf2
[63-i
].real
- t_i
* buf2
[63-i
].imag
;
1106 w_2
= window
[255-2*i
];
1107 data
[2*i
] = delay
[2*i
] * w_2
- a_r
* w_1
+ bias
;
1108 data
[255-2*i
] = delay
[2*i
] * w_1
+ a_r
* w_2
+ bias
;
1111 w_1
= window
[128+2*i
];
1112 w_2
= window
[127-2*i
];
1113 data
[128+2*i
] = delay
[127-2*i
] * w_2
+ a_i
* w_1
+ bias
;
1114 data
[127-2*i
] = delay
[127-2*i
] * w_1
- a_i
* w_2
+ bias
;
1115 delay
[127-2*i
] = c_r
;
1117 w_1
= window
[2*i
+1];
1118 w_2
= window
[254-2*i
];
1119 data
[2*i
+1] = delay
[2*i
+1] * w_2
- b_i
* w_1
+ bias
;
1120 data
[254-2*i
] = delay
[2*i
+1] * w_1
+ b_i
* w_2
+ bias
;
1123 w_1
= window
[129+2*i
];
1124 w_2
= window
[126-2*i
];
1125 data
[129+2*i
] = delay
[126-2*i
] * w_2
+ b_r
* w_1
+ bias
;
1126 data
[126-2*i
] = delay
[126-2*i
] * w_1
- b_r
* w_2
+ bias
;
1127 delay
[126-2*i
] = d_i
;
1131 static double besselI0 (double x
)
1137 bessel
= bessel
* x
/ (i
* i
) + 1;
1142 void a52_imdct_init (uint32_t mm_accel
)
1147 /* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */
1149 for (i
= 0; i
< 256; i
++) {
1150 sum
+= besselI0 (i
* (256 - i
) * (5 * M_PI
/ 256) * (5 * M_PI
/ 256));
1151 a52_imdct_window
[i
] = sum
;
1154 for (i
= 0; i
< 256; i
++)
1155 a52_imdct_window
[i
] = sqrt (a52_imdct_window
[i
] / sum
);
1157 for (i
= 0; i
< 3; i
++)
1158 roots16
[i
] = cos ((M_PI
/ 8) * (i
+ 1));
1160 for (i
= 0; i
< 7; i
++)
1161 roots32
[i
] = cos ((M_PI
/ 16) * (i
+ 1));
1163 for (i
= 0; i
< 15; i
++)
1164 roots64
[i
] = cos ((M_PI
/ 32) * (i
+ 1));
1166 for (i
= 0; i
< 31; i
++)
1167 roots128
[i
] = cos ((M_PI
/ 64) * (i
+ 1));
1169 for (i
= 0; i
< 64; i
++) {
1170 k
= fftorder
[i
] / 2 + 64;
1171 pre1
[i
].real
= cos ((M_PI
/ 256) * (k
- 0.25));
1172 pre1
[i
].imag
= sin ((M_PI
/ 256) * (k
- 0.25));
1175 for (i
= 64; i
< 128; i
++) {
1176 k
= fftorder
[i
] / 2 + 64;
1177 pre1
[i
].real
= -cos ((M_PI
/ 256) * (k
- 0.25));
1178 pre1
[i
].imag
= -sin ((M_PI
/ 256) * (k
- 0.25));
1181 for (i
= 0; i
< 64; i
++) {
1182 post1
[i
].real
= cos ((M_PI
/ 256) * (i
+ 0.5));
1183 post1
[i
].imag
= sin ((M_PI
/ 256) * (i
+ 0.5));
1186 for (i
= 0; i
< 64; i
++) {
1187 k
= fftorder
[i
] / 4;
1188 pre2
[i
].real
= cos ((M_PI
/ 128) * (k
- 0.25));
1189 pre2
[i
].imag
= sin ((M_PI
/ 128) * (k
- 0.25));
1192 for (i
= 0; i
< 32; i
++) {
1193 post2
[i
].real
= cos ((M_PI
/ 128) * (i
+ 0.5));
1194 post2
[i
].imag
= sin ((M_PI
/ 128) * (i
+ 0.5));
1196 for (i
= 0; i
< 128; i
++) {
1197 xcos1
[i
] = -cos ((M_PI
/ 2048) * (8 * i
+ 1));
1198 xsin1
[i
] = -sin ((M_PI
/ 2048) * (8 * i
+ 1));
1200 for (i
= 0; i
< 7; i
++) {
1202 for (k
= 0; k
< j
; k
++) {
1203 w
[i
][k
].real
= cos (-M_PI
* k
/ j
);
1204 w
[i
][k
].imag
= sin (-M_PI
* k
/ j
);
1207 #if ARCH_X86 || ARCH_X86_64
1208 for (i
= 0; i
< 128; i
++) {
1209 sseSinCos1c
[2*i
+0]= xcos1
[i
];
1210 sseSinCos1c
[2*i
+1]= -xcos1
[i
];
1211 sseSinCos1d
[2*i
+0]= xsin1
[i
];
1212 sseSinCos1d
[2*i
+1]= xsin1
[i
];
1214 for (i
= 1; i
< 7; i
++) {
1216 for (k
= 0; k
< j
; k
+=2) {
1218 sseW
[i
][4*k
+ 0] = w
[i
][k
+0].real
;
1219 sseW
[i
][4*k
+ 1] = w
[i
][k
+0].real
;
1220 sseW
[i
][4*k
+ 2] = w
[i
][k
+1].real
;
1221 sseW
[i
][4*k
+ 3] = w
[i
][k
+1].real
;
1223 sseW
[i
][4*k
+ 4] = -w
[i
][k
+0].imag
;
1224 sseW
[i
][4*k
+ 5] = w
[i
][k
+0].imag
;
1225 sseW
[i
][4*k
+ 6] = -w
[i
][k
+1].imag
;
1226 sseW
[i
][4*k
+ 7] = w
[i
][k
+1].imag
;
1228 //we multiply more or less uninitalized numbers so we need to use exactly 0.0
1231 // sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
1232 sseW
[i
][4*k
+ 4]= sseW
[i
][4*k
+ 5]= 0.0;
1237 sseW
[i
][4*k
+ 0]= sseW
[i
][4*k
+ 1]= 0.0;
1238 // sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
1243 for(i
=0; i
<128; i
++)
1245 sseWindow
[2*i
+0]= -a52_imdct_window
[2*i
+0];
1246 sseWindow
[2*i
+1]= a52_imdct_window
[2*i
+1];
1251 sseWindow
[256 + 2*i
+0]= -a52_imdct_window
[254 - 2*i
+1];
1252 sseWindow
[256 + 2*i
+1]= a52_imdct_window
[254 - 2*i
+0];
1253 sseWindow
[384 + 2*i
+0]= a52_imdct_window
[126 - 2*i
+1];
1254 sseWindow
[384 + 2*i
+1]= -a52_imdct_window
[126 - 2*i
+0];
1257 a52_imdct_512
= imdct_do_512
;
1258 ifft128
= ifft128_c
;
1261 #if ARCH_X86 || ARCH_X86_64
1262 if(mm_accel
& MM_ACCEL_X86_SSE
)
1264 fprintf (stderr
, "Using SSE optimized IMDCT transform\n");
1265 a52_imdct_512
= imdct_do_512_sse
;
1268 if(mm_accel
& MM_ACCEL_X86_3DNOWEXT
)
1270 fprintf (stderr
, "Using 3DNowEx optimized IMDCT transform\n");
1271 a52_imdct_512
= imdct_do_512_3dnowex
;
1274 if(mm_accel
& MM_ACCEL_X86_3DNOW
)
1276 fprintf (stderr
, "Using 3DNow optimized IMDCT transform\n");
1277 a52_imdct_512
= imdct_do_512_3dnow
;
1280 #endif // ARCH_X86 || ARCH_X86_64
1282 if (mm_accel
& MM_ACCEL_PPC_ALTIVEC
)
1284 fprintf(stderr
, "Using AltiVec optimized IMDCT transform\n");
1285 a52_imdct_512
= imdct_do_512_altivec
;
1290 #ifdef LIBA52_DJBFFT
1291 if (mm_accel
& MM_ACCEL_DJBFFT
) {
1292 fprintf (stderr
, "Using djbfft for IMDCT transform\n");
1293 ifft128
= (void (*) (complex_t
*)) fftc4_un128
;
1294 ifft64
= (void (*) (complex_t
*)) fftc4_un64
;
1298 fprintf (stderr
, "No accelerated IMDCT transform found\n");