3 * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
6 * This file is part of a52dec, a free ATSC A-52 stream decoder.
7 * See http://liba52.sourceforge.net/ for updates.
9 * Modified for use with MPlayer, changes contained in liba52_changes.diff.
10 * detailed CVS changelog at http://www.mplayerhq.hu/cgi-bin/cvsweb.cgi/main/
13 * a52dec is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
18 * a52dec is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
28 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
29 * michael did port them from libac3 (untested, perhaps totally broken)
30 * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
/* Fallback definition; guarded so we do not clash with a math.h that
 * already provides M_PI (redefining it unconditionally is a warning or
 * error on some toolchains). */
#ifndef M_PI
#define M_PI 3.1415926535897932384626433832795029
#endif
43 #include "a52_internal.h"
47 #ifdef RUNTIME_CPUDETECT
/* Function pointers to the 256- and 512-sample IMDCT implementations.
 * With RUNTIME_CPUDETECT these are presumably filled in at init time with
 * the best routine for the host CPU (C / SSE / 3DNow! / AltiVec variants
 * appear below) — the init code is not in this chunk, confirm there. */
void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias);
void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias);
56 typedef struct complex_s
{
/* Forward declaration of the 128-point complex FFT helper (its definition
 * is not visible in this chunk). */
static void fft_128p(complex_t *a);
63 static const int pm128
[128] attribute_used
__attribute__((aligned(16))) =
65 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
66 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
67 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
68 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
69 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
70 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
71 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
72 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
/* 128 point bit-reverse LUT */
/* bit_reverse_512[i] is i with its low 7 bits reversed (values 0x00-0x7f).
 * Used below to reorder the FFT input of the 512-sample IMDCT
 * (j = bit_reverse_512[i] in the pre-IFFT loop). */
static uint8_t attribute_used bit_reverse_512[] = {
	0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
	0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
	0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
	0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
	0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
	0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
	0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
	0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
	0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
	0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
	0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
	0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
	0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
	0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
	0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
	0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
/* 64-entry bit-reverse LUT: bit_reverse_256[i] is i with its low 6 bits
 * reversed (values 0x00-0x3f), presumably the input reordering for the
 * 256-sample IMDCT (its body is not fully visible in this chunk). */
static uint8_t bit_reverse_256[] = {
    0x00, 0x20, 0x10, 0x30, 0x08, 0x28, 0x18, 0x38, 0x04, 0x24, 0x14, 0x34, 0x0c, 0x2c, 0x1c, 0x3c,
    0x02, 0x22, 0x12, 0x32, 0x0a, 0x2a, 0x1a, 0x3a, 0x06, 0x26, 0x16, 0x36, 0x0e, 0x2e, 0x1e, 0x3e,
    0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39, 0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d,
    0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b, 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f
};
#if defined(ARCH_X86) || defined(ARCH_X86_64)
// NOTE: SSE needs 16byte alignment or it will segfault
/* Scratch buffer for the 128 complex FFT values of one transform. */
static complex_t __attribute__((aligned(16))) buf[128];
/* Interleaved sin/cos tables referenced by the SSE pre/post-multiply asm
 * below; presumably filled at init time — the init code is not in this
 * chunk, confirm there. */
static float __attribute__((aligned(16))) sseSinCos1c[256];
static float __attribute__((aligned(16))) sseSinCos1d[256];
/* Constant {1,1,1,-1}, used as a sign pattern by the SSE butterfly asm. */
static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
//static float __attribute__((aligned(16))) sseW0[4];
/* Per-pass twiddle-factor tables for FFT passes 1..6; sseW[m] selects the
 * table for pass m (pass 0 needs none, hence the NULL slot). */
static float __attribute__((aligned(16))) sseW1[8];
static float __attribute__((aligned(16))) sseW2[16];
static float __attribute__((aligned(16))) sseW3[32];
static float __attribute__((aligned(16))) sseW4[64];
static float __attribute__((aligned(16))) sseW5[128];
static float __attribute__((aligned(16))) sseW6[256];
static float __attribute__((aligned(16))) *sseW[7]=
	{NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
/* Window coefficients laid out for the SSE windowing loops (mulps against
 * sseWindow / 512+sseWindow / 1024+sseWindow / 1536+sseWindow below). */
static float __attribute__((aligned(16))) sseWindow[512];
/* NOTE(review): duplicate definition of 'buf' — in the upstream source the
 * second copy lives in the non-x86 preprocessor branch; the intervening
 * #else appears to have been lost in this copy of the file. Confirm
 * against upstream liba52 imdct.c before building. */
static complex_t __attribute__((aligned(16))) buf[128];
/* Twiddle factor LUT */
/* w_N holds N complex twiddle factors; w[m] indexes the table for FFT
 * pass m (used as w[m][k] in the radix-2 butterflies below). Values are
 * presumably filled at init time — init code not in this chunk. */
static complex_t __attribute__((aligned(16))) w_1[1];
static complex_t __attribute__((aligned(16))) w_2[2];
static complex_t __attribute__((aligned(16))) w_4[4];
static complex_t __attribute__((aligned(16))) w_8[8];
static complex_t __attribute__((aligned(16))) w_16[16];
static complex_t __attribute__((aligned(16))) w_32[32];
static complex_t __attribute__((aligned(16))) w_64[64];
static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};

/* Twiddle factors for IMDCT */
/* xcos1/xsin1: 128 pre/post-twiddle values for the 512-sample IMDCT
 * (used in the pre- and post-IFFT complex multiplies below).
 * xcos2/xsin2: 64 values, presumably for the 256-sample IMDCT whose body
 * is not fully visible in this chunk — confirm. */
static sample_t __attribute__((aligned(16))) xcos1[128];
static sample_t __attribute__((aligned(16))) xsin1[128];
static sample_t __attribute__((aligned(16))) xcos2[64];
static sample_t __attribute__((aligned(16))) xsin2[64];
/* Windowing function for Modified DCT - Thank you acroread */
/* 256 window coefficients rising monotonically from ~0 to 1.0; the
 * windowing loops below walk this table forward (*window_ptr++) for the
 * output samples and backward (*--window_ptr) for the delay line. */
sample_t imdct_window[] = {
	0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
	0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
	0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
	0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
	0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
	0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
	0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
	0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
	0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
	0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
	0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
	0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
	0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
	0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
	0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
	0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
	0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
	0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
	0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
	0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
	0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
	0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
	0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
	0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
	0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
	0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
	0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
	0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
	0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
	0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
	0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
	1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 };
177 static inline void swap_cmplx(complex_t
*a
, complex_t
*b
)
188 static inline complex_t
cmplx_mult(complex_t a
, complex_t b
)
192 ret
.real
= a
.real
* b
.real
- a
.imag
* b
.imag
;
193 ret
.imag
= a
.real
* b
.imag
+ a
.imag
* b
.real
;
199 imdct_do_512(sample_t data
[],sample_t delay
[], sample_t bias
)
217 sample_t
*window_ptr
;
219 /* 512 IMDCT with source and dest data in 'data' */
221 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
222 for( i
=0; i
< 128; i
++) {
223 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
227 int j
= bit_reverse_512
[i
];
229 buf
[i
].real
= (data
[256-2*j
-1] * xcos1
[j
]) - (data
[2*j
] * xsin1
[j
]);
230 buf
[i
].imag
= -1.0 * ((data
[2*j
] * xcos1
[j
]) + (data
[256-2*j
-1] * xsin1
[j
]));
234 /* unoptimized variant
235 for (m=1; m < 7; m++) {
241 two_m_plus_one = (1 << (m+1));
243 for(i = 0; i < 128; i += two_m_plus_one) {
244 for(k = 0; k < two_m; k++) {
247 tmp_a_r = buf[p].real;
248 tmp_a_i = buf[p].imag;
249 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
250 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
251 buf[p].real = tmp_a_r + tmp_b_r;
252 buf[p].imag = tmp_a_i + tmp_b_i;
253 buf[q].real = tmp_a_r - tmp_b_r;
254 buf[q].imag = tmp_a_i - tmp_b_i;
264 for(i
= 0; i
< 128; i
+= 2) {
265 tmp_a_r
= buf
[i
].real
;
266 tmp_a_i
= buf
[i
].imag
;
267 tmp_b_r
= buf
[i
+1].real
;
268 tmp_b_i
= buf
[i
+1].imag
;
269 buf
[i
].real
= tmp_a_r
+ tmp_b_r
;
270 buf
[i
].imag
= tmp_a_i
+ tmp_b_i
;
271 buf
[i
+1].real
= tmp_a_r
- tmp_b_r
;
272 buf
[i
+1].imag
= tmp_a_i
- tmp_b_i
;
276 // Note w[1]={{1,0}, {0,-1}}
277 for(i
= 0; i
< 128; i
+= 4) {
278 tmp_a_r
= buf
[i
].real
;
279 tmp_a_i
= buf
[i
].imag
;
280 tmp_b_r
= buf
[i
+2].real
;
281 tmp_b_i
= buf
[i
+2].imag
;
282 buf
[i
].real
= tmp_a_r
+ tmp_b_r
;
283 buf
[i
].imag
= tmp_a_i
+ tmp_b_i
;
284 buf
[i
+2].real
= tmp_a_r
- tmp_b_r
;
285 buf
[i
+2].imag
= tmp_a_i
- tmp_b_i
;
286 tmp_a_r
= buf
[i
+1].real
;
287 tmp_a_i
= buf
[i
+1].imag
;
288 tmp_b_r
= buf
[i
+3].imag
;
289 tmp_b_i
= buf
[i
+3].real
;
290 buf
[i
+1].real
= tmp_a_r
+ tmp_b_r
;
291 buf
[i
+1].imag
= tmp_a_i
- tmp_b_i
;
292 buf
[i
+3].real
= tmp_a_r
- tmp_b_r
;
293 buf
[i
+3].imag
= tmp_a_i
+ tmp_b_i
;
297 for(i
= 0; i
< 128; i
+= 8) {
298 tmp_a_r
= buf
[i
].real
;
299 tmp_a_i
= buf
[i
].imag
;
300 tmp_b_r
= buf
[i
+4].real
;
301 tmp_b_i
= buf
[i
+4].imag
;
302 buf
[i
].real
= tmp_a_r
+ tmp_b_r
;
303 buf
[i
].imag
= tmp_a_i
+ tmp_b_i
;
304 buf
[i
+4].real
= tmp_a_r
- tmp_b_r
;
305 buf
[i
+4].imag
= tmp_a_i
- tmp_b_i
;
306 tmp_a_r
= buf
[1+i
].real
;
307 tmp_a_i
= buf
[1+i
].imag
;
308 tmp_b_r
= (buf
[i
+5].real
+ buf
[i
+5].imag
) * w
[2][1].real
;
309 tmp_b_i
= (buf
[i
+5].imag
- buf
[i
+5].real
) * w
[2][1].real
;
310 buf
[1+i
].real
= tmp_a_r
+ tmp_b_r
;
311 buf
[1+i
].imag
= tmp_a_i
+ tmp_b_i
;
312 buf
[i
+5].real
= tmp_a_r
- tmp_b_r
;
313 buf
[i
+5].imag
= tmp_a_i
- tmp_b_i
;
314 tmp_a_r
= buf
[i
+2].real
;
315 tmp_a_i
= buf
[i
+2].imag
;
316 tmp_b_r
= buf
[i
+6].imag
;
317 tmp_b_i
= - buf
[i
+6].real
;
318 buf
[i
+2].real
= tmp_a_r
+ tmp_b_r
;
319 buf
[i
+2].imag
= tmp_a_i
+ tmp_b_i
;
320 buf
[i
+6].real
= tmp_a_r
- tmp_b_r
;
321 buf
[i
+6].imag
= tmp_a_i
- tmp_b_i
;
322 tmp_a_r
= buf
[i
+3].real
;
323 tmp_a_i
= buf
[i
+3].imag
;
324 tmp_b_r
= (buf
[i
+7].real
- buf
[i
+7].imag
) * w
[2][3].imag
;
325 tmp_b_i
= (buf
[i
+7].imag
+ buf
[i
+7].real
) * w
[2][3].imag
;
326 buf
[i
+3].real
= tmp_a_r
+ tmp_b_r
;
327 buf
[i
+3].imag
= tmp_a_i
+ tmp_b_i
;
328 buf
[i
+7].real
= tmp_a_r
- tmp_b_r
;
329 buf
[i
+7].imag
= tmp_a_i
- tmp_b_i
;
332 /* 4-7. iterations */
333 for (m
=3; m
< 7; m
++) {
336 two_m_plus_one
= two_m
<<1;
338 for(i
= 0; i
< 128; i
+= two_m_plus_one
) {
339 for(k
= 0; k
< two_m
; k
++) {
342 tmp_a_r
= buf
[p
].real
;
343 tmp_a_i
= buf
[p
].imag
;
344 tmp_b_r
= buf
[q
].real
* w
[m
][k
].real
- buf
[q
].imag
* w
[m
][k
].imag
;
345 tmp_b_i
= buf
[q
].imag
* w
[m
][k
].real
+ buf
[q
].real
* w
[m
][k
].imag
;
346 buf
[p
].real
= tmp_a_r
+ tmp_b_r
;
347 buf
[p
].imag
= tmp_a_i
+ tmp_b_i
;
348 buf
[q
].real
= tmp_a_r
- tmp_b_r
;
349 buf
[q
].imag
= tmp_a_i
- tmp_b_i
;
354 /* Post IFFT complex multiply plus IFFT complex conjugate*/
355 for( i
=0; i
< 128; i
++) {
356 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
357 tmp_a_r
= buf
[i
].real
;
358 tmp_a_i
= -1.0 * buf
[i
].imag
;
359 buf
[i
].real
=(tmp_a_r
* xcos1
[i
]) - (tmp_a_i
* xsin1
[i
]);
360 buf
[i
].imag
=(tmp_a_r
* xsin1
[i
]) + (tmp_a_i
* xcos1
[i
]);
365 window_ptr
= imdct_window
;
367 /* Window and convert to real valued signal */
368 for(i
=0; i
< 64; i
++) {
369 *data_ptr
++ = -buf
[64+i
].imag
* *window_ptr
++ + *delay_ptr
++ + bias
;
370 *data_ptr
++ = buf
[64-i
-1].real
* *window_ptr
++ + *delay_ptr
++ + bias
;
373 for(i
=0; i
< 64; i
++) {
374 *data_ptr
++ = -buf
[i
].real
* *window_ptr
++ + *delay_ptr
++ + bias
;
375 *data_ptr
++ = buf
[128-i
-1].imag
* *window_ptr
++ + *delay_ptr
++ + bias
;
378 /* The trailing edge of the window goes into the delay line */
381 for(i
=0; i
< 64; i
++) {
382 *delay_ptr
++ = -buf
[64+i
].real
* *--window_ptr
;
383 *delay_ptr
++ = buf
[64-i
-1].imag
* *--window_ptr
;
386 for(i
=0; i
<64; i
++) {
387 *delay_ptr
++ = buf
[i
].imag
* *--window_ptr
;
388 *delay_ptr
++ = -buf
[128-i
-1].real
* *--window_ptr
;
398 // used to build registers permutation vectors (vcprm)
399 // the 's' are for words in the _s_econd vector
400 #define WORD_0 0x00,0x01,0x02,0x03
401 #define WORD_1 0x04,0x05,0x06,0x07
402 #define WORD_2 0x08,0x09,0x0a,0x0b
403 #define WORD_3 0x0c,0x0d,0x0e,0x0f
404 #define WORD_s0 0x10,0x11,0x12,0x13
405 #define WORD_s1 0x14,0x15,0x16,0x17
406 #define WORD_s2 0x18,0x19,0x1a,0x1b
407 #define WORD_s3 0x1c,0x1d,0x1e,0x1f
410 #define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
412 #define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
415 // vcprmle is used to keep the same index as in the SSE version.
416 // it's the same as vcprm, with the index inversed
417 // ('le' is Little Endian)
418 #define vcprmle(a,b,c,d) vcprm(d,c,b,a)
420 // used to build inverse/identity vectors (vcii)
421 // n is _n_egative, p is _p_ositive
426 #define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
428 #define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
432 #define FOUROF(a) (a)
434 #define FOUROF(a) {a,a,a,a}
439 imdct_do_512_altivec(sample_t data
[],sample_t delay
[], sample_t bias
)
455 sample_t
*window_ptr
;
457 /* 512 IMDCT with source and dest data in 'data' */
459 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
460 for( i
=0; i
< 128; i
++) {
461 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
462 int j
= bit_reverse_512
[i
];
463 buf
[i
].real
= (data
[256-2*j
-1] * xcos1
[j
]) - (data
[2*j
] * xsin1
[j
]);
464 buf
[i
].imag
= -1.0 * ((data
[2*j
] * xcos1
[j
]) + (data
[256-2*j
-1] * xsin1
[j
]));
468 for(i
= 0; i
< 128; i
+= 2) {
470 tmp_a_r
= buf
[i
].real
;
471 tmp_a_i
= buf
[i
].imag
;
472 tmp_b_r
= buf
[i
+1].real
;
473 tmp_b_i
= buf
[i
+1].imag
;
474 buf
[i
].real
= tmp_a_r
+ tmp_b_r
;
475 buf
[i
].imag
= tmp_a_i
+ tmp_b_i
;
476 buf
[i
+1].real
= tmp_a_r
- tmp_b_r
;
477 buf
[i
+1].imag
= tmp_a_i
- tmp_b_i
;
479 vector
float temp
, bufv
;
481 bufv
= vec_ld(i
<< 3, (float*)buf
);
482 temp
= vec_perm(bufv
, bufv
, vcprm(2,3,0,1));
483 bufv
= vec_madd(bufv
, vcii(p
,p
,n
,n
), temp
);
484 vec_st(bufv
, i
<< 3, (float*)buf
);
489 // Note w[1]={{1,0}, {0,-1}}
490 for(i
= 0; i
< 128; i
+= 4) {
492 tmp_a_r
= buf
[i
].real
;
493 tmp_a_i
= buf
[i
].imag
;
494 tmp_b_r
= buf
[i
+2].real
;
495 tmp_b_i
= buf
[i
+2].imag
;
496 buf
[i
].real
= tmp_a_r
+ tmp_b_r
;
497 buf
[i
].imag
= tmp_a_i
+ tmp_b_i
;
498 buf
[i
+2].real
= tmp_a_r
- tmp_b_r
;
499 buf
[i
+2].imag
= tmp_a_i
- tmp_b_i
;
500 tmp_a_r
= buf
[i
+1].real
;
501 tmp_a_i
= buf
[i
+1].imag
;
502 /* WARNING: im <-> re here ! */
503 tmp_b_r
= buf
[i
+3].imag
;
504 tmp_b_i
= buf
[i
+3].real
;
505 buf
[i
+1].real
= tmp_a_r
+ tmp_b_r
;
506 buf
[i
+1].imag
= tmp_a_i
- tmp_b_i
;
507 buf
[i
+3].real
= tmp_a_r
- tmp_b_r
;
508 buf
[i
+3].imag
= tmp_a_i
+ tmp_b_i
;
510 vector
float buf01
, buf23
, temp1
, temp2
;
512 buf01
= vec_ld((i
+ 0) << 3, (float*)buf
);
513 buf23
= vec_ld((i
+ 2) << 3, (float*)buf
);
514 buf23
= vec_perm(buf23
,buf23
,vcprm(0,1,3,2));
516 temp1
= vec_madd(buf23
, vcii(p
,p
,p
,n
), buf01
);
517 temp2
= vec_madd(buf23
, vcii(n
,n
,n
,p
), buf01
);
519 vec_st(temp1
, (i
+ 0) << 3, (float*)buf
);
520 vec_st(temp2
, (i
+ 2) << 3, (float*)buf
);
525 for(i
= 0; i
< 128; i
+= 8) {
527 tmp_a_r
= buf
[i
].real
;
528 tmp_a_i
= buf
[i
].imag
;
529 tmp_b_r
= buf
[i
+4].real
;
530 tmp_b_i
= buf
[i
+4].imag
;
531 buf
[i
].real
= tmp_a_r
+ tmp_b_r
;
532 buf
[i
].imag
= tmp_a_i
+ tmp_b_i
;
533 buf
[i
+4].real
= tmp_a_r
- tmp_b_r
;
534 buf
[i
+4].imag
= tmp_a_i
- tmp_b_i
;
535 tmp_a_r
= buf
[1+i
].real
;
536 tmp_a_i
= buf
[1+i
].imag
;
537 tmp_b_r
= (buf
[i
+5].real
+ buf
[i
+5].imag
) * w
[2][1].real
;
538 tmp_b_i
= (buf
[i
+5].imag
- buf
[i
+5].real
) * w
[2][1].real
;
539 buf
[1+i
].real
= tmp_a_r
+ tmp_b_r
;
540 buf
[1+i
].imag
= tmp_a_i
+ tmp_b_i
;
541 buf
[i
+5].real
= tmp_a_r
- tmp_b_r
;
542 buf
[i
+5].imag
= tmp_a_i
- tmp_b_i
;
543 tmp_a_r
= buf
[i
+2].real
;
544 tmp_a_i
= buf
[i
+2].imag
;
545 /* WARNING re <-> im & sign */
546 tmp_b_r
= buf
[i
+6].imag
;
547 tmp_b_i
= - buf
[i
+6].real
;
548 buf
[i
+2].real
= tmp_a_r
+ tmp_b_r
;
549 buf
[i
+2].imag
= tmp_a_i
+ tmp_b_i
;
550 buf
[i
+6].real
= tmp_a_r
- tmp_b_r
;
551 buf
[i
+6].imag
= tmp_a_i
- tmp_b_i
;
552 tmp_a_r
= buf
[i
+3].real
;
553 tmp_a_i
= buf
[i
+3].imag
;
554 tmp_b_r
= (buf
[i
+7].real
- buf
[i
+7].imag
) * w
[2][3].imag
;
555 tmp_b_i
= (buf
[i
+7].imag
+ buf
[i
+7].real
) * w
[2][3].imag
;
556 buf
[i
+3].real
= tmp_a_r
+ tmp_b_r
;
557 buf
[i
+3].imag
= tmp_a_i
+ tmp_b_i
;
558 buf
[i
+7].real
= tmp_a_r
- tmp_b_r
;
559 buf
[i
+7].imag
= tmp_a_i
- tmp_b_i
;
561 vector
float buf01
, buf23
, buf45
, buf67
;
563 buf01
= vec_ld((i
+ 0) << 3, (float*)buf
);
564 buf23
= vec_ld((i
+ 2) << 3, (float*)buf
);
566 tmp_b_r
= (buf
[i
+5].real
+ buf
[i
+5].imag
) * w
[2][1].real
;
567 tmp_b_i
= (buf
[i
+5].imag
- buf
[i
+5].real
) * w
[2][1].real
;
568 buf
[i
+5].real
= tmp_b_r
;
569 buf
[i
+5].imag
= tmp_b_i
;
570 tmp_b_r
= (buf
[i
+7].real
- buf
[i
+7].imag
) * w
[2][3].imag
;
571 tmp_b_i
= (buf
[i
+7].imag
+ buf
[i
+7].real
) * w
[2][3].imag
;
572 buf
[i
+7].real
= tmp_b_r
;
573 buf
[i
+7].imag
= tmp_b_i
;
575 buf23
= vec_ld((i
+ 2) << 3, (float*)buf
);
576 buf45
= vec_ld((i
+ 4) << 3, (float*)buf
);
577 buf67
= vec_ld((i
+ 6) << 3, (float*)buf
);
578 buf67
= vec_perm(buf67
, buf67
, vcprm(1,0,2,3));
580 vec_st(vec_add(buf01
, buf45
), (i
+ 0) << 3, (float*)buf
);
581 vec_st(vec_madd(buf67
, vcii(p
,n
,p
,p
), buf23
), (i
+ 2) << 3, (float*)buf
);
582 vec_st(vec_sub(buf01
, buf45
), (i
+ 4) << 3, (float*)buf
);
583 vec_st(vec_nmsub(buf67
, vcii(p
,n
,p
,p
), buf23
), (i
+ 6) << 3, (float*)buf
);
587 /* 4-7. iterations */
588 for (m
=3; m
< 7; m
++) {
591 two_m_plus_one
= two_m
<<1;
593 for(i
= 0; i
< 128; i
+= two_m_plus_one
) {
594 for(k
= 0; k
< two_m
; k
+=2) {
598 tmp_a_r
= buf
[p
].real
;
599 tmp_a_i
= buf
[p
].imag
;
601 buf
[q
].real
* w
[m
][k
].real
-
602 buf
[q
].imag
* w
[m
][k
].imag
;
604 buf
[q
].imag
* w
[m
][k
].real
+
605 buf
[q
].real
* w
[m
][k
].imag
;
606 buf
[p
].real
= tmp_a_r
+ tmp_b_r
;
607 buf
[p
].imag
= tmp_a_i
+ tmp_b_i
;
608 buf
[q
].real
= tmp_a_r
- tmp_b_r
;
609 buf
[q
].imag
= tmp_a_i
- tmp_b_i
;
611 tmp_a_r
= buf
[(p
+ 1)].real
;
612 tmp_a_i
= buf
[(p
+ 1)].imag
;
614 buf
[(q
+ 1)].real
* w
[m
][(k
+ 1)].real
-
615 buf
[(q
+ 1)].imag
* w
[m
][(k
+ 1)].imag
;
617 buf
[(q
+ 1)].imag
* w
[m
][(k
+ 1)].real
+
618 buf
[(q
+ 1)].real
* w
[m
][(k
+ 1)].imag
;
619 buf
[(p
+ 1)].real
= tmp_a_r
+ tmp_b_r
;
620 buf
[(p
+ 1)].imag
= tmp_a_i
+ tmp_b_i
;
621 buf
[(q
+ 1)].real
= tmp_a_r
- tmp_b_r
;
622 buf
[(q
+ 1)].imag
= tmp_a_i
- tmp_b_i
;
626 vector
float vecp
, vecq
, vecw
, temp1
, temp2
, temp3
, temp4
;
627 const vector
float vczero
= (const vector
float)FOUROF(0.);
628 // first compute buf[q] and buf[q+1]
629 vecq
= vec_ld(q
<< 3, (float*)buf
);
630 vecw
= vec_ld(0, (float*)&(w
[m
][k
]));
631 temp1
= vec_madd(vecq
, vecw
, vczero
);
632 temp2
= vec_perm(vecq
, vecq
, vcprm(1,0,3,2));
633 temp2
= vec_madd(temp2
, vecw
, vczero
);
634 temp3
= vec_perm(temp1
, temp2
, vcprm(0,s0
,2,s2
));
635 temp4
= vec_perm(temp1
, temp2
, vcprm(1,s1
,3,s3
));
636 vecq
= vec_madd(temp4
, vcii(n
,p
,n
,p
), temp3
);
637 // then butterfly with buf[p] and buf[p+1]
638 vecp
= vec_ld(p
<< 3, (float*)buf
);
640 temp1
= vec_add(vecp
, vecq
);
641 temp2
= vec_sub(vecp
, vecq
);
643 vec_st(temp1
, p
<< 3, (float*)buf
);
644 vec_st(temp2
, q
<< 3, (float*)buf
);
650 /* Post IFFT complex multiply plus IFFT complex conjugate*/
651 for( i
=0; i
< 128; i
+=4) {
652 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
654 tmp_a_r
= buf
[(i
+ 0)].real
;
655 tmp_a_i
= -1.0 * buf
[(i
+ 0)].imag
;
657 (tmp_a_r
* xcos1
[(i
+ 0)]) - (tmp_a_i
* xsin1
[(i
+ 0)]);
659 (tmp_a_r
* xsin1
[(i
+ 0)]) + (tmp_a_i
* xcos1
[(i
+ 0)]);
661 tmp_a_r
= buf
[(i
+ 1)].real
;
662 tmp_a_i
= -1.0 * buf
[(i
+ 1)].imag
;
664 (tmp_a_r
* xcos1
[(i
+ 1)]) - (tmp_a_i
* xsin1
[(i
+ 1)]);
666 (tmp_a_r
* xsin1
[(i
+ 1)]) + (tmp_a_i
* xcos1
[(i
+ 1)]);
668 tmp_a_r
= buf
[(i
+ 2)].real
;
669 tmp_a_i
= -1.0 * buf
[(i
+ 2)].imag
;
671 (tmp_a_r
* xcos1
[(i
+ 2)]) - (tmp_a_i
* xsin1
[(i
+ 2)]);
673 (tmp_a_r
* xsin1
[(i
+ 2)]) + (tmp_a_i
* xcos1
[(i
+ 2)]);
675 tmp_a_r
= buf
[(i
+ 3)].real
;
676 tmp_a_i
= -1.0 * buf
[(i
+ 3)].imag
;
678 (tmp_a_r
* xcos1
[(i
+ 3)]) - (tmp_a_i
* xsin1
[(i
+ 3)]);
680 (tmp_a_r
* xsin1
[(i
+ 3)]) + (tmp_a_i
* xcos1
[(i
+ 3)]);
682 vector
float bufv_0
, bufv_2
, cosv
, sinv
, temp1
, temp2
;
683 vector
float temp0022
, temp1133
, tempCS01
;
684 const vector
float vczero
= (const vector
float)FOUROF(0.);
686 bufv_0
= vec_ld((i
+ 0) << 3, (float*)buf
);
687 bufv_2
= vec_ld((i
+ 2) << 3, (float*)buf
);
689 cosv
= vec_ld(i
<< 2, xcos1
);
690 sinv
= vec_ld(i
<< 2, xsin1
);
692 temp0022
= vec_perm(bufv_0
, bufv_0
, vcprm(0,0,2,2));
693 temp1133
= vec_perm(bufv_0
, bufv_0
, vcprm(1,1,3,3));
694 tempCS01
= vec_perm(cosv
, sinv
, vcprm(0,s0
,1,s1
));
695 temp1
= vec_madd(temp0022
, tempCS01
, vczero
);
696 tempCS01
= vec_perm(cosv
, sinv
, vcprm(s0
,0,s1
,1));
697 temp2
= vec_madd(temp1133
, tempCS01
, vczero
);
698 bufv_0
= vec_madd(temp2
, vcii(p
,n
,p
,n
), temp1
);
700 vec_st(bufv_0
, (i
+ 0) << 3, (float*)buf
);
702 /* idem with bufv_2 and high-order cosv/sinv */
704 temp0022
= vec_perm(bufv_2
, bufv_2
, vcprm(0,0,2,2));
705 temp1133
= vec_perm(bufv_2
, bufv_2
, vcprm(1,1,3,3));
706 tempCS01
= vec_perm(cosv
, sinv
, vcprm(2,s2
,3,s3
));
707 temp1
= vec_madd(temp0022
, tempCS01
, vczero
);
708 tempCS01
= vec_perm(cosv
, sinv
, vcprm(s2
,2,s3
,3));
709 temp2
= vec_madd(temp1133
, tempCS01
, vczero
);
710 bufv_2
= vec_madd(temp2
, vcii(p
,n
,p
,n
), temp1
);
712 vec_st(bufv_2
, (i
+ 2) << 3, (float*)buf
);
719 window_ptr
= imdct_window
;
721 /* Window and convert to real valued signal */
722 for(i
=0; i
< 64; i
++) {
723 *data_ptr
++ = -buf
[64+i
].imag
* *window_ptr
++ + *delay_ptr
++ + bias
;
724 *data_ptr
++ = buf
[64-i
-1].real
* *window_ptr
++ + *delay_ptr
++ + bias
;
727 for(i
=0; i
< 64; i
++) {
728 *data_ptr
++ = -buf
[i
].real
* *window_ptr
++ + *delay_ptr
++ + bias
;
729 *data_ptr
++ = buf
[128-i
-1].imag
* *window_ptr
++ + *delay_ptr
++ + bias
;
732 /* The trailing edge of the window goes into the delay line */
735 for(i
=0; i
< 64; i
++) {
736 *delay_ptr
++ = -buf
[64+i
].real
* *--window_ptr
;
737 *delay_ptr
++ = buf
[64-i
-1].imag
* *--window_ptr
;
740 for(i
=0; i
<64; i
++) {
741 *delay_ptr
++ = buf
[i
].imag
* *--window_ptr
;
742 *delay_ptr
++ = -buf
[128-i
-1].real
* *--window_ptr
;
748 // Stuff below this line is borrowed from libac3
750 #if defined(ARCH_X86) || defined(ARCH_X86_64)
754 #include "srfftp_3dnow.h"
/* Sign-mask constants for the 3DNow! FFT: XOR with 0x80000000 flips an
 * IEEE-754 float's sign bit, so these presumably negate the second /
 * first lane of a packed complex pair via pxor — confirm usage in
 * srfftp_3dnow.h / imdct_3dnow.h. */
const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
/* sqrt(2)/2 in both lanes, for the split-radix butterflies of the 3DNow!
 * FFT (see the included srfftp_3dnow.h — TODO confirm exact use). */
const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
761 #include "imdct_3dnow.h"
763 #include "imdct_3dnow.h"
766 imdct_do_512_sse(sample_t data
[],sample_t delay
[], sample_t bias
)
773 long two_m_plus_one_shl3
;
774 complex_t
*buf_offset
;
783 sample_t
*window_ptr
;
785 /* 512 IMDCT with source and dest data in 'data' */
786 /* see the c version (dct_do_512()), its allmost identical, just in C */
788 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
789 /* Bit reversed shuffling */
791 "xor %%"REG_S
", %%"REG_S
" \n\t"
792 "lea "MANGLE(bit_reverse_512
)", %%"REG_a
"\n\t"
793 "mov $1008, %%"REG_D
" \n\t"
794 "push %%"REG_BP
" \n\t" //use ebp without telling gcc
797 "movlps (%0, %%"REG_S
"), %%xmm0 \n\t" // XXXI
798 "movhps 8(%0, %%"REG_D
"), %%xmm0 \n\t" // RXXI
799 "movlps 8(%0, %%"REG_S
"), %%xmm1 \n\t" // XXXi
800 "movhps (%0, %%"REG_D
"), %%xmm1 \n\t" // rXXi
801 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR
802 "movaps "MANGLE(sseSinCos1c
)"(%%"REG_S
"), %%xmm2\n\t"
803 "mulps %%xmm0, %%xmm2 \n\t"
804 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI
805 "mulps "MANGLE(sseSinCos1d
)"(%%"REG_S
"), %%xmm0\n\t"
806 "subps %%xmm0, %%xmm2 \n\t"
807 "movzb (%%"REG_a
"), %%"REG_d
" \n\t"
808 "movzb 1(%%"REG_a
"), %%"REG_BP
" \n\t"
809 "movlps %%xmm2, (%1, %%"REG_d
", 8) \n\t"
810 "movhps %%xmm2, (%1, %%"REG_BP
", 8) \n\t"
811 "add $16, %%"REG_S
" \n\t"
812 "add $2, %%"REG_a
" \n\t" // avoid complex addressing for P4 crap
813 "sub $16, %%"REG_D
" \n\t"
815 "pop %%"REG_BP
" \n\t"//no we didnt touch ebp *g*
816 :: "r" (data
), "r" (buf
)
817 : "%"REG_S
, "%"REG_D
, "%"REG_a
, "%"REG_d
822 /* unoptimized variant
823 for (m=1; m < 7; m++) {
829 two_m_plus_one = (1 << (m+1));
831 for(i = 0; i < 128; i += two_m_plus_one) {
832 for(k = 0; k < two_m; k++) {
835 tmp_a_r = buf[p].real;
836 tmp_a_i = buf[p].imag;
837 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
838 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
839 buf[p].real = tmp_a_r + tmp_b_r;
840 buf[p].imag = tmp_a_i + tmp_b_i;
841 buf[q].real = tmp_a_r - tmp_b_r;
842 buf[q].imag = tmp_a_i - tmp_b_i;
849 // Note w[0][0]={1,0}
851 "xorps %%xmm1, %%xmm1 \n\t"
852 "xorps %%xmm2, %%xmm2 \n\t"
853 "mov %0, %%"REG_S
" \n\t"
856 "movlps (%%"REG_S
"), %%xmm0\n\t" //buf[p]
857 "movlps 8(%%"REG_S
"), %%xmm1\n\t" //buf[q]
858 "movhps (%%"REG_S
"), %%xmm0\n\t" //buf[p]
859 "movhps 8(%%"REG_S
"), %%xmm2\n\t" //buf[q]
860 "addps %%xmm1, %%xmm0 \n\t"
861 "subps %%xmm2, %%xmm0 \n\t"
862 "movaps %%xmm0, (%%"REG_S
")\n\t"
863 "add $16, %%"REG_S
" \n\t"
864 "cmp %1, %%"REG_S
" \n\t"
866 :: "g" (buf
), "r" (buf
+ 128)
871 // Note w[1]={{1,0}, {0,-1}}
873 "movaps "MANGLE(ps111_1
)", %%xmm7\n\t" // 1,1,1,-1
874 "mov %0, %%"REG_S
" \n\t"
877 "movaps 16(%%"REG_S
"), %%xmm2 \n\t" //r2,i2,r3,i3
878 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3
879 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3
880 "movaps (%%"REG_S
"), %%xmm0 \n\t" //r0,i0,r1,i1
881 "movaps (%%"REG_S
"), %%xmm1 \n\t" //r0,i0,r1,i1
882 "addps %%xmm2, %%xmm0 \n\t"
883 "subps %%xmm2, %%xmm1 \n\t"
884 "movaps %%xmm0, (%%"REG_S
") \n\t"
885 "movaps %%xmm1, 16(%%"REG_S
") \n\t"
886 "add $32, %%"REG_S
" \n\t"
887 "cmp %1, %%"REG_S
" \n\t"
889 :: "g" (buf
), "r" (buf
+ 128)
895 Note sseW2+0={1,1,sqrt(2),sqrt(2))
896 Note sseW2+16={0,0,sqrt(2),-sqrt(2))
897 Note sseW2+32={0,0,-sqrt(2),-sqrt(2))
898 Note sseW2+48={1,-1,sqrt(2),-sqrt(2))
901 "movaps 48+"MANGLE(sseW2
)", %%xmm6\n\t"
902 "movaps 16+"MANGLE(sseW2
)", %%xmm7\n\t"
903 "xorps %%xmm5, %%xmm5 \n\t"
904 "xorps %%xmm2, %%xmm2 \n\t"
905 "mov %0, %%"REG_S
" \n\t"
908 "movaps 32(%%"REG_S
"), %%xmm2 \n\t" //r4,i4,r5,i5
909 "movaps 48(%%"REG_S
"), %%xmm3 \n\t" //r6,i6,r7,i7
910 "movaps "MANGLE(sseW2
)", %%xmm4 \n\t" //r4,i4,r5,i5
911 "movaps 32+"MANGLE(sseW2
)", %%xmm5\n\t" //r6,i6,r7,i7
912 "mulps %%xmm2, %%xmm4 \n\t"
913 "mulps %%xmm3, %%xmm5 \n\t"
914 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5
915 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7
916 "mulps %%xmm6, %%xmm3 \n\t"
917 "mulps %%xmm7, %%xmm2 \n\t"
918 "movaps (%%"REG_S
"), %%xmm0 \n\t" //r0,i0,r1,i1
919 "movaps 16(%%"REG_S
"), %%xmm1 \n\t" //r2,i2,r3,i3
920 "addps %%xmm4, %%xmm2 \n\t"
921 "addps %%xmm5, %%xmm3 \n\t"
922 "movaps %%xmm2, %%xmm4 \n\t"
923 "movaps %%xmm3, %%xmm5 \n\t"
924 "addps %%xmm0, %%xmm2 \n\t"
925 "addps %%xmm1, %%xmm3 \n\t"
926 "subps %%xmm4, %%xmm0 \n\t"
927 "subps %%xmm5, %%xmm1 \n\t"
928 "movaps %%xmm2, (%%"REG_S
") \n\t"
929 "movaps %%xmm3, 16(%%"REG_S
") \n\t"
930 "movaps %%xmm0, 32(%%"REG_S
") \n\t"
931 "movaps %%xmm1, 48(%%"REG_S
") \n\t"
932 "add $64, %%"REG_S
" \n\t"
933 "cmp %1, %%"REG_S
" \n\t"
935 :: "g" (buf
), "r" (buf
+ 128)
939 /* 4-7. iterations */
940 for (m
=3; m
< 7; m
++) {
942 two_m_plus_one
= two_m
<<1;
943 two_m_plus_one_shl3
= (two_m_plus_one
<<3);
944 buf_offset
= buf
+128;
946 "mov %0, %%"REG_S
" \n\t"
949 "xor %%"REG_D
", %%"REG_D
" \n\t" // k
950 "lea (%%"REG_S
", %3), %%"REG_d
" \n\t"
952 "movaps (%%"REG_d
", %%"REG_D
"), %%xmm1 \n\t"
953 "movaps (%4, %%"REG_D
", 2), %%xmm2 \n\t"
954 "mulps %%xmm1, %%xmm2 \n\t"
955 "shufps $0xB1, %%xmm1, %%xmm1 \n\t"
956 "mulps 16(%4, %%"REG_D
", 2), %%xmm1 \n\t"
957 "movaps (%%"REG_S
", %%"REG_D
"), %%xmm0 \n\t"
958 "addps %%xmm2, %%xmm1 \n\t"
959 "movaps %%xmm1, %%xmm2 \n\t"
960 "addps %%xmm0, %%xmm1 \n\t"
961 "subps %%xmm2, %%xmm0 \n\t"
962 "movaps %%xmm1, (%%"REG_S
", %%"REG_D
") \n\t"
963 "movaps %%xmm0, (%%"REG_d
", %%"REG_D
") \n\t"
964 "add $16, %%"REG_D
" \n\t"
965 "cmp %3, %%"REG_D
" \n\t" //FIXME (opt) count against 0
967 "add %2, %%"REG_S
" \n\t"
968 "cmp %1, %%"REG_S
" \n\t"
970 :: "g" (buf
), "m" (buf_offset
), "m" (two_m_plus_one_shl3
), "r" (two_m
<<3),
972 : "%"REG_S
, "%"REG_D
, "%"REG_d
976 /* Post IFFT complex multiply plus IFFT complex conjugate*/
978 "mov $-1024, %%"REG_S
" \n\t"
981 "movaps (%0, %%"REG_S
"), %%xmm0 \n\t"
982 "movaps (%0, %%"REG_S
"), %%xmm1 \n\t"
983 "shufps $0xB1, %%xmm0, %%xmm0 \n\t"
984 "mulps 1024+"MANGLE(sseSinCos1c
)"(%%"REG_S
"), %%xmm1\n\t"
985 "mulps 1024+"MANGLE(sseSinCos1d
)"(%%"REG_S
"), %%xmm0\n\t"
986 "addps %%xmm1, %%xmm0 \n\t"
987 "movaps %%xmm0, (%0, %%"REG_S
") \n\t"
988 "add $16, %%"REG_S
" \n\t"
997 window_ptr
= imdct_window
;
999 /* Window and convert to real valued signal */
1001 "xor %%"REG_D
", %%"REG_D
" \n\t" // 0
1002 "xor %%"REG_S
", %%"REG_S
" \n\t" // 0
1003 "movss %3, %%xmm2 \n\t" // bias
1004 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
1007 "movlps (%0, %%"REG_S
"), %%xmm0 \n\t" // ? ? A ?
1008 "movlps 8(%0, %%"REG_S
"), %%xmm1 \n\t" // ? ? C ?
1009 "movhps -16(%0, %%"REG_D
"), %%xmm1 \n\t" // ? D C ?
1010 "movhps -8(%0, %%"REG_D
"), %%xmm0 \n\t" // ? B A ?
1011 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1012 "mulps "MANGLE(sseWindow
)"(%%"REG_S
"), %%xmm0\n\t"
1013 "addps (%2, %%"REG_S
"), %%xmm0 \n\t"
1014 "addps %%xmm2, %%xmm0 \n\t"
1015 "movaps %%xmm0, (%1, %%"REG_S
") \n\t"
1016 "add $16, %%"REG_S
" \n\t"
1017 "sub $16, %%"REG_D
" \n\t"
1018 "cmp $512, %%"REG_S
" \n\t"
1020 :: "r" (buf
+64), "r" (data_ptr
), "r" (delay_ptr
), "m" (bias
)
1021 : "%"REG_S
, "%"REG_D
1028 "mov $1024, %%"REG_D
" \n\t" // 512
1029 "xor %%"REG_S
", %%"REG_S
" \n\t" // 0
1030 "movss %3, %%xmm2 \n\t" // bias
1031 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
1034 "movlps (%0, %%"REG_S
"), %%xmm0 \n\t" // ? ? ? A
1035 "movlps 8(%0, %%"REG_S
"), %%xmm1 \n\t" // ? ? ? C
1036 "movhps -16(%0, %%"REG_D
"), %%xmm1 \n\t" // D ? ? C
1037 "movhps -8(%0, %%"REG_D
"), %%xmm0 \n\t" // B ? ? A
1038 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1039 "mulps 512+"MANGLE(sseWindow
)"(%%"REG_S
"), %%xmm0\n\t"
1040 "addps (%2, %%"REG_S
"), %%xmm0 \n\t"
1041 "addps %%xmm2, %%xmm0 \n\t"
1042 "movaps %%xmm0, (%1, %%"REG_S
") \n\t"
1043 "add $16, %%"REG_S
" \n\t"
1044 "sub $16, %%"REG_D
" \n\t"
1045 "cmp $512, %%"REG_S
" \n\t"
1047 :: "r" (buf
), "r" (data_ptr
), "r" (delay_ptr
), "m" (bias
)
1048 : "%"REG_S
, "%"REG_D
1053 /* The trailing edge of the window goes into the delay line */
1057 "xor %%"REG_D
", %%"REG_D
" \n\t" // 0
1058 "xor %%"REG_S
", %%"REG_S
" \n\t" // 0
1061 "movlps (%0, %%"REG_S
"), %%xmm0 \n\t" // ? ? ? A
1062 "movlps 8(%0, %%"REG_S
"), %%xmm1 \n\t" // ? ? ? C
1063 "movhps -16(%0, %%"REG_D
"), %%xmm1 \n\t" // D ? ? C
1064 "movhps -8(%0, %%"REG_D
"), %%xmm0 \n\t" // B ? ? A
1065 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1066 "mulps 1024+"MANGLE(sseWindow
)"(%%"REG_S
"), %%xmm0\n\t"
1067 "movaps %%xmm0, (%1, %%"REG_S
") \n\t"
1068 "add $16, %%"REG_S
" \n\t"
1069 "sub $16, %%"REG_D
" \n\t"
1070 "cmp $512, %%"REG_S
" \n\t"
1072 :: "r" (buf
+64), "r" (delay_ptr
)
1073 : "%"REG_S
, "%"REG_D
1079 "mov $1024, %%"REG_D
" \n\t" // 1024
1080 "xor %%"REG_S
", %%"REG_S
" \n\t" // 0
1083 "movlps (%0, %%"REG_S
"), %%xmm0 \n\t" // ? ? A ?
1084 "movlps 8(%0, %%"REG_S
"), %%xmm1 \n\t" // ? ? C ?
1085 "movhps -16(%0, %%"REG_D
"), %%xmm1 \n\t" // ? D C ?
1086 "movhps -8(%0, %%"REG_D
"), %%xmm0 \n\t" // ? B A ?
1087 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1088 "mulps 1536+"MANGLE(sseWindow
)"(%%"REG_S
"), %%xmm0\n\t"
1089 "movaps %%xmm0, (%1, %%"REG_S
") \n\t"
1090 "add $16, %%"REG_S
" \n\t"
1091 "sub $16, %%"REG_D
" \n\t"
1092 "cmp $512, %%"REG_S
" \n\t"
1094 :: "r" (buf
), "r" (delay_ptr
)
1095 : "%"REG_S
, "%"REG_D
1098 #endif // ARCH_X86 || ARCH_X86_64
1101 imdct_do_256(sample_t data
[],sample_t delay
[],sample_t bias
)
1115 sample_t
*delay_ptr
;
1116 sample_t
*window_ptr
;
1118 complex_t
*buf_1
, *buf_2
;
1123 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
1124 for(k
=0; k
<64; k
++) {
1125 /* X1[k] = X[2*k] */
1126 /* X2[k] = X[2*k+1] */
1128 p
= 2 * (128-2*k
-1);
1131 /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
1132 buf_1
[k
].real
= data
[p
] * xcos2
[k
] - data
[q
] * xsin2
[k
];
1133 buf_1
[k
].imag
= -1.0f
* (data
[q
] * xcos2
[k
] + data
[p
] * xsin2
[k
]);
1134 /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
1135 buf_2
[k
].real
= data
[p
+ 1] * xcos2
[k
] - data
[q
+ 1] * xsin2
[k
];
1136 buf_2
[k
].imag
= -1.0f
* ( data
[q
+ 1] * xcos2
[k
] + data
[p
+ 1] * xsin2
[k
]);
1139 /* IFFT Bit reversed shuffling */
1140 for(i
=0; i
<64; i
++) {
1141 k
= bit_reverse_256
[i
];
1143 swap_cmplx(&buf_1
[i
],&buf_1
[k
]);
1144 swap_cmplx(&buf_2
[i
],&buf_2
[k
]);
1149 for (m
=0; m
< 6; m
++) {
1151 two_m_plus_one
= (1 << (m
+1));
1159 for(k
= 0; k
< two_m
; k
++) {
1160 for(i
= 0; i
< 64; i
+= two_m_plus_one
) {
1164 tmp_a_r
= buf_1
[p
].real
;
1165 tmp_a_i
= buf_1
[p
].imag
;
1166 tmp_b_r
= buf_1
[q
].real
* w
[m
][k
].real
- buf_1
[q
].imag
* w
[m
][k
].imag
;
1167 tmp_b_i
= buf_1
[q
].imag
* w
[m
][k
].real
+ buf_1
[q
].real
* w
[m
][k
].imag
;
1168 buf_1
[p
].real
= tmp_a_r
+ tmp_b_r
;
1169 buf_1
[p
].imag
= tmp_a_i
+ tmp_b_i
;
1170 buf_1
[q
].real
= tmp_a_r
- tmp_b_r
;
1171 buf_1
[q
].imag
= tmp_a_i
- tmp_b_i
;
1174 tmp_a_r
= buf_2
[p
].real
;
1175 tmp_a_i
= buf_2
[p
].imag
;
1176 tmp_b_r
= buf_2
[q
].real
* w
[m
][k
].real
- buf_2
[q
].imag
* w
[m
][k
].imag
;
1177 tmp_b_i
= buf_2
[q
].imag
* w
[m
][k
].real
+ buf_2
[q
].real
* w
[m
][k
].imag
;
1178 buf_2
[p
].real
= tmp_a_r
+ tmp_b_r
;
1179 buf_2
[p
].imag
= tmp_a_i
+ tmp_b_i
;
1180 buf_2
[q
].real
= tmp_a_r
- tmp_b_r
;
1181 buf_2
[q
].imag
= tmp_a_i
- tmp_b_i
;
1186 /* Post IFFT complex multiply */
1187 for( i
=0; i
< 64; i
++) {
1188 /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
1189 tmp_a_r
= buf_1
[i
].real
;
1190 tmp_a_i
= -buf_1
[i
].imag
;
1191 buf_1
[i
].real
=(tmp_a_r
* xcos2
[i
]) - (tmp_a_i
* xsin2
[i
]);
1192 buf_1
[i
].imag
=(tmp_a_r
* xsin2
[i
]) + (tmp_a_i
* xcos2
[i
]);
1193 /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */
1194 tmp_a_r
= buf_2
[i
].real
;
1195 tmp_a_i
= -buf_2
[i
].imag
;
1196 buf_2
[i
].real
=(tmp_a_r
* xcos2
[i
]) - (tmp_a_i
* xsin2
[i
]);
1197 buf_2
[i
].imag
=(tmp_a_r
* xsin2
[i
]) + (tmp_a_i
* xcos2
[i
]);
1202 window_ptr
= imdct_window
;
1204 /* Window and convert to real valued signal */
1205 for(i
=0; i
< 64; i
++) {
1206 *data_ptr
++ = -buf_1
[i
].imag
* *window_ptr
++ + *delay_ptr
++ + bias
;
1207 *data_ptr
++ = buf_1
[64-i
-1].real
* *window_ptr
++ + *delay_ptr
++ + bias
;
1210 for(i
=0; i
< 64; i
++) {
1211 *data_ptr
++ = -buf_1
[i
].real
* *window_ptr
++ + *delay_ptr
++ + bias
;
1212 *data_ptr
++ = buf_1
[64-i
-1].imag
* *window_ptr
++ + *delay_ptr
++ + bias
;
1217 for(i
=0; i
< 64; i
++) {
1218 *delay_ptr
++ = -buf_2
[i
].real
* *--window_ptr
;
1219 *delay_ptr
++ = buf_2
[64-i
-1].imag
* *--window_ptr
;
1222 for(i
=0; i
< 64; i
++) {
1223 *delay_ptr
++ = buf_2
[i
].imag
* *--window_ptr
;
1224 *delay_ptr
++ = -buf_2
[64-i
-1].real
* *--window_ptr
;
1228 void imdct_init (uint32_t mm_accel
)
1231 if (mm_accel
& MM_ACCEL_MLIB
) {
1232 fprintf (stderr
, "Using mlib for IMDCT transform\n");
1233 imdct_512
= imdct_do_512_mlib
;
1234 imdct_256
= imdct_do_256_mlib
;
1240 /* Twiddle factors to turn IFFT into IMDCT */
1241 for (i
= 0; i
< 128; i
++) {
1242 xcos1
[i
] = -cos ((M_PI
/ 2048) * (8 * i
+ 1));
1243 xsin1
[i
] = -sin ((M_PI
/ 2048) * (8 * i
+ 1));
1245 #if defined(ARCH_X86) || defined(ARCH_X86_64)
1246 for (i
= 0; i
< 128; i
++) {
1247 sseSinCos1c
[2*i
+0]= xcos1
[i
];
1248 sseSinCos1c
[2*i
+1]= -xcos1
[i
];
1249 sseSinCos1d
[2*i
+0]= xsin1
[i
];
1250 sseSinCos1d
[2*i
+1]= xsin1
[i
];
1254 /* More twiddle factors to turn IFFT into IMDCT */
1255 for (i
= 0; i
< 64; i
++) {
1256 xcos2
[i
] = -cos ((M_PI
/ 1024) * (8 * i
+ 1));
1257 xsin2
[i
] = -sin ((M_PI
/ 1024) * (8 * i
+ 1));
1260 for (i
= 0; i
< 7; i
++) {
1262 for (k
= 0; k
< j
; k
++) {
1263 w
[i
][k
].real
= cos (-M_PI
* k
/ j
);
1264 w
[i
][k
].imag
= sin (-M_PI
* k
/ j
);
1267 #if defined(ARCH_X86) || defined(ARCH_X86_64)
1268 for (i
= 1; i
< 7; i
++) {
1270 for (k
= 0; k
< j
; k
+=2) {
1272 sseW
[i
][4*k
+ 0] = w
[i
][k
+0].real
;
1273 sseW
[i
][4*k
+ 1] = w
[i
][k
+0].real
;
1274 sseW
[i
][4*k
+ 2] = w
[i
][k
+1].real
;
1275 sseW
[i
][4*k
+ 3] = w
[i
][k
+1].real
;
1277 sseW
[i
][4*k
+ 4] = -w
[i
][k
+0].imag
;
1278 sseW
[i
][4*k
+ 5] = w
[i
][k
+0].imag
;
1279 sseW
[i
][4*k
+ 6] = -w
[i
][k
+1].imag
;
1280 sseW
[i
][4*k
+ 7] = w
[i
][k
+1].imag
;
1282 //we multiply more or less uninitalized numbers so we need to use exactly 0.0
1285 // sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
1286 sseW
[i
][4*k
+ 4]= sseW
[i
][4*k
+ 5]= 0.0;
1291 sseW
[i
][4*k
+ 0]= sseW
[i
][4*k
+ 1]= 0.0;
1292 // sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
1297 for(i
=0; i
<128; i
++)
1299 sseWindow
[2*i
+0]= -imdct_window
[2*i
+0];
1300 sseWindow
[2*i
+1]= imdct_window
[2*i
+1];
1305 sseWindow
[256 + 2*i
+0]= -imdct_window
[254 - 2*i
+1];
1306 sseWindow
[256 + 2*i
+1]= imdct_window
[254 - 2*i
+0];
1307 sseWindow
[384 + 2*i
+0]= imdct_window
[126 - 2*i
+1];
1308 sseWindow
[384 + 2*i
+1]= -imdct_window
[126 - 2*i
+0];
1310 #endif // ARCH_X86 || ARCH_X86_64
1312 imdct_512
= imdct_do_512
;
1313 #if defined(ARCH_X86) || defined(ARCH_X86_64)
1314 if(mm_accel
& MM_ACCEL_X86_SSE
)
1316 fprintf (stderr
, "Using SSE optimized IMDCT transform\n");
1317 imdct_512
= imdct_do_512_sse
;
1320 if(mm_accel
& MM_ACCEL_X86_3DNOWEXT
)
1322 fprintf (stderr
, "Using 3DNowEx optimized IMDCT transform\n");
1323 imdct_512
= imdct_do_512_3dnowex
;
1326 if(mm_accel
& MM_ACCEL_X86_3DNOW
)
1328 fprintf (stderr
, "Using 3DNow optimized IMDCT transform\n");
1329 imdct_512
= imdct_do_512_3dnow
;
1332 #endif // ARCH_X86 || ARCH_X86_64
1334 if (mm_accel
& MM_ACCEL_PPC_ALTIVEC
)
1336 fprintf(stderr
, "Using AltiVec optimized IMDCT transform\n");
1337 imdct_512
= imdct_do_512_altivec
;
1341 fprintf (stderr
, "No accelerated IMDCT transform found\n");
1342 imdct_256
= imdct_do_256
;
1346 static void fft_asmb(int k
, complex_t
*x
, complex_t
*wTB
,
1347 const complex_t
*d
, const complex_t
*d_3
)
1349 register complex_t
*x2k
, *x3k
, *x4k
, *wB
;
1350 register float a_r
, a_i
, a1_r
, a1_i
, u_r
, u_i
, v_r
, v_i
;
1357 TRANSZERO(x
[0],x2k
[0],x3k
[0],x4k
[0]);
1358 TRANS(x
[1],x2k
[1],x3k
[1],x4k
[1],wTB
[1],wB
[1],d
[1],d_3
[1]);
1362 TRANS(x
[2],x2k
[2],x3k
[2],x4k
[2],wTB
[2],wB
[2],d
[2],d_3
[2]);
1363 TRANS(x
[3],x2k
[3],x3k
[3],x4k
[3],wTB
[3],wB
[3],d
[3],d_3
[3]);
1377 static void fft_asmb16(complex_t
*x
, complex_t
*wTB
)
1379 register float a_r
, a_i
, a1_r
, a1_i
, u_r
, u_i
, v_r
, v_i
;
1382 /* transform x[0], x[8], x[4], x[12] */
1383 TRANSZERO(x
[0],x
[4],x
[8],x
[12]);
1385 /* transform x[1], x[9], x[5], x[13] */
1386 TRANS(x
[1],x
[5],x
[9],x
[13],wTB
[1],wTB
[5],delta16
[1],delta16_3
[1]);
1388 /* transform x[2], x[10], x[6], x[14] */
1389 TRANSHALF_16(x
[2],x
[6],x
[10],x
[14]);
1391 /* transform x[3], x[11], x[7], x[15] */
1392 TRANS(x
[3],x
[7],x
[11],x
[15],wTB
[3],wTB
[7],delta16
[3],delta16_3
[3]);
/* NOTE(review): fft_4 — 4-point FFT kernel (x[k] = sum x[i]*w^{ik},
 * w = e^{-2*pi/4}) operating in place on x[0..3]. This fragment is
 * extraction-garbled AND incomplete: most of the arithmetic body
 * (original lines ~1404-1441) is missing from the chunk, so only the
 * surviving bytes are preserved here. Restore the full body from
 * upstream liba52 imdct.c; do not attempt to compile this as-is. */
1396 static void fft_4(complex_t
*x
)
1398 /* delta_p = 1 here */
1399 /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
1402 register float yt_r
, yt_i
, yb_r
, yb_i
, u_r
, u_i
, vi_r
, vi_i
;
1405 yb_r
= yt_r
- x
[2].real
;
1409 vi_i
= x
[3].real
- u_r
;
1413 vi_r
= u_i
- x
[3].imag
;
1428 yb_i
= yt_i
- x
[2].imag
;
/* NOTE(review): fft_8 — 8-point FFT kernel (x[k] = sum x[i]*w^{ik},
 * w = e^{-2*pi/8}, delta_p = diag{1, sqrt(i)}) operating in place on
 * x[0..7]. This fragment is extraction-garbled AND incomplete: the bulk
 * of the arithmetic body (original lines ~1450-1551) is missing from the
 * chunk, so only the surviving bytes are preserved here. Restore the
 * full body from upstream liba52 imdct.c; do not compile as-is. */
1444 static void fft_8(complex_t
*x
)
1446 /* delta_p = diag{1, sqrt(i)} here */
1447 /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
1449 register float wT1_r
, wT1_i
, wB1_r
, wB1_i
, wT2_r
, wT2_i
, wB2_r
, wB2_i
;
1469 wT2_i
= x
[0].real
- wT2_i
;
1479 wT2_i
= x
[0].imag
- wT2_i
;
1490 wT2_i
= x
[2].real
- wT2_i
;
1500 wT2_i
= x
[2].imag
- wT2_i
;
1522 wT2_r
= x
[1].real
- wT2_r
;
1530 wT2_i
= x
[1].imag
- wT2_i
;
1538 wB1_r
= wB2_i
- x
[7].imag
;
1539 wB1_i
= wB2_r
+ x
[7].real
;
1542 wB1_r
= wT1_r
+ wT1_i
;
1547 wB2_i
= wB2_r
+ wT1_i
;
1552 wB2_r
= wB2_i
+ wB1_r
;
1559 static void fft_128p(complex_t
*a
)
1561 fft_8(&a
[0]); fft_4(&a
[8]); fft_4(&a
[12]);
1562 fft_asmb16(&a
[0], &a
[8]);
1564 fft_8(&a
[16]), fft_8(&a
[24]);
1565 fft_asmb(4, &a
[0], &a
[16],&delta32
[0], &delta32_3
[0]);
1567 fft_8(&a
[32]); fft_4(&a
[40]); fft_4(&a
[44]);
1568 fft_asmb16(&a
[32], &a
[40]);
1570 fft_8(&a
[48]); fft_4(&a
[56]); fft_4(&a
[60]);
1571 fft_asmb16(&a
[48], &a
[56]);
1573 fft_asmb(8, &a
[0], &a
[32],&delta64
[0], &delta64_3
[0]);
1575 fft_8(&a
[64]); fft_4(&a
[72]); fft_4(&a
[76]);
1576 /* fft_16(&a[64]); */
1577 fft_asmb16(&a
[64], &a
[72]);
1579 fft_8(&a
[80]); fft_8(&a
[88]);
1581 /* fft_32(&a[64]); */
1582 fft_asmb(4, &a
[64], &a
[80],&delta32
[0], &delta32_3
[0]);
1584 fft_8(&a
[96]); fft_4(&a
[104]), fft_4(&a
[108]);
1585 /* fft_16(&a[96]); */
1586 fft_asmb16(&a
[96], &a
[104]);
1588 fft_8(&a
[112]), fft_8(&a
[120]);
1589 /* fft_32(&a[96]); */
1590 fft_asmb(4, &a
[96], &a
[112], &delta32
[0], &delta32_3
[0]);
1592 /* fft_128(&a[0]); */
1593 fft_asmb(16, &a
[0], &a
[64], &delta128
[0], &delta128_3
[0]);