2 * Mpeg Layer-1,2,3 audio decoder
3 * ------------------------------
4 * copyright (c) 1995,1996,1997 by Michael Hipp, All rights reserved.
7 * slightly optimized for machines without autoincrement/decrement.
8 * The performance is highly compiler dependent. Maybe
9 * the decode.c version for 'normal' processors may be faster
10 * even for Intel processors.
14 #include "../config.h"
17 /* old WRITE_SAMPLE */
/*
 * Saturating store of `sum` into *samples.
 * Values above 32767.0 are written as 0x7fff and values below -32768.0 as
 * -0x8000; `clip` is incremented each time that happens.  In-range values
 * are assigned directly and converted implicitly to the type of *samples.
 * NOTE(review): `sum` is expanded more than once and is not parenthesized
 * in the final assignment, so pass a plain variable rather than an
 * expression with side effects.
 */
19 #define WRITE_SAMPLE(samples,sum,clip) { \
20 if( (sum) > 32767.0) { *(samples) = 0x7fff; (clip)++; } \
21 else if( (sum) < -32768.0) { *(samples) = -0x8000; (clip)++; }\
22 else { *(samples) = sum; } \
25 /* new WRITE_SAMPLE */
28 * should be the same as the "old WRITE_SAMPLE" macro above, but uses
29 * some tricks to avoid double->int conversions and floating point compares.
31 * Here's how it works:
32 * ((((65536.0 * 65536.0 * 16)+(65536.0 * 0.5))* 65536.0)) is
33 * 0x0010000080000000LL in hex. It computes 0x0010000080000000LL + sum
34 * as a double IEEE fp value and extracts the low-order 32-bits from the
35 * IEEE fp representation stored in memory. The 2^52 bit in the constant
36 * is intended to force the bits of "sum" into the least significant bits
37 * of the double mantissa. After an integer subtraction of 0x80000000
38 * we have the original double value "sum" converted to a 32-bit int value.
40 * (Is that really faster than the clean and simple old version of the macro?)
44 * On a SPARC cpu, we fetch the low-order 32-bit from the second 32-bit
45 * word of the double fp value stored in memory. On an x86 cpu, we fetch it
46 * from the first 32-bit word.
47 * I'm not sure if the WORDS_BIGENDIAN feature test covers all possible memory
48 * layouts of double floating point values on all cpu architectures. If
49 * it doesn't work for you, just enable the "old WRITE_SAMPLE" macro.
/*
 * Index of the int, within the int[2] view of a double, that WRITE_SAMPLE
 * reads (u.itemp[MANTISSA_OFFSET]).  Two alternative values are defined
 * here; the preprocessor conditional that selects between them is not
 * visible in this listing -- presumably WORDS_BIGENDIAN (1 when the
 * low-order word is stored second, 0 when it is stored first); TODO confirm.
 */
52 #define MANTISSA_OFFSET 1
54 #define MANTISSA_OFFSET 0
57 /* sizeof(int) == 4 */
/*
 * Saturating store of `sum` into *samples without a double->int cast.
 * Steps performed by the expansion:
 *   1. add `sum` to the bias constant (2^52 + 2^31) as a double;
 *   2. read one int of that double through a union, the word being chosen
 *      by MANTISSA_OFFSET;
 *   3. subtract 0x80000000 to remove the 2^31 part of the bias;
 *   4. clamp to [-32768, 32767], incrementing `clip` whenever clamping
 *      occurs, and store the result in *samples.
 * NOTE(review): with a 32-bit int, 0x80000000 has an unsigned type, so the
 * subtraction is done in unsigned arithmetic and converted back to int on
 * assignment to `v`; this relies on 32-bit two's-complement int.
 * NOTE(review): the expansion declares locals `u` and `v`, so arguments
 * must not use those names.
 */
58 #define WRITE_SAMPLE(samples,sum,clip) { \
59 union { double dtemp; int itemp[2]; } u; int v; \
60 u.dtemp = ((((65536.0 * 65536.0 * 16)+(65536.0 * 0.5))* 65536.0)) + (sum);\
61 v = u.itemp[MANTISSA_OFFSET] - 0x80000000; \
62 if( v > 32767) { *(samples) = 0x7fff; (clip)++; } \
63 else if( v < -32768) { *(samples) = -0x8000; (clip)++; } \
64 else { *(samples) = v; } \
// Variant of WRITE_SAMPLE that reads the biased double through a pointer
// cast, *(int *)&dtemp, instead of a union.  It always takes the int at the
// lowest address of the double and does not use MANTISSA_OFFSET.
// The bias, the 0x80000000 subtraction and the clamping to [-32768, 32767]
// with `clip` counting are the same as in the union-based variant.
// NOTE(review): reading a double object through an int lvalue conflicts
// with C's effective-type (strict aliasing) rules.
70 #define WRITE_SAMPLE(samples,sum,clip) { \
71 double dtemp; int v; \
72 dtemp = ((((65536.0 * 65536.0 * 16)+(65536.0 * 0.5))* 65536.0)) + (sum);\
73 v = ((*(int *)&dtemp) - 0x80000000); \
74 if( v > 32767) { *(samples) = 0x7fff; (clip)++; } \
75 else if( v < -32768) { *(samples) = -0x8000; (clip)++; } \
76 else { *(samples) = v; } \
/*
 * synth_1to1_mono: runs synth_1to1() for channel 0 into the local scratch
 * buffer samples_tmp (64 shorts), using a separate position counter (pnt1)
 * rather than the caller's *pnt, then copies short values from the scratch
 * buffer (through tmp1) into `samples`.
 *
 * bandPtr  input passed through unchanged to synth_1to1()
 * samples  destination byte buffer, written as shorts
 * pnt      caller's position in `samples`
 *
 * NOTE(review): the loop around the copy, the pointer strides, the update
 * of *pnt and the return statement are not visible in this listing.
 * Presumably every other short of the scratch buffer is copied and `ret`
 * (the synth_1to1() result) is returned -- TODO confirm against the full
 * source.
 */
80 static int synth_1to1_mono(real
*bandPtr
,unsigned char *samples
,int *pnt
)
82 short samples_tmp
[64];
83 short *tmp1
= samples_tmp
;
87 ret
= synth_1to1(bandPtr
,0,(unsigned char *) samples_tmp
,&pnt1
);
91 *( (short *) samples
) = *tmp1
;
/*
 * synth_1to1_mono2stereo: runs synth_1to1() for channel 0 directly into
 * the caller's buffer, then duplicates short [0] into short [1] of the
 * output.
 *
 * After the call, `samples` is repositioned to samples + *pnt - 128, i.e.
 * 128 bytes before the position synth_1to1() left in *pnt -- presumably the
 * start of the data just written (TODO confirm that synth_1to1() advances
 * *pnt by 128).
 *
 * NOTE(review): the loop that walks the sample pairs and the return
 * statement are not visible in this listing.
 */
101 static int synth_1to1_mono2stereo(real
*bandPtr
,unsigned char *samples
,int *pnt
)
105 ret
= synth_1to1(bandPtr
,0,samples
,pnt
);
106 samples
= samples
+ *pnt
- 128;
109 ((short *)samples
)[1] = ((short *)samples
)[0];
/*
 * synth_1to1_l: runs synth_1to1() for the given channel into `out`, then
 * overwrites short [1] of the output with short [0].
 *
 * After the call, `out` is repositioned to out + *pnt - 128, i.e. 128 bytes
 * before the position left in *pnt -- presumably the start of the data just
 * written (TODO confirm).
 *
 * NOTE(review): the loop over the sample pairs and the return statement
 * are not visible in this listing.
 */
118 static int synth_1to1_l(real
*bandPtr
,int channel
,unsigned char *out
,int *pnt
)
122 ret
= synth_1to1(bandPtr
,channel
,out
,pnt
);
123 out
= out
+ *pnt
- 128;
126 ((short *)out
)[1] = ((short *)out
)[0];
/*
 * synth_1to1_r: counterpart of synth_1to1_l with the copy direction
 * reversed.  Runs synth_1to1() for the given channel into `out`, then
 * overwrites short [0] of the output with short [1].
 *
 * After the call, `out` is repositioned to out + *pnt - 128, i.e. 128 bytes
 * before the position left in *pnt -- presumably the start of the data just
 * written (TODO confirm).
 *
 * NOTE(review): the loop over the sample pairs and the return statement
 * are not visible in this listing.
 */
134 static int synth_1to1_r(real
*bandPtr
,int channel
,unsigned char *out
,int *pnt
)
138 ret
= synth_1to1(bandPtr
,channel
,out
,pnt
);
139 out
= out
+ *pnt
- 128;
142 ((short *)out
)[0] = ((short *)out
)[1];
/*
 * File-scope function pointer with external linkage.  synth_1to1() calls
 * it as (*synth_func)(bandPtr, channel, samples) in its
 * CAN_COMPILE_X86_ASM section.  The type synth_func_t and the code that
 * assigns this pointer are not visible in this listing.
 */
150 synth_func_t synth_func
;
152 #if defined(CAN_COMPILE_X86_ASM)
/*
 * synth_1to1_MMX: wrapper, compiled only when CAN_COMPILE_X86_ASM is
 * defined, that forwards to synth_1to1_MMX_s().  It passes a function-local
 * static short buffer (buffs[2][2][0x110], which keeps its contents between
 * calls) and the address of `bo`.
 *
 * NOTE(review): the declaration of `bo`, the return statement and the
 * matching #endif are not visible in this listing.
 */
153 int synth_1to1_MMX( real
*bandPtr
,int channel
,short * samples
)
155 static short buffs
[2][2][0x110];
157 synth_1to1_MMX_s(bandPtr
, channel
, samples
, (short *) buffs
, &bo
);
/*
 * dct64_base(a,b,c): selects the DCT routine.  In the AltiVec build it
 * tests gCpuCaps.hasAltiVec at run time and calls dct64_altivec() or
 * dct64(); otherwise it is a plain alias for dct64().
 * NOTE(review): the AltiVec form expands to a bare if/else statement, so
 * use it only as a complete statement, never as the unbraced body of
 * another if.  The opening conditional (presumably #ifdef HAVE_ALTIVEC) is
 * not visible in this listing.
 */
163 #define dct64_base(a,b,c) if(gCpuCaps.hasAltiVec) dct64_altivec(a,b,c); else dct64(a,b,c)
164 #else /* HAVE_ALTIVEC */
165 #define dct64_base(a,b,c) dct64(a,b,c)
166 #endif /* HAVE_ALTIVEC */
168 static int synth_1to1(real
*bandPtr
,int channel
,unsigned char *out
,int *pnt
)
170 static real buffs
[2][2][0x110];
171 static const int step
= 2;
173 short *samples
= (short *) (out
+ *pnt
);
174 real
*b0
,(*buf
)[0x110];
180 /* optimized for x86 */
181 #if defined(CAN_COMPILE_X86_ASM)
184 // printf("Calling %p, bandPtr=%p channel=%d samples=%p\n",synth_func,bandPtr,channel,samples);
185 // FIXME: synth_func() may destroy EBP, don't rely on stack contents!!!
186 return (*synth_func
)( bandPtr
,channel
,samples
);
189 if(!channel
) { /* channel=0 */
202 dct64_base(buf
[1]+((bo
+1)&0xf),buf
[0]+bo
,bandPtr
);
207 dct64_base(buf
[0]+bo
,buf
[1]+bo
+1,bandPtr
);
212 real
*window
= mp3lib_decwin
+ 16 - bo1
;
214 for (j
=16;j
;j
--,b0
+=0x10,window
+=0x20,samples
+=step
)
217 sum
= window
[0x0] * b0
[0x0];
218 sum
-= window
[0x1] * b0
[0x1];
219 sum
+= window
[0x2] * b0
[0x2];
220 sum
-= window
[0x3] * b0
[0x3];
221 sum
+= window
[0x4] * b0
[0x4];
222 sum
-= window
[0x5] * b0
[0x5];
223 sum
+= window
[0x6] * b0
[0x6];
224 sum
-= window
[0x7] * b0
[0x7];
225 sum
+= window
[0x8] * b0
[0x8];
226 sum
-= window
[0x9] * b0
[0x9];
227 sum
+= window
[0xA] * b0
[0xA];
228 sum
-= window
[0xB] * b0
[0xB];
229 sum
+= window
[0xC] * b0
[0xC];
230 sum
-= window
[0xD] * b0
[0xD];
231 sum
+= window
[0xE] * b0
[0xE];
232 sum
-= window
[0xF] * b0
[0xF];
234 WRITE_SAMPLE(samples
,sum
,clip
);
239 sum
= window
[0x0] * b0
[0x0];
240 sum
+= window
[0x2] * b0
[0x2];
241 sum
+= window
[0x4] * b0
[0x4];
242 sum
+= window
[0x6] * b0
[0x6];
243 sum
+= window
[0x8] * b0
[0x8];
244 sum
+= window
[0xA] * b0
[0xA];
245 sum
+= window
[0xC] * b0
[0xC];
246 sum
+= window
[0xE] * b0
[0xE];
247 WRITE_SAMPLE(samples
,sum
,clip
);
248 b0
-=0x10,window
-=0x20,samples
+=step
;
252 for (j
=15;j
;j
--,b0
-=0x10,window
-=0x20,samples
+=step
)
255 sum
= -window
[-0x1] * b0
[0x0];
256 sum
-= window
[-0x2] * b0
[0x1];
257 sum
-= window
[-0x3] * b0
[0x2];
258 sum
-= window
[-0x4] * b0
[0x3];
259 sum
-= window
[-0x5] * b0
[0x4];
260 sum
-= window
[-0x6] * b0
[0x5];
261 sum
-= window
[-0x7] * b0
[0x6];
262 sum
-= window
[-0x8] * b0
[0x7];
263 sum
-= window
[-0x9] * b0
[0x8];
264 sum
-= window
[-0xA] * b0
[0x9];
265 sum
-= window
[-0xB] * b0
[0xA];
266 sum
-= window
[-0xC] * b0
[0xB];
267 sum
-= window
[-0xD] * b0
[0xC];
268 sum
-= window
[-0xE] * b0
[0xD];
269 sum
-= window
[-0xF] * b0
[0xE];
270 sum
-= window
[-0x0] * b0
[0xF];
272 WRITE_SAMPLE(samples
,sum
,clip
);