mp3lib/decod386.c

   1 /*
   2  * Modified for use with MPlayer, for details see the CVS changelog at
   3  * http://www.mplayerhq.hu/cgi-bin/cvsweb.cgi/main/
   4  * $Id$
   5  */
   6
   7 /*
   8  * Mpeg Layer-1,2,3 audio decoder
   9  * ------------------------------
  10  * copyright (c) 1995,1996,1997 by Michael Hipp, All rights reserved.
  11  * See also 'README'
  12  *
   13  * slightly optimized for machines without autoincrement/decrement.
   14  * The performance is highly compiler dependent. Maybe
   15  * the decode.c version for 'normal' processors may be faster
   16  * even for Intel processors.
  17  */
  18
  19
  20 #include "config.h"
  21
  22 #if 0
  23  /* old WRITE_SAMPLE */
  24    /* is portable */
  25 #define WRITE_SAMPLE(samples,sum,clip) {                        \
  26   if( (sum) > 32767.0) { *(samples) = 0x7fff; (clip)++; }       \
  27   else if( (sum) < -32768.0) { *(samples) = -0x8000; (clip)++; }\
  28   else { *(samples) = sum;  }                                   \
  29 }
  30 #else
  31  /* new WRITE_SAMPLE */
  32
  33 /*
  34  * should be the same as the "old WRITE_SAMPLE" macro above, but uses
  35  * some tricks to avoid double->int conversions and floating point compares.
  36  *
  37  * Here's how it works:
  38  * ((((65536.0 * 65536.0 * 16)+(65536.0 * 0.5))* 65536.0)) is
  39  * 0x0010000080000000LL in hex.  It computes 0x0010000080000000LL + sum
  40  * as a double IEEE fp value and extracts the low-order 32-bits from the
  41  * IEEE fp representation stored in memory.  The 2^56 bit in the constant
  42  * is intended to force the bits of "sum" into the least significant bits
  43  * of the double mantissa.  After an integer substraction of 0x80000000
  44  * we have the original double value "sum" converted to an 32-bit int value.
  45  *
  46  * (Is that really faster than the clean and simple old version of the macro?)
  47  */
  48
  49 /*
  50  * On a SPARC cpu, we fetch the low-order 32-bit from the second 32-bit
  51  * word of the double fp value stored in memory.  On an x86 cpu, we fetch it
  52  * from the first 32-bit word.
  53  * I'm not sure if the WORDS_BIGENDIAN feature test covers all possible memory
  54  * layouts of double floating point values an all cpu architectures.  If
  55  * it doesn't work for you, just enable the "old WRITE_SAMPLE" macro.
  56  */
  57 #if WORDS_BIGENDIAN
  58 #define MANTISSA_OFFSET 1
  59 #else
  60 #define MANTISSA_OFFSET 0
  61 #endif
  62
  63    /* sizeof(int) == 4 */
  64 #define WRITE_SAMPLE(samples,sum,clip) { \
  65   union { double dtemp; int itemp[2]; } u; int v; \
  66   u.dtemp = ((((65536.0 * 65536.0 * 16)+(65536.0 * 0.5))* 65536.0)) + (sum);\
  67   v = u.itemp[MANTISSA_OFFSET] - 0x80000000; \
  68   if( v > 32767) { *(samples) = 0x7fff; (clip)++; } \
  69   else if( v < -32768) { *(samples) = -0x8000; (clip)++; } \
  70   else { *(samples) = v; } \
  71 }
  72 #endif
  73
  74
  75 /*
  76 #define WRITE_SAMPLE(samples,sum,clip) { \
  77   double dtemp; int v;                    \
  78   dtemp = ((((65536.0 * 65536.0 * 16)+(65536.0 * 0.5))* 65536.0)) + (sum);\
  79   v = ((*(int *)&dtemp) - 0x80000000); \
  80   if( v > 32767) { *(samples) = 0x7fff; (clip)++; } \
  81   else if( v < -32768) { *(samples) = -0x8000; (clip)++; } \
  82   else { *(samples) = v; } \
  83 }
  84 */
  85
  86 static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt);
  87
  88 static int synth_1to1_mono(real *bandPtr,unsigned char *samples,int *pnt)
  89 {
  90   short samples_tmp[64];
  91   short *tmp1 = samples_tmp;
  92   int i,ret;
  93   int pnt1 = 0;
  94
  95   ret = synth_1to1(bandPtr,0,(unsigned char *) samples_tmp,&pnt1);
  96   samples += *pnt;
  97
  98   for(i=0;i<32;i++) {
  99     *( (short *) samples) = *tmp1;
 100     samples += 2;
 101     tmp1 += 2;
 102   }
 103   *pnt += 64;
 104
 105   return ret;
 106 }
 107
 108
 109 static int synth_1to1_mono2stereo(real *bandPtr,unsigned char *samples,int *pnt)
 110 {
 111   int i,ret;
 112
 113   ret = synth_1to1(bandPtr,0,samples,pnt);
 114   samples = samples + *pnt - 128;
 115
 116   for(i=0;i<32;i++) {
 117     ((short *)samples)[1] = ((short *)samples)[0];
 118     samples+=4;
 119   }
 120
 121   return ret;
 122 }
 123
 124 static synth_func_t synth_func;
 125
 126 #if defined(CAN_COMPILE_X86_ASM)
 127 int synth_1to1_MMX( real *bandPtr,int channel,short * samples)
 128 {
 129     static short buffs[2][2][0x110];
 130     static int bo = 1;
 131     synth_1to1_MMX_s(bandPtr, channel, samples, (short *) buffs, &bo);
 132     return 0;
 133 }
 134 #endif
 135
 136 #ifdef HAVE_ALTIVEC
 137 #define dct64_base(a,b,c) if(gCpuCaps.hasAltiVec) dct64_altivec(a,b,c); else dct64(a,b,c)
 138 #else /* HAVE_ALTIVEC */
 139 #define dct64_base(a,b,c) dct64(a,b,c)
 140 #endif /* HAVE_ALTIVEC */
 141
 142 static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt)
 143 {
 144   static real buffs[2][2][0x110];
 145   static const int step = 2;
 146   static int bo = 1;
 147   short *samples = (short *) (out + *pnt);
 148   real *b0,(*buf)[0x110];
 149   int clip = 0;
 150   int bo1;
 151
 152   *pnt += 128;
 153
 154 /* optimized for x86 */
 155 #if defined(CAN_COMPILE_X86_ASM)
 156   if ( synth_func )
 157    {
 158 //    printf("Calling %p, bandPtr=%p channel=%d samples=%p\n",synth_func,bandPtr,channel,samples);
 159     // FIXME: synth_func() may destroy EBP, don't rely on stack contents!!!
 160     return (*synth_func)( bandPtr,channel,samples);
 161    }
 162 #endif
 163   if(!channel) {     /* channel=0 */
 164     bo--;
 165     bo &= 0xf;
 166     buf = buffs[0];
 167   }
 168   else {
 169     samples++;
 170     buf = buffs[1];
 171   }
 172
 173   if(bo & 0x1) {
 174     b0 = buf[0];
 175     bo1 = bo;
 176     dct64_base(buf[1]+((bo+1)&0xf),buf[0]+bo,bandPtr);
 177   }
 178   else {
 179     b0 = buf[1];
 180     bo1 = bo+1;
 181     dct64_base(buf[0]+bo,buf[1]+bo+1,bandPtr);
 182   }
 183
 184   {
 185     register int j;
 186     real *window = mp3lib_decwin + 16 - bo1;
 187
 188     for (j=16;j;j--,b0+=0x10,window+=0x20,samples+=step)
 189     {
 190       real sum;
 191       sum  = window[0x0] * b0[0x0];
 192       sum -= window[0x1] * b0[0x1];
 193       sum += window[0x2] * b0[0x2];
 194       sum -= window[0x3] * b0[0x3];
 195       sum += window[0x4] * b0[0x4];
 196       sum -= window[0x5] * b0[0x5];
 197       sum += window[0x6] * b0[0x6];
 198       sum -= window[0x7] * b0[0x7];
 199       sum += window[0x8] * b0[0x8];
 200       sum -= window[0x9] * b0[0x9];
 201       sum += window[0xA] * b0[0xA];
 202       sum -= window[0xB] * b0[0xB];
 203       sum += window[0xC] * b0[0xC];
 204       sum -= window[0xD] * b0[0xD];
 205       sum += window[0xE] * b0[0xE];
 206       sum -= window[0xF] * b0[0xF];
 207
 208       WRITE_SAMPLE(samples,sum,clip);
 209     }
 210
 211     {
 212       real sum;
 213       sum  = window[0x0] * b0[0x0];
 214       sum += window[0x2] * b0[0x2];
 215       sum += window[0x4] * b0[0x4];
 216       sum += window[0x6] * b0[0x6];
 217       sum += window[0x8] * b0[0x8];
 218       sum += window[0xA] * b0[0xA];
 219       sum += window[0xC] * b0[0xC];
 220       sum += window[0xE] * b0[0xE];
 221       WRITE_SAMPLE(samples,sum,clip);
 222       b0-=0x10,window-=0x20,samples+=step;
 223     }
 224     window += bo1<<1;
 225
 226     for (j=15;j;j--,b0-=0x10,window-=0x20,samples+=step)
 227     {
 228       real sum;
 229       sum = -window[-0x1] * b0[0x0];
 230       sum -= window[-0x2] * b0[0x1];
 231       sum -= window[-0x3] * b0[0x2];
 232       sum -= window[-0x4] * b0[0x3];
 233       sum -= window[-0x5] * b0[0x4];
 234       sum -= window[-0x6] * b0[0x5];
 235       sum -= window[-0x7] * b0[0x6];
 236       sum -= window[-0x8] * b0[0x7];
 237       sum -= window[-0x9] * b0[0x8];
 238       sum -= window[-0xA] * b0[0x9];
 239       sum -= window[-0xB] * b0[0xA];
 240       sum -= window[-0xC] * b0[0xB];
 241       sum -= window[-0xD] * b0[0xC];
 242       sum -= window[-0xE] * b0[0xD];
 243       sum -= window[-0xF] * b0[0xE];
 244       sum -= window[-0x0] * b0[0xF];
 245
 246       WRITE_SAMPLE(samples,sum,clip);
 247     }
 248   }
 249
 250   return clip;
 251
 252 }
 253
 254 #ifdef USE_FAKE_MONO
 255 static int synth_1to1_l(real *bandPtr,int channel,unsigned char *out,int *pnt)
 256 {
 257   int i,ret;
 258
 259   ret = synth_1to1(bandPtr,channel,out,pnt);
 260   out = out + *pnt - 128;
 261
 262   for(i=0;i<32;i++) {
 263     ((short *)out)[1] = ((short *)out)[0];
 264     out+=4;
 265   }
 266
 267   return ret;
 268 }
 269
 270 static int synth_1to1_r(real *bandPtr,int channel,unsigned char *out,int *pnt)
 271 {
 272   int i,ret;
 273
 274   ret = synth_1to1(bandPtr,channel,out,pnt);
 275   out = out + *pnt - 128;
 276
 277   for(i=0;i<32;i++) {
 278     ((short *)out)[0] = ((short *)out)[1];
 279     out+=4;
 280   }
 281
 282   return ret;
 283 }
 284 #endif