mp3lib/decod386.c

   1 /*
   2  * Mpeg Layer-1,2,3 audio decoder
   3  * ------------------------------
   4  * copyright (c) 1995,1996,1997 by Michael Hipp, All rights reserved.
   5  * See also 'README'
   6  *
 * slightly optimized for machines without autoincrement/decrement.
 * The performance is highly compiler dependent. Maybe
 * the decode.c version for 'normal' processors may be faster
 * even for Intel processors.
  11  */
  12
  13
  14 #include "../config.h"
  15
  16 #if 0
  17  /* old WRITE_SAMPLE */
  18    /* is portable */
  19 #define WRITE_SAMPLE(samples,sum,clip) {                        \
  20   if( (sum) > 32767.0) { *(samples) = 0x7fff; (clip)++; }       \
  21   else if( (sum) < -32768.0) { *(samples) = -0x8000; (clip)++; }\
  22   else { *(samples) = sum;  }                                   \
  23 }
  24 #else
  25  /* new WRITE_SAMPLE */
  26
  27 /*
  28  * should be the same as the "old WRITE_SAMPLE" macro above, but uses
  29  * some tricks to avoid double->int conversions and floating point compares.
  30  *
  31  * Here's how it works:
 * ((((65536.0 * 65536.0 * 16)+(65536.0 * 0.5))* 65536.0)) is
 * 0x0010000080000000LL in hex.  It computes 0x0010000080000000LL + sum
 * as a double IEEE fp value and extracts the low-order 32-bits from the
 * IEEE fp representation stored in memory.  The 2^52 bit in the constant
 * is intended to force the bits of "sum" into the least significant bits
 * of the double mantissa.  After an integer subtraction of 0x80000000
 * we have the original double value "sum" converted to a 32-bit int value.
  39  *
  40  * (Is that really faster than the clean and simple old version of the macro?)
  41  */
  42
  43 /*
  44  * On a SPARC cpu, we fetch the low-order 32-bit from the second 32-bit
  45  * word of the double fp value stored in memory.  On an x86 cpu, we fetch it
  46  * from the first 32-bit word.
 * I'm not sure if the WORDS_BIGENDIAN feature test covers all possible memory
 * layouts of double floating point values on all cpu architectures.  If
 * it doesn't work for you, just enable the "old WRITE_SAMPLE" macro.
  50  */
  51 #if WORDS_BIGENDIAN
  52 #define MANTISSA_OFFSET 1
  53 #else
  54 #define MANTISSA_OFFSET 0
  55 #endif
  56
  57    /* sizeof(int) == 4 */
  58 #define WRITE_SAMPLE(samples,sum,clip) { \
  59   union { double dtemp; int itemp[2]; } u; int v; \
  60   u.dtemp = ((((65536.0 * 65536.0 * 16)+(65536.0 * 0.5))* 65536.0)) + (sum);\
  61   v = u.itemp[MANTISSA_OFFSET] - 0x80000000; \
  62   if( v > 32767) { *(samples) = 0x7fff; (clip)++; } \
  63   else if( v < -32768) { *(samples) = -0x8000; (clip)++; } \
  64   else { *(samples) = v; } \
  65 }
  66 #endif
  67
  68
  69 /*
  70 #define WRITE_SAMPLE(samples,sum,clip) { \
  71   double dtemp; int v;                    \
  72   dtemp = ((((65536.0 * 65536.0 * 16)+(65536.0 * 0.5))* 65536.0)) + (sum);\
  73   v = ((*(int *)&dtemp) - 0x80000000); \
  74   if( v > 32767) { *(samples) = 0x7fff; (clip)++; } \
  75   else if( v < -32768) { *(samples) = -0x8000; (clip)++; } \
  76   else { *(samples) = v; } \
  77 }
  78 */
  79
  80 static int synth_1to1_mono(real *bandPtr,unsigned char *samples,int *pnt)
  81 {
  82   short samples_tmp[64];
  83   short *tmp1 = samples_tmp;
  84   int i,ret;
  85   int pnt1 = 0;
  86
  87   ret = synth_1to1(bandPtr,0,(unsigned char *) samples_tmp,&pnt1);
  88   samples += *pnt;
  89
  90   for(i=0;i<32;i++) {
  91     *( (short *) samples) = *tmp1;
  92     samples += 2;
  93     tmp1 += 2;
  94   }
  95   *pnt += 64;
  96
  97   return ret;
  98 }
  99
 100
 101 static int synth_1to1_mono2stereo(real *bandPtr,unsigned char *samples,int *pnt)
 102 {
 103   int i,ret;
 104
 105   ret = synth_1to1(bandPtr,0,samples,pnt);
 106   samples = samples + *pnt - 128;
 107
 108   for(i=0;i<32;i++) {
 109     ((short *)samples)[1] = ((short *)samples)[0];
 110     samples+=4;
 111   }
 112
 113   return ret;
 114 }
 115
 116
 117 #ifdef USE_FAKE_MONO
 118 static int synth_1to1_l(real *bandPtr,int channel,unsigned char *out,int *pnt)
 119 {
 120   int i,ret;
 121
 122   ret = synth_1to1(bandPtr,channel,out,pnt);
 123   out = out + *pnt - 128;
 124
 125   for(i=0;i<32;i++) {
 126     ((short *)out)[1] = ((short *)out)[0];
 127     out+=4;
 128   }
 129
 130   return ret;
 131 }
 132
 133
 134 static int synth_1to1_r(real *bandPtr,int channel,unsigned char *out,int *pnt)
 135 {
 136   int i,ret;
 137
 138   ret = synth_1to1(bandPtr,channel,out,pnt);
 139   out = out + *pnt - 128;
 140
 141   for(i=0;i<32;i++) {
 142     ((short *)out)[0] = ((short *)out)[1];
 143     out+=4;
 144   }
 145
 146   return ret;
 147 }
 148 #endif
 149
 150 synth_func_t synth_func;
 151
 152 #if defined(CAN_COMPILE_X86_ASM)
 153 int synth_1to1_MMX( real *bandPtr,int channel,short * samples)
 154 {
 155     static short buffs[2][2][0x110];
 156     static int bo = 1;
 157     synth_1to1_MMX_s(bandPtr, channel, samples, (short *) buffs, &bo);
 158     return 0;
 159 }
 160 #endif
 161
 162 #ifdef HAVE_ALTIVEC
 163 #define dct64_base(a,b,c) if(gCpuCaps.hasAltiVec) dct64_altivec(a,b,c); else dct64(a,b,c)
 164 #else /* HAVE_ALTIVEC */
 165 #define dct64_base(a,b,c) dct64(a,b,c)
 166 #endif /* HAVE_ALTIVEC */
 167
 168 static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt)
 169 {
 170   static real buffs[2][2][0x110];
 171   static const int step = 2;
 172   static int bo = 1;
 173   short *samples = (short *) (out + *pnt);
 174   real *b0,(*buf)[0x110];
 175   int clip = 0;
 176   int bo1;
 177
 178   *pnt += 128;
 179
 180 /* optimized for x86 */
 181 #if defined(CAN_COMPILE_X86_ASM)
 182   if ( synth_func )
 183    {
 184 //    printf("Calling %p, bandPtr=%p channel=%d samples=%p\n",synth_func,bandPtr,channel,samples);
 185     // FIXME: synth_func() may destroy EBP, don't rely on stack contents!!!
 186     return (*synth_func)( bandPtr,channel,samples);
 187    }
 188 #endif
 189   if(!channel) {     /* channel=0 */
 190     bo--;
 191     bo &= 0xf;
 192     buf = buffs[0];
 193   }
 194   else {
 195     samples++;
 196     buf = buffs[1];
 197   }
 198
 199   if(bo & 0x1) {
 200     b0 = buf[0];
 201     bo1 = bo;
 202     dct64_base(buf[1]+((bo+1)&0xf),buf[0]+bo,bandPtr);
 203   }
 204   else {
 205     b0 = buf[1];
 206     bo1 = bo+1;
 207     dct64_base(buf[0]+bo,buf[1]+bo+1,bandPtr);
 208   }
 209
 210   {
 211     register int j;
 212     real *window = mp3lib_decwin + 16 - bo1;
 213
 214     for (j=16;j;j--,b0+=0x10,window+=0x20,samples+=step)
 215     {
 216       real sum;
 217       sum  = window[0x0] * b0[0x0];
 218       sum -= window[0x1] * b0[0x1];
 219       sum += window[0x2] * b0[0x2];
 220       sum -= window[0x3] * b0[0x3];
 221       sum += window[0x4] * b0[0x4];
 222       sum -= window[0x5] * b0[0x5];
 223       sum += window[0x6] * b0[0x6];
 224       sum -= window[0x7] * b0[0x7];
 225       sum += window[0x8] * b0[0x8];
 226       sum -= window[0x9] * b0[0x9];
 227       sum += window[0xA] * b0[0xA];
 228       sum -= window[0xB] * b0[0xB];
 229       sum += window[0xC] * b0[0xC];
 230       sum -= window[0xD] * b0[0xD];
 231       sum += window[0xE] * b0[0xE];
 232       sum -= window[0xF] * b0[0xF];
 233
 234       WRITE_SAMPLE(samples,sum,clip);
 235     }
 236
 237     {
 238       real sum;
 239       sum  = window[0x0] * b0[0x0];
 240       sum += window[0x2] * b0[0x2];
 241       sum += window[0x4] * b0[0x4];
 242       sum += window[0x6] * b0[0x6];
 243       sum += window[0x8] * b0[0x8];
 244       sum += window[0xA] * b0[0xA];
 245       sum += window[0xC] * b0[0xC];
 246       sum += window[0xE] * b0[0xE];
 247       WRITE_SAMPLE(samples,sum,clip);
 248       b0-=0x10,window-=0x20,samples+=step;
 249     }
 250     window += bo1<<1;
 251
 252     for (j=15;j;j--,b0-=0x10,window-=0x20,samples+=step)
 253     {
 254       real sum;
 255       sum = -window[-0x1] * b0[0x0];
 256       sum -= window[-0x2] * b0[0x1];
 257       sum -= window[-0x3] * b0[0x2];
 258       sum -= window[-0x4] * b0[0x3];
 259       sum -= window[-0x5] * b0[0x4];
 260       sum -= window[-0x6] * b0[0x5];
 261       sum -= window[-0x7] * b0[0x6];
 262       sum -= window[-0x8] * b0[0x7];
 263       sum -= window[-0x9] * b0[0x8];
 264       sum -= window[-0xA] * b0[0x9];
 265       sum -= window[-0xB] * b0[0xA];
 266       sum -= window[-0xC] * b0[0xB];
 267       sum -= window[-0xD] * b0[0xC];
 268       sum -= window[-0xE] * b0[0xD];
 269       sum -= window[-0xF] * b0[0xE];
 270       sum -= window[-0x0] * b0[0xF];
 271
 272       WRITE_SAMPLE(samples,sum,clip);
 273     }
 274   }
 275
 276   return clip;
 277
 278 }
 279