Alc/mixer_sse.c

   1 #include "config.h"
   2
   3 #ifdef IN_IDE_PARSER
   4 /* KDevelop's parser won't recognize these defines that get added by the -msse
   5  * switch used to compile this source. Without them, xmmintrin.h fails to
   6  * declare anything. */
   7 #define __MMX__
   8 #define __SSE__
   9 #endif
  10 #include <xmmintrin.h>
  11
  12 #include "AL/al.h"
  13 #include "AL/alc.h"
  14 #include "alMain.h"
  15 #include "alu.h"
  16
  17 #include "alSource.h"
  18 #include "alAuxEffectSlot.h"
  19 #include "mixer_defs.h"
  20
  21
  22 static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
  23                                const ALuint IrSize,
  24                                ALfloat (*restrict Coeffs)[2],
  25                                ALfloat left, ALfloat right)
  26 {
  27     const __m128 lrlr = _mm_setr_ps(left, right, left, right);
  28     __m128 vals = _mm_setzero_ps();
  29     __m128 coeffs;
  30     ALuint i;
  31
  32     if((Offset&1))
  33     {
  34         const ALuint o0 = Offset&HRIR_MASK;
  35         const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
  36         __m128 imp0, imp1;
  37
  38         coeffs = _mm_load_ps(&Coeffs[0][0]);
  39         vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
  40         imp0 = _mm_mul_ps(lrlr, coeffs);
  41         vals = _mm_add_ps(imp0, vals);
  42         _mm_storel_pi((__m64*)&Values[o0][0], vals);
  43         for(i = 1;i < IrSize-1;i += 2)
  44         {
  45             const ALuint o2 = (Offset+i)&HRIR_MASK;
  46
  47             coeffs = _mm_load_ps(&Coeffs[i+1][0]);
  48             vals = _mm_load_ps(&Values[o2][0]);
  49             imp1 = _mm_mul_ps(lrlr, coeffs);
  50             imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
  51             vals = _mm_add_ps(imp0, vals);
  52             _mm_store_ps(&Values[o2][0], vals);
  53             imp0 = imp1;
  54         }
  55         vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
  56         imp0 = _mm_movehl_ps(imp0, imp0);
  57         vals = _mm_add_ps(imp0, vals);
  58         _mm_storel_pi((__m64*)&Values[o1][0], vals);
  59     }
  60     else
  61     {
  62         for(i = 0;i < IrSize;i += 2)
  63         {
  64             const ALuint o = (Offset + i)&HRIR_MASK;
  65
  66             coeffs = _mm_load_ps(&Coeffs[i][0]);
  67             vals = _mm_load_ps(&Values[o][0]);
  68             vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
  69             _mm_store_ps(&Values[o][0], vals);
  70         }
  71     }
  72 }
  73
  74 #define SUFFIX SSE
  75 #include "mixer_inc.c"
  76 #undef SUFFIX
  77
  78
  79 void Mix_SSE(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
  80              MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize)
  81 {
  82     ALfloat gain, step;
  83     __m128 gain4, step4;
  84     ALuint c;
  85
  86     for(c = 0;c < OutChans;c++)
  87     {
  88         ALuint pos = 0;
  89         gain = Gains[c].Current;
  90         step = Gains[c].Step;
  91         if(step != 1.0f && Counter > 0)
  92         {
  93             /* Mix with applying gain steps in aligned multiples of 4. */
  94             if(BufferSize-pos > 3 && Counter-pos > 3)
  95             {
  96                 gain4 = _mm_setr_ps(
  97                     gain,
  98                     gain * step,
  99                     gain * step * step,
 100                     gain * step * step * step
 101                 );
 102                 step4 = _mm_set1_ps(step * step * step * step);
 103                 do {
 104                     const __m128 val4 = _mm_load_ps(&data[pos]);
 105                     __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
 106                     dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
 107                     gain4 = _mm_mul_ps(gain4, step4);
 108                     _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
 109                     pos += 4;
 110                 } while(BufferSize-pos > 3 && Counter-pos > 3);
 111                 gain = _mm_cvtss_f32(gain4);
 112             }
 113             /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
 114             for(;pos < BufferSize && pos < Counter;pos++)
 115             {
 116                 OutBuffer[c][OutPos+pos] += data[pos]*gain;
 117                 gain *= step;
 118             }
 119             if(pos == Counter)
 120                 gain = Gains[c].Target;
 121             Gains[c].Current = gain;
 122             /* Mix until pos is aligned with 4 or the mix is done. */
 123             for(;pos < BufferSize && (pos&3) != 0;pos++)
 124                 OutBuffer[c][OutPos+pos] += data[pos]*gain;
 125         }
 126
 127         if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
 128             continue;
 129         gain4 = _mm_set1_ps(gain);
 130         for(;BufferSize-pos > 3;pos += 4)
 131         {
 132             const __m128 val4 = _mm_load_ps(&data[pos]);
 133             __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
 134             dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
 135             _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
 136         }
 137         for(;pos < BufferSize;pos++)
 138             OutBuffer[c][OutPos+pos] += data[pos]*gain;
 139     }
 140 }