11 #include "alAuxEffectSlot.h"
12 #include "mixer_defs.h"
15 const ALfloat
*Resample_bsinc32_SSE(const BsincState
*state
, const ALfloat
*src
, ALuint frac
,
16 ALuint increment
, ALfloat
*restrict dst
, ALuint dstlen
)
18 const __m128 sf4
= _mm_set1_ps(state
->sf
);
19 const ALuint m
= state
->m
;
20 const ALint l
= state
->l
;
21 const ALfloat
*fil
, *scd
, *phd
, *spd
;
27 for(i
= 0;i
< dstlen
;i
++)
29 // Calculate the phase index and factor.
30 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
31 pi
= frac
>> FRAC_PHASE_BITDIFF
;
32 pf
= (frac
& ((1<<FRAC_PHASE_BITDIFF
)-1)) * (1.0f
/(1<<FRAC_PHASE_BITDIFF
));
33 #undef FRAC_PHASE_BITDIFF
35 fil
= state
->coeffs
[pi
].filter
;
36 scd
= state
->coeffs
[pi
].scDelta
;
37 phd
= state
->coeffs
[pi
].phDelta
;
38 spd
= state
->coeffs
[pi
].spDelta
;
40 // Apply the scale and phase interpolated filter.
41 r4
= _mm_setzero_ps();
43 const __m128 pf4
= _mm_set1_ps(pf
);
44 for(j_f
= 0,j_s
= l
;j_f
< m
;j_f
+=4,j_s
+=4)
46 const __m128 f4
= _mm_add_ps(
48 _mm_load_ps(&fil
[j_f
]),
49 _mm_mul_ps(sf4
, _mm_load_ps(&scd
[j_f
]))
54 _mm_load_ps(&phd
[j_f
]),
55 _mm_mul_ps(sf4
, _mm_load_ps(&spd
[j_f
]))
59 r4
= _mm_add_ps(r4
, _mm_mul_ps(f4
, _mm_loadu_ps(&src
[j_s
])));
62 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
63 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
64 dst
[i
] = _mm_cvtss_f32(r4
);
67 src
+= frac
>>FRACTIONBITS
;
74 static inline void ApplyCoeffsStep(ALuint Offset
, ALfloat (*restrict Values
)[2],
76 ALfloat (*restrict Coeffs
)[2],
77 const ALfloat (*restrict CoeffStep
)[2],
78 ALfloat left
, ALfloat right
)
80 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
81 __m128 coeffs
, deltas
, imp0
, imp1
;
82 __m128 vals
= _mm_setzero_ps();
87 const ALuint o0
= Offset
&HRIR_MASK
;
88 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
90 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
91 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
92 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
93 imp0
= _mm_mul_ps(lrlr
, coeffs
);
94 coeffs
= _mm_add_ps(coeffs
, deltas
);
95 vals
= _mm_add_ps(imp0
, vals
);
96 _mm_store_ps(&Coeffs
[0][0], coeffs
);
97 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
98 for(i
= 1;i
< IrSize
-1;i
+= 2)
100 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
102 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
103 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
104 vals
= _mm_load_ps(&Values
[o2
][0]);
105 imp1
= _mm_mul_ps(lrlr
, coeffs
);
106 coeffs
= _mm_add_ps(coeffs
, deltas
);
107 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
108 vals
= _mm_add_ps(imp0
, vals
);
109 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
110 _mm_store_ps(&Values
[o2
][0], vals
);
113 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
114 imp0
= _mm_movehl_ps(imp0
, imp0
);
115 vals
= _mm_add_ps(imp0
, vals
);
116 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
120 for(i
= 0;i
< IrSize
;i
+= 2)
122 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
124 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
125 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
126 vals
= _mm_load_ps(&Values
[o
][0]);
127 imp0
= _mm_mul_ps(lrlr
, coeffs
);
128 coeffs
= _mm_add_ps(coeffs
, deltas
);
129 vals
= _mm_add_ps(imp0
, vals
);
130 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
131 _mm_store_ps(&Values
[o
][0], vals
);
136 static inline void ApplyCoeffs(ALuint Offset
, ALfloat (*restrict Values
)[2],
138 ALfloat (*restrict Coeffs
)[2],
139 ALfloat left
, ALfloat right
)
141 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
142 __m128 vals
= _mm_setzero_ps();
148 const ALuint o0
= Offset
&HRIR_MASK
;
149 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
152 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
153 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
154 imp0
= _mm_mul_ps(lrlr
, coeffs
);
155 vals
= _mm_add_ps(imp0
, vals
);
156 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
157 for(i
= 1;i
< IrSize
-1;i
+= 2)
159 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
161 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
162 vals
= _mm_load_ps(&Values
[o2
][0]);
163 imp1
= _mm_mul_ps(lrlr
, coeffs
);
164 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
165 vals
= _mm_add_ps(imp0
, vals
);
166 _mm_store_ps(&Values
[o2
][0], vals
);
169 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
170 imp0
= _mm_movehl_ps(imp0
, imp0
);
171 vals
= _mm_add_ps(imp0
, vals
);
172 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
176 for(i
= 0;i
< IrSize
;i
+= 2)
178 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
180 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
181 vals
= _mm_load_ps(&Values
[o
][0]);
182 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
183 _mm_store_ps(&Values
[o
][0], vals
);
/* Instantiate the shared HRTF mixer template for this translation unit,
 * renaming the functions with the _SSE suffix.  mixer_inc.c uses the
 * ApplyCoeffs/ApplyCoeffsStep helpers defined above. */
#define MixHrtf MixHrtf_SSE
#define MixDirectHrtf MixDirectHrtf_SSE
#include "mixer_inc.c"
194 void Mix_SSE(const ALfloat
*data
, ALuint OutChans
, ALfloat (*restrict OutBuffer
)[BUFFERSIZE
],
195 MixGains
*Gains
, ALuint Counter
, ALuint OutPos
, ALuint BufferSize
)
201 for(c
= 0;c
< OutChans
;c
++)
204 gain
= Gains
[c
].Current
;
205 step
= Gains
[c
].Step
;
206 if(step
!= 0.0f
&& Counter
> 0)
208 ALuint minsize
= minu(BufferSize
, Counter
);
209 /* Mix with applying gain steps in aligned multiples of 4. */
217 gain
+ step
+ step
+ step
219 step4
= _mm_set1_ps(step
+ step
+ step
+ step
);
221 const __m128 val4
= _mm_load_ps(&data
[pos
]);
222 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
223 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
224 gain4
= _mm_add_ps(gain4
, step4
);
225 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
227 } while(minsize
-pos
> 3);
228 /* NOTE: gain4 now represents the next four gains after the
229 * last four mixed samples, so the lowest element represents
230 * the next gain to apply.
232 gain
= _mm_cvtss_f32(gain4
);
234 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
235 for(;pos
< minsize
;pos
++)
237 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
241 gain
= Gains
[c
].Target
;
242 Gains
[c
].Current
= gain
;
244 /* Mix until pos is aligned with 4 or the mix is done. */
245 minsize
= minu(BufferSize
, (pos
+3)&~3);
246 for(;pos
< minsize
;pos
++)
247 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
250 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
252 gain4
= _mm_set1_ps(gain
);
253 for(;BufferSize
-pos
> 3;pos
+= 4)
255 const __m128 val4
= _mm_load_ps(&data
[pos
]);
256 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
257 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
258 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
260 for(;pos
< BufferSize
;pos
++)
261 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
265 void MixRow_SSE(ALfloat
*OutBuffer
, const ALfloat
*Gains
, const ALfloat (*restrict data
)[BUFFERSIZE
], ALuint InChans
, ALuint InPos
, ALuint BufferSize
)
270 for(c
= 0;c
< InChans
;c
++)
273 ALfloat gain
= Gains
[c
];
274 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
277 gain4
= _mm_set1_ps(gain
);
278 for(;BufferSize
-pos
> 3;pos
+= 4)
280 const __m128 val4
= _mm_load_ps(&data
[c
][InPos
+pos
]);
281 __m128 dry4
= _mm_load_ps(&OutBuffer
[pos
]);
282 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
283 _mm_store_ps(&OutBuffer
[pos
], dry4
);
285 for(;pos
< BufferSize
;pos
++)
286 OutBuffer
[pos
] += data
[c
][InPos
+pos
]*gain
;