Alc/mixer_neon.c

   1 #include "config.h"
   2
   3 #include <arm_neon.h>
   4
   5 #include "AL/al.h"
   6 #include "AL/alc.h"
   7 #include "alMain.h"
   8 #include "alu.h"
   9 #include "hrtf.h"
  10
  11
  12 static inline void SetupCoeffs(ALfloat (*restrict OutCoeffs)[2],
  13                                const HrtfParams *hrtfparams,
  14                                ALuint IrSize, ALuint Counter)
  15 {
  16     ALuint c;
  17     float32x4_t counter4;
  18     {
  19         float32x2_t counter2 = vdup_n_f32(-(float)Counter);
  20         counter4 = vcombine_f32(counter2, counter2);
  21     }
  22     for(c = 0;c < IrSize;c += 2)
  23     {
  24         float32x4_t step4 = vld1q_f32((float32_t*)hrtfparams->CoeffStep[c]);
  25         float32x4_t coeffs = vld1q_f32((float32_t*)hrtfparams->Coeffs[c]);
  26         coeffs = vmlaq_f32(coeffs, step4, counter4);
  27         vst1q_f32((float32_t*)OutCoeffs[c], coeffs);
  28     }
  29 }
  30
  31 static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
  32                                    const ALuint IrSize,
  33                                    ALfloat (*restrict Coeffs)[2],
  34                                    const ALfloat (*restrict CoeffStep)[2],
  35                                    ALfloat left, ALfloat right)
  36 {
  37     ALuint c;
  38     float32x4_t leftright4;
  39     {
  40         float32x2_t leftright2 = vdup_n_f32(0.0);
  41         leftright2 = vset_lane_f32(left, leftright2, 0);
  42         leftright2 = vset_lane_f32(right, leftright2, 1);
  43         leftright4 = vcombine_f32(leftright2, leftright2);
  44     }
  45     for(c = 0;c < IrSize;c += 2)
  46     {
  47         const ALuint o0 = (Offset+c)&HRIR_MASK;
  48         const ALuint o1 = (o0+1)&HRIR_MASK;
  49         float32x4_t vals = vcombine_f32(vld1_f32((float32_t*)&Values[o0][0]),
  50                                         vld1_f32((float32_t*)&Values[o1][0]));
  51         float32x4_t coefs = vld1q_f32((float32_t*)&Coeffs[c][0]);
  52         float32x4_t deltas = vld1q_f32(&CoeffStep[c][0]);
  53
  54         vals = vmlaq_f32(vals, coefs, leftright4);
  55         coefs = vaddq_f32(coefs, deltas);
  56
  57         vst1_f32((float32_t*)&Values[o0][0], vget_low_f32(vals));
  58         vst1_f32((float32_t*)&Values[o1][0], vget_high_f32(vals));
  59         vst1q_f32(&Coeffs[c][0], coefs);
  60     }
  61 }
  62
  63 static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
  64                                const ALuint IrSize,
  65                                ALfloat (*restrict Coeffs)[2],
  66                                ALfloat left, ALfloat right)
  67 {
  68     ALuint c;
  69     float32x4_t leftright4;
  70     {
  71         float32x2_t leftright2 = vdup_n_f32(0.0);
  72         leftright2 = vset_lane_f32(left, leftright2, 0);
  73         leftright2 = vset_lane_f32(right, leftright2, 1);
  74         leftright4 = vcombine_f32(leftright2, leftright2);
  75     }
  76     for(c = 0;c < IrSize;c += 2)
  77     {
  78         const ALuint o0 = (Offset+c)&HRIR_MASK;
  79         const ALuint o1 = (o0+1)&HRIR_MASK;
  80         float32x4_t vals = vcombine_f32(vld1_f32((float32_t*)&Values[o0][0]),
  81                                         vld1_f32((float32_t*)&Values[o1][0]));
  82         float32x4_t coefs = vld1q_f32((float32_t*)&Coeffs[c][0]);
  83
  84         vals = vmlaq_f32(vals, coefs, leftright4);
  85
  86         vst1_f32((float32_t*)&Values[o0][0], vget_low_f32(vals));
  87         vst1_f32((float32_t*)&Values[o1][0], vget_high_f32(vals));
  88     }
  89 }
  90
  91
  92 #define SUFFIX Neon
  93 #include "mixer_inc.c"
  94 #undef SUFFIX
  95
  96
  97 void Mix_Neon(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
  98               MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize)
  99 {
 100     ALfloat gain, step;
 101     float32x4_t gain4;
 102     ALuint c;
 103
 104     for(c = 0;c < OutChans;c++)
 105     {
 106         ALuint pos = 0;
 107         gain = Gains[c].Current;
 108         step = Gains[c].Step;
 109         if(step != 0.0f && Counter > 0)
 110         {
 111             for(;pos < BufferSize && pos < Counter;pos++)
 112             {
 113                 OutBuffer[c][OutPos+pos] += data[pos]*gain;
 114                 gain += step;
 115             }
 116             if(pos == Counter)
 117                 gain = Gains[c].Target;
 118             Gains[c].Current = gain;
 119             /* Mix until pos is aligned with 4 or the mix is done. */
 120             for(;pos < BufferSize && (pos&3) != 0;pos++)
 121                 OutBuffer[c][OutPos+pos] += data[pos]*gain;
 122         }
 123
 124         if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
 125             continue;
 126         gain4 = vdupq_n_f32(gain);
 127         for(;BufferSize-pos > 3;pos += 4)
 128         {
 129             const float32x4_t val4 = vld1q_f32(&data[pos]);
 130             float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);
 131             dry4 = vaddq_f32(dry4, vmulq_f32(val4, gain4));
 132             vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);
 133         }
 134         for(;pos < BufferSize;pos++)
 135             OutBuffer[c][OutPos+pos] += data[pos]*gain;
 136     }
 137 }