Alc/mixer_neon.c

   1 #include "config.h"
   2
   3 #ifdef HAVE_ARM_NEON_H
   4 #include <arm_neon.h>
   5 #endif
   6
   7 #include "AL/al.h"
   8 #include "AL/alc.h"
   9 #include "alMain.h"
  10 #include "alu.h"
  11 #include "hrtf.h"
  12
  13
  14 static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
  15                                    const ALuint IrSize,
  16                                    ALfloat (*restrict Coeffs)[2],
  17                                    const ALfloat (*restrict CoeffStep)[2],
  18                                    ALfloat left, ALfloat right)
  19 {
  20     ALuint c;
  21     float32x4_t leftright4;
  22     {
  23         float32x2_t leftright2 = vdup_n_f32(0.0);
  24         leftright2 = vset_lane_f32(left, leftright2, 0);
  25         leftright2 = vset_lane_f32(right, leftright2, 1);
  26         leftright4 = vcombine_f32(leftright2, leftright2);
  27     }
  28     for(c = 0;c < IrSize;c += 2)
  29     {
  30         const ALuint o0 = (Offset+c)&HRIR_MASK;
  31         const ALuint o1 = (o0+1)&HRIR_MASK;
  32         float32x4_t vals = vcombine_f32(vld1_f32((float32_t*)&Values[o0][0]),
  33                                         vld1_f32((float32_t*)&Values[o1][0]));
  34         float32x4_t coefs = vld1q_f32((float32_t*)&Coeffs[c][0]);
  35         float32x4_t deltas = vld1q_f32(&CoeffStep[c][0]);
  36
  37         vals = vmlaq_f32(vals, coefs, leftright4);
  38         coefs = vaddq_f32(coefs, deltas);
  39
  40         vst1_f32((float32_t*)&Values[o0][0], vget_low_f32(vals));
  41         vst1_f32((float32_t*)&Values[o1][0], vget_high_f32(vals));
  42         vst1q_f32(&Coeffs[c][0], coefs);
  43     }
  44 }
  45
  46 static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
  47                                const ALuint IrSize,
  48                                ALfloat (*restrict Coeffs)[2],
  49                                ALfloat left, ALfloat right)
  50 {
  51     ALuint c;
  52     float32x4_t leftright4;
  53     {
  54         float32x2_t leftright2 = vdup_n_f32(0.0);
  55         leftright2 = vset_lane_f32(left, leftright2, 0);
  56         leftright2 = vset_lane_f32(right, leftright2, 1);
  57         leftright4 = vcombine_f32(leftright2, leftright2);
  58     }
  59     for(c = 0;c < IrSize;c += 2)
  60     {
  61         const ALuint o0 = (Offset+c)&HRIR_MASK;
  62         const ALuint o1 = (o0+1)&HRIR_MASK;
  63         float32x4_t vals = vcombine_f32(vld1_f32((float32_t*)&Values[o0][0]),
  64                                         vld1_f32((float32_t*)&Values[o1][0]));
  65         float32x4_t coefs = vld1q_f32((float32_t*)&Coeffs[c][0]);
  66
  67         vals = vmlaq_f32(vals, coefs, leftright4);
  68
  69         vst1_f32((float32_t*)&Values[o0][0], vget_low_f32(vals));
  70         vst1_f32((float32_t*)&Values[o1][0], vget_high_f32(vals));
  71     }
  72 }
  73
  74
  75 #define SUFFIX Neon
  76 #include "mixer_inc.c"
  77 #undef SUFFIX
  78
  79
  80 void MixDirect_Neon(DirectParams *params, const ALfloat *restrict data, ALuint srcchan,
  81   ALuint OutPos, ALuint BufferSize)
  82 {
  83     ALfloat (*restrict OutBuffer)[BUFFERSIZE] = params->OutBuffer;
  84     ALuint Counter = maxu(params->Counter, OutPos) - OutPos;
  85     ALfloat DrySend, Step;
  86     float32x4_t gain;
  87     ALuint c;
  88
  89     for(c = 0;c < MaxChannels;c++)
  90     {
  91         ALuint pos = 0;
  92         Step = params->Mix.Gains.Step[srcchan][c];
  93         if(Step != 1.0f && Counter > 0)
  94         {
  95             DrySend = params->Mix.Gains.Current[srcchan][c];
  96             for(;BufferSize-pos > 3 && Counter-pos > 3;pos+=4)
  97             {
  98                 OutBuffer[c][OutPos+pos  ] += data[pos  ]*DrySend;
  99                 DrySend *= Step;
 100                 OutBuffer[c][OutPos+pos+1] += data[pos+1]*DrySend;
 101                 DrySend *= Step;
 102                 OutBuffer[c][OutPos+pos+2] += data[pos+2]*DrySend;
 103                 DrySend *= Step;
 104                 OutBuffer[c][OutPos+pos+4] += data[pos+3]*DrySend;
 105                 DrySend *= Step;
 106             }
 107             if(!(BufferSize-pos > 3))
 108             {
 109                 for(;pos < BufferSize && pos < Counter;pos++)
 110                 {
 111                     OutBuffer[c][OutPos+pos] += data[pos]*DrySend;
 112                     DrySend *= Step;
 113                 }
 114             }
 115             params->Mix.Gains.Current[srcchan][c] = DrySend;
 116         }
 117
 118         DrySend = params->Mix.Gains.Target[srcchan][c];
 119         if(!(DrySend > GAIN_SILENCE_THRESHOLD))
 120             continue;
 121         gain = vdupq_n_f32(DrySend);
 122         for(;BufferSize-pos > 3;pos += 4)
 123         {
 124             const float32x4_t val4 = vld1q_f32(&data[pos]);
 125             float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);
 126             dry4 = vaddq_f32(dry4, vmulq_f32(val4, gain));
 127             vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);
 128         }
 129         for(;pos < BufferSize;pos++)
 130             OutBuffer[c][OutPos+pos] += data[pos]*DrySend;
 131     }
 132 }
 133
 134
 135 void MixSend_Neon(SendParams *params, const ALfloat *restrict data,
 136   ALuint OutPos, ALuint UNUSED(SamplesToDo), ALuint BufferSize)
 137 {
 138     ALfloat (*restrict OutBuffer)[BUFFERSIZE] = params->OutBuffer;
 139     ALuint Counter = maxu(params->Counter, OutPos) - OutPos;
 140     ALfloat WetGain, Step;
 141     float32x4_t gain;
 142
 143     {
 144         ALuint pos = 0;
 145         Step = params->Gain.Step;
 146         if(Step != 1.0f && Counter > 0)
 147         {
 148             WetGain = params->Gain.Current;
 149             for(;BufferSize-pos > 3 && Counter-pos > 3;pos+=4)
 150             {
 151                 OutBuffer[0][OutPos+pos  ] += data[pos  ]*WetGain;
 152                 WetGain *= Step;
 153                 OutBuffer[0][OutPos+pos+1] += data[pos+1]*WetGain;
 154                 WetGain *= Step;
 155                 OutBuffer[0][OutPos+pos+2] += data[pos+2]*WetGain;
 156                 WetGain *= Step;
 157                 OutBuffer[0][OutPos+pos+4] += data[pos+3]*WetGain;
 158                 WetGain *= Step;
 159             }
 160             if(!(BufferSize-pos > 3))
 161             {
 162                 for(;pos < BufferSize && pos < Counter;pos++)
 163                 {
 164                     OutBuffer[0][OutPos+pos] += data[pos]*WetGain;
 165                     WetGain *= Step;
 166                 }
 167             }
 168             params->Gain.Current = WetGain;
 169         }
 170
 171         WetGain = params->Gain.Target;
 172         if(!(WetGain > GAIN_SILENCE_THRESHOLD))
 173             return;
 174         gain = vdupq_n_f32(WetGain);
 175         for(;BufferSize-pos > 3;pos += 4)
 176         {
 177             const float32x4_t val4 = vld1q_f32(&data[pos]);
 178             float32x4_t wet4 = vld1q_f32(&OutBuffer[0][OutPos+pos]);
 179             wet4 = vaddq_f32(wet4, vmulq_f32(val4, gain));
 180             vst1q_f32(&OutBuffer[0][OutPos+pos], wet4);
 181         }
 182         for(;pos < BufferSize;pos++)
 183             OutBuffer[0][OutPos+pos] += data[pos] * WetGain;
 184     }
 185 }