Alc/mixer_neon.c

   1 #include "config.h"
   2
   3 #ifdef HAVE_ARM_NEON_H
   4 #include <arm_neon.h>
   5 #endif
   6
   7 #include "AL/al.h"
   8 #include "AL/alc.h"
   9 #include "alMain.h"
  10 #include "alu.h"
  11 #include "hrtf.h"
  12
  13
  14 static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
  15                                    const ALuint IrSize,
  16                                    ALfloat (*restrict Coeffs)[2],
  17                                    const ALfloat (*restrict CoeffStep)[2],
  18                                    ALfloat left, ALfloat right)
  19 {
  20     ALuint c;
  21     float32x4_t leftright4;
  22     {
  23         float32x2_t leftright2 = vdup_n_f32(0.0);
  24         leftright2 = vset_lane_f32(left, leftright2, 0);
  25         leftright2 = vset_lane_f32(right, leftright2, 1);
  26         leftright4 = vcombine_f32(leftright2, leftright2);
  27     }
  28     for(c = 0;c < IrSize;c += 2)
  29     {
  30         const ALuint o0 = (Offset+c)&HRIR_MASK;
  31         const ALuint o1 = (o0+1)&HRIR_MASK;
  32         float32x4_t vals = vcombine_f32(vld1_f32((float32_t*)&Values[o0][0]),
  33                                         vld1_f32((float32_t*)&Values[o1][0]));
  34         float32x4_t coefs = vld1q_f32((float32_t*)&Coeffs[c][0]);
  35         float32x4_t deltas = vld1q_f32(&CoeffStep[c][0]);
  36
  37         vals = vmlaq_f32(vals, coefs, leftright4);
  38         coefs = vaddq_f32(coefs, deltas);
  39
  40         vst1_f32((float32_t*)&Values[o0][0], vget_low_f32(vals));
  41         vst1_f32((float32_t*)&Values[o1][0], vget_high_f32(vals));
  42         vst1q_f32(&Coeffs[c][0], coefs);
  43     }
  44 }
  45
  46 static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
  47                                const ALuint IrSize,
  48                                ALfloat (*restrict Coeffs)[2],
  49                                ALfloat left, ALfloat right)
  50 {
  51     ALuint c;
  52     float32x4_t leftright4;
  53     {
  54         float32x2_t leftright2 = vdup_n_f32(0.0);
  55         leftright2 = vset_lane_f32(left, leftright2, 0);
  56         leftright2 = vset_lane_f32(right, leftright2, 1);
  57         leftright4 = vcombine_f32(leftright2, leftright2);
  58     }
  59     for(c = 0;c < IrSize;c += 2)
  60     {
  61         const ALuint o0 = (Offset+c)&HRIR_MASK;
  62         const ALuint o1 = (o0+1)&HRIR_MASK;
  63         float32x4_t vals = vcombine_f32(vld1_f32((float32_t*)&Values[o0][0]),
  64                                         vld1_f32((float32_t*)&Values[o1][0]));
  65         float32x4_t coefs = vld1q_f32((float32_t*)&Coeffs[c][0]);
  66
  67         vals = vmlaq_f32(vals, coefs, leftright4);
  68
  69         vst1_f32((float32_t*)&Values[o0][0], vget_low_f32(vals));
  70         vst1_f32((float32_t*)&Values[o1][0], vget_high_f32(vals));
  71     }
  72 }
  73
  74
  75 #define SUFFIX Neon
  76 #include "mixer_inc.c"
  77 #undef SUFFIX
  78
  79
  80 void MixDirect_Neon(const DirectParams *params, const ALfloat *restrict data, ALuint srcchan,
  81   ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
  82 {
  83     ALfloat (*restrict OutBuffer)[BUFFERSIZE] = params->OutBuffer;
  84     ALfloat *restrict ClickRemoval = params->ClickRemoval;
  85     ALfloat *restrict PendingClicks = params->PendingClicks;
  86     ALfloat DrySend;
  87     float32x4_t gain;
  88     ALuint pos;
  89     ALuint c;
  90
  91     for(c = 0;c < MaxChannels;c++)
  92     {
  93         DrySend = params->Gains[srcchan][c];
  94         if(!(DrySend > GAIN_SILENCE_THRESHOLD))
  95             continue;
  96
  97         if(OutPos == 0)
  98             ClickRemoval[c] -= data[0]*DrySend;
  99
 100         gain = vdupq_n_f32(DrySend);
 101         for(pos = 0;BufferSize-pos > 3;pos += 4)
 102         {
 103             const float32x4_t val4 = vld1q_f32(&data[pos]);
 104             float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);
 105             dry4 = vaddq_f32(dry4, vmulq_f32(val4, gain));
 106             vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);
 107         }
 108         for(;pos < BufferSize;pos++)
 109             OutBuffer[c][OutPos+pos] += data[pos]*DrySend;
 110
 111         if(OutPos+pos == SamplesToDo)
 112             PendingClicks[c] += data[pos]*DrySend;
 113     }
 114 }
 115
 116
 117 void MixSend_Neon(const SendParams *params, const ALfloat *restrict data,
 118   ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
 119 {
 120     ALfloat (*restrict OutBuffer)[BUFFERSIZE] = params->OutBuffer;
 121     ALfloat *restrict ClickRemoval = params->ClickRemoval;
 122     ALfloat *restrict PendingClicks = params->PendingClicks;
 123     ALfloat WetGain;
 124     float32x4_t gain;
 125     ALuint pos;
 126
 127     WetGain = params->Gain;
 128     if(!(WetGain > GAIN_SILENCE_THRESHOLD))
 129         return;
 130
 131     if(OutPos == 0)
 132         ClickRemoval[0] -= data[0] * WetGain;
 133
 134     gain = vdupq_n_f32(WetGain);
 135     for(pos = 0;BufferSize-pos > 3;pos += 4)
 136     {
 137         const float32x4_t val4 = vld1q_f32(&data[pos]);
 138         float32x4_t wet4 = vld1q_f32(&OutBuffer[0][OutPos+pos]);
 139         wet4 = vaddq_f32(wet4, vmulq_f32(val4, gain));
 140         vst1q_f32(&OutBuffer[0][OutPos+pos], wet4);
 141     }
 142     for(;pos < BufferSize;pos++)
 143         OutBuffer[0][OutPos+pos] += data[pos] * WetGain;
 144
 145     if(OutPos+pos == SamplesToDo)
 146         PendingClicks[0] += data[pos] * WetGain;
 147 }