Accept a "narrow" layout for 7.1 with mmdevapi
[openal-soft.git] / Alc / mixer_neon.c
blob8f79186d591006ac12dd6da63cb83f2210210428
1 #include "config.h"
3 #include <arm_neon.h>
5 #include "AL/al.h"
6 #include "AL/alc.h"
7 #include "alMain.h"
8 #include "alu.h"
9 #include "hrtf.h"
/* Fills OutCoeffs with Coeffs + CoeffStep*(-Counter), taking both tables from
 * hrtfparams.
 *
 * Each iteration handles four floats at once, i.e. the [2]-wide rows c and
 * c+1, and the index advances by two.
 * NOTE(review): this presumes IrSize is even, otherwise the last pass touches
 * one row past IrSize - confirm against the callers.
 */
static inline void SetupCoeffs(ALfloat (*restrict OutCoeffs)[2],
                               const HrtfParams *hrtfparams,
                               ALuint IrSize, ALuint Counter)
{
    /* Every lane holds -Counter, so the multiply-accumulate below subtracts
     * Counter steps from the loaded coefficients. */
    const float32x4_t neg_count4 = vdupq_n_f32(-(float)Counter);
    ALuint i;

    for(i = 0;i < IrSize;i += 2)
    {
        const float32x4_t base4 = vld1q_f32((float32_t*)hrtfparams->Coeffs[i]);
        const float32x4_t delta4 = vld1q_f32((float32_t*)hrtfparams->CoeffStep[i]);

        vst1q_f32((float32_t*)OutCoeffs[i], vmlaq_f32(base4, delta4, neg_count4));
    }
}
/* Accumulates one left/right input pair into Values through the coefficient
 * rows, then advances each coefficient by its step.
 *
 * For each pair of rows c, c+1 (c < IrSize, stepping by two):
 *   Values[(Offset+c) & HRIR_MASK] and the following (masked) row are
 *   incremented by Coeffs[row] * {left, right}, after which
 *   Coeffs[row] += CoeffStep[row].
 * The two Values rows are loaded and stored separately because the second
 * index is masked on its own and need not be adjacent to the first.
 */
static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
                                   const ALuint IrSize,
                                   ALfloat (*restrict Coeffs)[2],
                                   const ALfloat (*restrict CoeffStep)[2],
                                   ALfloat left, ALfloat right)
{
    /* Lane layout {left, right, left, right}, matching two [2]-wide rows. */
    const float32x2_t lr2 = vset_lane_f32(right, vdup_n_f32(left), 1);
    const float32x4_t lr4 = vcombine_f32(lr2, lr2);
    ALuint i;

    for(i = 0;i < IrSize;i += 2)
    {
        const ALuint idx0 = (Offset+i)&HRIR_MASK;
        const ALuint idx1 = (idx0+1)&HRIR_MASK;
        float32_t *const row0 = (float32_t*)&Values[idx0][0];
        float32_t *const row1 = (float32_t*)&Values[idx1][0];
        float32_t *const coef_row = (float32_t*)&Coeffs[i][0];

        float32x4_t acc4 = vcombine_f32(vld1_f32(row0), vld1_f32(row1));
        const float32x4_t coef4 = vld1q_f32(coef_row);
        const float32x4_t step4 = vld1q_f32(&CoeffStep[i][0]);

        acc4 = vmlaq_f32(acc4, coef4, lr4);

        vst1_f32(row0, vget_low_f32(acc4));
        vst1_f32(row1, vget_high_f32(acc4));
        /* The stepped coefficients are written back for the next call. */
        vst1q_f32(coef_row, vaddq_f32(coef4, step4));
    }
}
/* Accumulates one left/right input pair into Values through the coefficient
 * rows, leaving the coefficients untouched.
 *
 * For each pair of rows c, c+1 (c < IrSize, stepping by two):
 *   Values[(Offset+c) & HRIR_MASK] and the following (masked) row are
 *   incremented by Coeffs[row] * {left, right}.
 * The two Values rows are loaded and stored separately because the second
 * index is masked on its own and need not be adjacent to the first.
 */
static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
                               const ALuint IrSize,
                               ALfloat (*restrict Coeffs)[2],
                               ALfloat left, ALfloat right)
{
    /* Lane layout {left, right, left, right}, matching two [2]-wide rows. */
    const float32x2_t lr2 = vset_lane_f32(right, vdup_n_f32(left), 1);
    const float32x4_t lr4 = vcombine_f32(lr2, lr2);
    ALuint i;

    for(i = 0;i < IrSize;i += 2)
    {
        const ALuint idx0 = (Offset+i)&HRIR_MASK;
        const ALuint idx1 = (idx0+1)&HRIR_MASK;
        float32_t *const row0 = (float32_t*)&Values[idx0][0];
        float32_t *const row1 = (float32_t*)&Values[idx1][0];

        float32x4_t acc4 = vcombine_f32(vld1_f32(row0), vld1_f32(row1));
        const float32x4_t coef4 = vld1q_f32((float32_t*)&Coeffs[i][0]);

        acc4 = vmlaq_f32(acc4, coef4, lr4);

        vst1_f32(row0, vget_low_f32(acc4));
        vst1_f32(row1, vget_high_f32(acc4));
    }
}
92 #define SUFFIX Neon
93 #include "mixer_inc.c"
94 #undef SUFFIX
/* Mixes BufferSize samples of data into each of the first OutChans rows of
 * OutBuffer, starting at column OutPos, scaled by that channel's gain.
 *
 * Per channel:
 *  - If Gains[c].Step is non-zero and Counter > 0, samples are mixed one at a
 *    time with the gain incremented by Step after each, for at most Counter
 *    samples. If all Counter samples were covered the gain becomes
 *    Gains[c].Target. The resulting gain is stored to Gains[c].Current, and
 *    scalar mixing continues until the position is a multiple of four.
 *  - If the absolute gain does not exceed GAIN_SILENCE_THRESHOLD, the rest of
 *    the channel is skipped.
 *  - Otherwise the remaining samples are mixed four at a time with NEON,
 *    followed by a scalar loop for the last (up to three) samples.
 */
void Mix_Neon(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
              MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize)
{
    ALuint chan;

    for(chan = 0;chan < OutChans;chan++)
    {
        ALfloat *dst = &OutBuffer[chan][OutPos];
        ALfloat cur_gain = Gains[chan].Current;
        const ALfloat gain_step = Gains[chan].Step;
        ALuint i = 0;

        if(gain_step != 0.0f && Counter > 0)
        {
            /* Ramp the gain sample by sample. */
            while(i < BufferSize && i < Counter)
            {
                dst[i] += data[i]*cur_gain;
                cur_gain += gain_step;
                i++;
            }
            /* The whole ramp was consumed, so use the target value. */
            if(i == Counter)
                cur_gain = Gains[chan].Target;
            Gains[chan].Current = cur_gain;

            /* Mix until the position is a multiple of 4 or the mix is done. */
            while(i < BufferSize && (i&3) != 0)
            {
                dst[i] += data[i]*cur_gain;
                i++;
            }
        }

        /* Written as a positive test so a NaN gain is skipped too. */
        if(fabsf(cur_gain) > GAIN_SILENCE_THRESHOLD)
        {
            const float32x4_t gain4 = vdupq_n_f32(cur_gain);

            /* Four samples per pass while at least four remain. */
            while(BufferSize-i > 3)
            {
                const float32x4_t in4 = vld1q_f32(&data[i]);
                const float32x4_t out4 = vld1q_f32(&dst[i]);

                vst1q_f32(&dst[i], vaddq_f32(out4, vmulq_f32(in4, gain4)));
                i += 4;
            }
            /* Scalar tail for whatever is left. */
            while(i < BufferSize)
            {
                dst[i] += data[i]*cur_gain;
                i++;
            }
        }
    }
}