#include "config.h"

#include <arm_neon.h>

#include "AL/al.h"
#include "AL/alc.h"
#include "alMain.h"
#include "alu.h"
#include "hrtf.h"

#include "mixer_defs.h"

const ALfloat *Resample_lerp32_Neon(const InterpState* UNUSED(state),
                                    const ALfloat *restrict src, ALuint frac, ALint increment,
                                    ALfloat *restrict dst, ALsizei numsamples)
{
    const int32x4_t increment4 = vdupq_n_s32(increment*4);
    const float32x4_t fracOne4 = vdupq_n_f32(1.0f/FRACTIONONE);
    const uint32x4_t fracMask4 = vdupq_n_u32(FRACTIONMASK);
    alignas(16) ALint pos_[4];
    alignas(16) ALuint frac_[4];
    int32x4_t pos4;
    uint32x4_t frac4;
    ALint pos;
    ALsizei i;

    InitiatePositionArrays(frac, increment, frac_, pos_, 4);

    frac4 = vld1q_u32(frac_);
    pos4 = vld1q_s32(pos_);
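
    /* frac4/pos4 hold the fractional phase and integer source position for
     * four consecutive output samples, so each loop iteration below produces
     * four resampled values at once.
     */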
    for(i = 0;numsamples-i > 3;i += 4)
    {
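        /* NEON has no gather load, so the four source samples and their
         * successors are gathered into vectors with scalar indexing.
         */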
        const float32x4_t val1 = (float32x4_t){src[pos_[0]], src[pos_[1]],
                                               src[pos_[2]], src[pos_[3]]};
        const float32x4_t val2 = (float32x4_t){src[pos_[0]+1], src[pos_[1]+1],
                                               src[pos_[2]+1], src[pos_[3]+1]};

        /* val1 + (val2-val1)*mu */
        const float32x4_t r0 = vsubq_f32(val2, val1);
        const float32x4_t mu = vmulq_f32(vcvtq_f32_u32(frac4), fracOne4);
        const float32x4_t out = vmlaq_f32(val1, mu, r0);

        vst1q_f32(&dst[i], out);
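
        /* Advance all four phases: add the (4x) increment, carry whole
         * samples from the fraction bits into the positions, then mask the
         * fractions back below FRACTIONONE.
         */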
        frac4 = vaddq_u32(frac4, (uint32x4_t)increment4);
        pos4 = vaddq_s32(pos4, (int32x4_t)vshrq_n_u32(frac4, FRACTIONBITS));
        frac4 = vandq_u32(frac4, fracMask4);

        vst1q_s32(pos_, pos4);
    }

    if(i < numsamples)
    {
        /* NOTE: These four elements represent the position *after* the last
         * four samples, so the lowest element is the next position to
         * resample.
         */
        pos = pos_[0];
        frac = vgetq_lane_u32(frac4, 0);
        do {
            dst[i] = lerp(src[pos], src[pos+1], frac * (1.0f/FRACTIONONE));

            frac += increment;
            pos  += frac>>FRACTIONBITS;
            frac &= FRACTIONMASK;
        } while(++i < numsamples);
    }
    return dst;
}

const ALfloat *Resample_fir4_32_Neon(const InterpState* UNUSED(state),
                                     const ALfloat *restrict src, ALuint frac, ALint increment,
                                     ALfloat *restrict dst, ALsizei numsamples)
{
    const int32x4_t increment4 = vdupq_n_s32(increment*4);
    const uint32x4_t fracMask4 = vdupq_n_u32(FRACTIONMASK);
    alignas(16) ALint pos_[4];
    alignas(16) ALuint frac_[4];
    int32x4_t pos4;
    uint32x4_t frac4;
    ALint pos;
    ALsizei i;

    InitiatePositionArrays(frac, increment, frac_, pos_, 4);

    frac4 = vld1q_u32(frac_);
    pos4 = vld1q_s32(pos_);

    for(i = 0;numsamples-i > 3;i += 4)
    {
        const float32x4_t val0 = vld1q_f32(&src[pos_[0]]);
        const float32x4_t val1 = vld1q_f32(&src[pos_[1]]);
        const float32x4_t val2 = vld1q_f32(&src[pos_[2]]);
        const float32x4_t val3 = vld1q_f32(&src[pos_[3]]);
        float32x4_t k0 = vld1q_f32(ResampleCoeffs.FIR4[frac_[0]]);
        float32x4_t k1 = vld1q_f32(ResampleCoeffs.FIR4[frac_[1]]);
        float32x4_t k2 = vld1q_f32(ResampleCoeffs.FIR4[frac_[2]]);
        float32x4_t k3 = vld1q_f32(ResampleCoeffs.FIR4[frac_[3]]);
        float32x4_t out;

        k0 = vmulq_f32(k0, val0);
        k1 = vmulq_f32(k1, val1);
        k2 = vmulq_f32(k2, val2);
        k3 = vmulq_f32(k3, val3);
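        /* Each k vector now holds the four weighted taps for one output
         * sample; two rounds of pairwise adds (vpadd) sum each vector down
         * so that lane n of 'out' is the complete dot product for sample n.
         */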
        k0 = vcombine_f32(vpadd_f32(vget_low_f32(k0), vget_high_f32(k0)),
                          vpadd_f32(vget_low_f32(k1), vget_high_f32(k1)));
        k2 = vcombine_f32(vpadd_f32(vget_low_f32(k2), vget_high_f32(k2)),
                          vpadd_f32(vget_low_f32(k3), vget_high_f32(k3)));
        out = vcombine_f32(vpadd_f32(vget_low_f32(k0), vget_high_f32(k0)),
                           vpadd_f32(vget_low_f32(k2), vget_high_f32(k2)));

        vst1q_f32(&dst[i], out);

        frac4 = vaddq_u32(frac4, (uint32x4_t)increment4);
        pos4 = vaddq_s32(pos4, (int32x4_t)vshrq_n_u32(frac4, FRACTIONBITS));
        frac4 = vandq_u32(frac4, fracMask4);

        vst1q_s32(pos_, pos4);
        vst1q_u32(frac_, frac4);
    }

    if(i < numsamples)
    {
        /* NOTE: These four elements represent the position *after* the last
         * four samples, so the lowest element is the next position to
         * resample.
         */
        pos = pos_[0];
        frac = frac_[0];
        do {
            dst[i] = resample_fir4(src[pos], src[pos+1], src[pos+2], src[pos+3], frac);

            frac += increment;
            pos  += frac>>FRACTIONBITS;
            frac &= FRACTIONMASK;
        } while(++i < numsamples);
    }
    return dst;
}

const ALfloat *Resample_fir8_32_Neon(const InterpState* UNUSED(state),
                                     const ALfloat *restrict src, ALuint frac, ALint increment,
                                     ALfloat *restrict dst, ALsizei numsamples)
{
    const int32x4_t increment4 = vdupq_n_s32(increment*4);
    const uint32x4_t fracMask4 = vdupq_n_u32(FRACTIONMASK);
    alignas(16) ALint pos_[4];
    alignas(16) ALuint frac_[4];
    int32x4_t pos4;
    uint32x4_t frac4;
    ALint pos;
    ALsizei i, j;

    InitiatePositionArrays(frac, increment, frac_, pos_, 4);

    frac4 = vld1q_u32(frac_);
    pos4 = vld1q_s32(pos_);

    for(i = 0;numsamples-i > 3;i += 4)
    {
        float32x4_t out[2];
        for(j = 0;j < 8;j+=4)
        {
            const float32x4_t val0 = vld1q_f32(&src[pos_[0]+j]);
            const float32x4_t val1 = vld1q_f32(&src[pos_[1]+j]);
            const float32x4_t val2 = vld1q_f32(&src[pos_[2]+j]);
            const float32x4_t val3 = vld1q_f32(&src[pos_[3]+j]);
            float32x4_t k0 = vld1q_f32(&ResampleCoeffs.FIR4[frac_[0]][j]);
            float32x4_t k1 = vld1q_f32(&ResampleCoeffs.FIR4[frac_[1]][j]);
            float32x4_t k2 = vld1q_f32(&ResampleCoeffs.FIR4[frac_[2]][j]);
            float32x4_t k3 = vld1q_f32(&ResampleCoeffs.FIR4[frac_[3]][j]);

            k0 = vmulq_f32(k0, val0);
            k1 = vmulq_f32(k1, val1);
            k2 = vmulq_f32(k2, val2);
            k3 = vmulq_f32(k3, val3);
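            /* Same pairwise-add reduction as the FIR4 path: lane n of this
             * partial 'out' is the 4-tap partial sum for output sample n.
             */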
            k0 = vcombine_f32(vpadd_f32(vget_low_f32(k0), vget_high_f32(k0)),
                              vpadd_f32(vget_low_f32(k1), vget_high_f32(k1)));
            k2 = vcombine_f32(vpadd_f32(vget_low_f32(k2), vget_high_f32(k2)),
                              vpadd_f32(vget_low_f32(k3), vget_high_f32(k3)));
            out[j>>2] = vcombine_f32(vpadd_f32(vget_low_f32(k0), vget_high_f32(k0)),
                                     vpadd_f32(vget_low_f32(k2), vget_high_f32(k2)));
        }
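
        /* The two partial sums cover taps 0-3 and 4-7; adding them gives the
         * full 8-tap filter output for each of the four samples.
         */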
        out[0] = vaddq_f32(out[0], out[1]);
        vst1q_f32(&dst[i], out[0]);

        frac4 = vaddq_u32(frac4, (uint32x4_t)increment4);
        pos4 = vaddq_s32(pos4, (int32x4_t)vshrq_n_u32(frac4, FRACTIONBITS));
        frac4 = vandq_u32(frac4, fracMask4);

        vst1q_s32(pos_, pos4);
        vst1q_u32(frac_, frac4);
    }

    if(i < numsamples)
    {
        /* NOTE: These four elements represent the position *after* the last
         * four samples, so the lowest element is the next position to
         * resample.
         */
        pos = pos_[0];
        frac = frac_[0];
        do {
            dst[i] = resample_fir8(src[pos], src[pos+1], src[pos+2], src[pos+3],
                                   src[pos+4], src[pos+5], src[pos+6], src[pos+7], frac);

            frac += increment;
            pos  += frac>>FRACTIONBITS;
            frac &= FRACTIONMASK;
        } while(++i < numsamples);
    }
    return dst;
}

const ALfloat *Resample_bsinc32_Neon(const InterpState *state,
                                     const ALfloat *restrict src, ALuint frac, ALint increment,
                                     ALfloat *restrict dst, ALsizei dstlen)
{
    const float32x4_t sf4 = vdupq_n_f32(state->bsinc.sf);
    const ALsizei m = state->bsinc.m;
    const ALfloat *fil, *scd, *phd, *spd;
    ALsizei pi, i, j;
    float32x4_t r4;
    ALfloat pf;

    src += state->bsinc.l;
    for(i = 0;i < dstlen;i++)
    {
        // Calculate the phase index and factor.
#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
        pi = frac >> FRAC_PHASE_BITDIFF;
        pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF));
#undef FRAC_PHASE_BITDIFF
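
        /* pi selects one of the precomputed filter phases; pf is the
         * normalized remainder used to interpolate between adjacent phases
         * via the delta tables below.
         */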
        fil = ASSUME_ALIGNED(state->bsinc.coeffs[pi].filter, 16);
        scd = ASSUME_ALIGNED(state->bsinc.coeffs[pi].scDelta, 16);
        phd = ASSUME_ALIGNED(state->bsinc.coeffs[pi].phDelta, 16);
        spd = ASSUME_ALIGNED(state->bsinc.coeffs[pi].spDelta, 16);

        // Apply the scale and phase interpolated filter.
        r4 = vdupq_n_f32(0.0f);
        {
            const float32x4_t pf4 = vdupq_n_f32(pf);
            for(j = 0;j < m;j+=4)
            {
                /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
                const float32x4_t f4 =
                    vmlaq_f32(vmlaq_f32(vld1q_f32(&fil[j]), sf4, vld1q_f32(&scd[j])),
                              pf4, vmlaq_f32(vld1q_f32(&phd[j]), sf4, vld1q_f32(&spd[j])));
                /* r += f*src */
                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
            }
        }
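        /* Horizontal sum of r4: add the reversed halves, then fold the
         * remaining pair so lane 0 holds the filtered sample.
         */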
        r4 = vaddq_f32(r4, vcombine_f32(vrev64_f32(vget_high_f32(r4)),
                                        vrev64_f32(vget_low_f32(r4))));
        dst[i] = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);

        frac += increment;
        src  += frac>>FRACTIONBITS;
        frac &= FRACTIONMASK;
    }
    return dst;
}

static inline void ApplyCoeffsStep(ALsizei Offset, ALfloat (*restrict Values)[2],
                                   const ALsizei IrSize,
                                   ALfloat (*restrict Coeffs)[2],
                                   const ALfloat (*restrict CoeffStep)[2],
                                   ALfloat left, ALfloat right)
{
    ALsizei c;
    float32x4_t leftright4;
    {
        float32x2_t leftright2 = vdup_n_f32(0.0);
        leftright2 = vset_lane_f32(left, leftright2, 0);
        leftright2 = vset_lane_f32(right, leftright2, 1);
        leftright4 = vcombine_f32(leftright2, leftright2);
    }

    Values = ASSUME_ALIGNED(Values, 16);
    Coeffs = ASSUME_ALIGNED(Coeffs, 16);
    CoeffStep = ASSUME_ALIGNED(CoeffStep, 16);
    for(c = 0;c < IrSize;c += 2)
    {
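        /* Each iteration handles two history entries; Values and Coeffs hold
         * interleaved left/right pairs, so one 4-lane vector covers two
         * stereo entries.
         */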
        const ALsizei o0 = (Offset+c)&HRIR_MASK;
        const ALsizei o1 = (o0+1)&HRIR_MASK;
        float32x4_t vals = vcombine_f32(vld1_f32((float32_t*)&Values[o0][0]),
                                        vld1_f32((float32_t*)&Values[o1][0]));
        float32x4_t coefs = vld1q_f32((float32_t*)&Coeffs[c][0]);
        float32x4_t deltas = vld1q_f32(&CoeffStep[c][0]);

        vals = vmlaq_f32(vals, coefs, leftright4);
        coefs = vaddq_f32(coefs, deltas);

        vst1_f32((float32_t*)&Values[o0][0], vget_low_f32(vals));
        vst1_f32((float32_t*)&Values[o1][0], vget_high_f32(vals));
        vst1q_f32(&Coeffs[c][0], coefs);
    }
}

static inline void ApplyCoeffs(ALsizei Offset, ALfloat (*restrict Values)[2],
                               const ALsizei IrSize,
                               ALfloat (*restrict Coeffs)[2],
                               ALfloat left, ALfloat right)
{
    ALsizei c;
    float32x4_t leftright4;
    {
        float32x2_t leftright2 = vdup_n_f32(0.0);
        leftright2 = vset_lane_f32(left, leftright2, 0);
        leftright2 = vset_lane_f32(right, leftright2, 1);
        leftright4 = vcombine_f32(leftright2, leftright2);
    }

    Values = ASSUME_ALIGNED(Values, 16);
    Coeffs = ASSUME_ALIGNED(Coeffs, 16);
    for(c = 0;c < IrSize;c += 2)
    {
        const ALsizei o0 = (Offset+c)&HRIR_MASK;
        const ALsizei o1 = (o0+1)&HRIR_MASK;
        float32x4_t vals = vcombine_f32(vld1_f32((float32_t*)&Values[o0][0]),
                                        vld1_f32((float32_t*)&Values[o1][0]));
        float32x4_t coefs = vld1q_f32((float32_t*)&Coeffs[c][0]);

        vals = vmlaq_f32(vals, coefs, leftright4);

        vst1_f32((float32_t*)&Values[o0][0], vget_low_f32(vals));
        vst1_f32((float32_t*)&Values[o1][0], vget_high_f32(vals));
    }
}

#define MixHrtf MixHrtf_Neon
#define MixDirectHrtf MixDirectHrtf_Neon
#include "mixer_inc.c"

void Mix_Neon(const ALfloat *data, ALsizei OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
              ALfloat *CurrentGains, const ALfloat *TargetGains, ALsizei Counter, ALsizei OutPos,
              ALsizei BufferSize)
{
    ALfloat gain, delta, step;
    float32x4_t gain4;
    ALsizei c;

    data = ASSUME_ALIGNED(data, 16);
    OutBuffer = ASSUME_ALIGNED(OutBuffer, 16);

    delta = (Counter > 0) ? 1.0f/(ALfloat)Counter : 0.0f;

    for(c = 0;c < OutChans;c++)
    {
        ALsizei pos = 0;
        gain = CurrentGains[c];
        step = (TargetGains[c] - gain) * delta;
        if(fabsf(step) > FLT_EPSILON)
        {
            ALsizei minsize = mini(BufferSize, Counter);
            /* Mix with applying gain steps in aligned multiples of 4. */
            if(minsize-pos > 3)
            {
                float32x4_t step4;
                gain4 = vsetq_lane_f32(gain, gain4, 0);
                gain4 = vsetq_lane_f32(gain + step, gain4, 1);
                gain4 = vsetq_lane_f32(gain + step + step, gain4, 2);
                gain4 = vsetq_lane_f32(gain + step + step + step, gain4, 3);
                step4 = vdupq_n_f32(step + step + step + step);
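                /* gain4 = {g, g+s, g+2s, g+3s} and step4 advances every lane
                 * by 4*s, stepping the gain ramp four samples at a time.
                 */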
                do {
                    const float32x4_t val4 = vld1q_f32(&data[pos]);
                    float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);
                    dry4 = vmlaq_f32(dry4, val4, gain4);
                    gain4 = vaddq_f32(gain4, step4);
                    vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);
                    pos += 4;
                } while(minsize-pos > 3);
                /* NOTE: gain4 now represents the next four gains after the
                 * last four mixed samples, so the lowest element represents
                 * the next gain to apply.
                 */
                gain = vgetq_lane_f32(gain4, 0);
            }
            /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
            for(;pos < minsize;pos++)
            {
                OutBuffer[c][OutPos+pos] += data[pos]*gain;
                gain += step;
            }
            if(pos == Counter)
                gain = TargetGains[c];
            CurrentGains[c] = gain;

            /* Mix until pos is aligned with 4 or the mix is done. */
            minsize = mini(BufferSize, (pos+3)&~3);
            for(;pos < minsize;pos++)
                OutBuffer[c][OutPos+pos] += data[pos]*gain;
        }
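
        /* Gain is constant from here on; skip the channel entirely if it is
         * silent (the negated comparison also treats a NaN gain as silence).
         */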
        if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
            continue;
        gain4 = vdupq_n_f32(gain);
        for(;BufferSize-pos > 3;pos += 4)
        {
            const float32x4_t val4 = vld1q_f32(&data[pos]);
            float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);
            dry4 = vmlaq_f32(dry4, val4, gain4);
            vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);
        }
        for(;pos < BufferSize;pos++)
            OutBuffer[c][OutPos+pos] += data[pos]*gain;
    }
}

void MixRow_Neon(ALfloat *OutBuffer, const ALfloat *Gains,
                 const ALfloat (*restrict data)[BUFFERSIZE], ALsizei InChans,
                 ALsizei InPos, ALsizei BufferSize)
{
    float32x4_t gain4;
    ALsizei c;

    data = ASSUME_ALIGNED(data, 16);
    OutBuffer = ASSUME_ALIGNED(OutBuffer, 16);

    for(c = 0;c < InChans;c++)
    {
        ALsizei pos = 0;
        ALfloat gain = Gains[c];
        if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
            continue;

        gain4 = vdupq_n_f32(gain);
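        /* Accumulate this input channel into the output row four samples per
         * iteration, with a scalar loop for the remainder.
         */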
        for(;BufferSize-pos > 3;pos += 4)
        {
            const float32x4_t val4 = vld1q_f32(&data[c][InPos+pos]);
            float32x4_t dry4 = vld1q_f32(&OutBuffer[pos]);
            dry4 = vmlaq_f32(dry4, val4, gain4);
            vst1q_f32(&OutBuffer[pos], dry4);
        }
        for(;pos < BufferSize;pos++)
            OutBuffer[pos] += data[c][InPos+pos]*gain;
    }
}