const ALfloat *Resample_lerp_Neon(const InterpState* UNUSED(state),
  const ALfloat *restrict src, ALsizei frac, ALint increment,
  ALfloat *restrict dst, ALsizei numsamples)
{
    const int32x4_t increment4 = vdupq_n_s32(increment*4);
    const float32x4_t fracOne4 = vdupq_n_f32(1.0f/FRACTIONONE);
    const int32x4_t fracMask4 = vdupq_n_s32(FRACTIONMASK);
    alignas(16) ALint pos_[4];
    alignas(16) ALsizei frac_[4];
    int32x4_t pos4, frac4;
    ALsizei i;

    ASSUME(numsamples > 0);

    InitiatePositionArrays(frac, increment, frac_, pos_, 4);

    frac4 = vld1q_s32(frac_);
    pos4 = vld1q_s32(pos_);

    for(i = 0;numsamples-i > 3;i += 4)
    {
        const float32x4_t val1 = (float32x4_t){src[pos_[0]], src[pos_[1]],
                                               src[pos_[2]], src[pos_[3]]};
        const float32x4_t val2 = (float32x4_t){src[pos_[0]+1], src[pos_[1]+1],
                                               src[pos_[2]+1], src[pos_[3]+1]};

        /* val1 + (val2-val1)*mu */
        const float32x4_t r0 = vsubq_f32(val2, val1);
        const float32x4_t mu = vmulq_f32(vcvtq_f32_s32(frac4), fracOne4);
        const float32x4_t out = vmlaq_f32(val1, mu, r0);

        vst1q_f32(&dst[i], out);

        frac4 = vaddq_s32(frac4, increment4);
        pos4 = vaddq_s32(pos4, vshrq_n_s32(frac4, FRACTIONBITS));
        frac4 = vandq_s32(frac4, fracMask4);

        vst1q_s32(pos_, pos4);
    }

    if(i < numsamples)
    {
        /* NOTE: These four elements represent the position *after* the last
         * four samples, so the lowest element is the next position to
         * resample.
         */
        ALint pos = pos_[0];
        frac = vgetq_lane_s32(frac4, 0);
        do {
            dst[i] = lerp(src[pos], src[pos+1], frac * (1.0f/FRACTIONONE));

            frac += increment;
            pos  += frac>>FRACTIONBITS;
            frac &= FRACTIONMASK;
        } while(++i < numsamples);
    }

    return dst;
}
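
/* Illustrative sketch, not part of the original file: the scalar computation
 * each lane of the NEON loop above performs, using the same lerp() helper and
 * fixed-point stepping as the do/while tail. The helper name is hypothetical,
 * and it's kept compiled out since nothing here calls it.
 */
#if 0
static ALfloat lerp_one_sample(const ALfloat *restrict src, ALint *pos,
                               ALsizei *frac, ALint increment)
{
    const ALfloat out = lerp(src[*pos], src[*pos+1], *frac * (1.0f/FRACTIONONE));
    *frac += increment;
    *pos  += *frac>>FRACTIONBITS;
    *frac &= FRACTIONMASK;
    return out;
}
#endif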

const ALfloat *Resample_bsinc_Neon(const InterpState *state,
  const ALfloat *restrict src, ALsizei frac, ALint increment,
  ALfloat *restrict dst, ALsizei dstlen)
{
    const ALfloat *const filter = state->bsinc.filter;
    const float32x4_t sf4 = vdupq_n_f32(state->bsinc.sf);
    const ALsizei m = state->bsinc.m;
    const float32x4_t *fil, *scd, *phd, *spd;
    ALsizei pi, i, j, offset;
    float32x4_t r4;
    ALfloat pf;

    ASSUME(m > 0);
    ASSUME(dstlen > 0);

    src += state->bsinc.l;
    for(i = 0;i < dstlen;i++)
    {
        // Calculate the phase index and factor.
#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
        pi = frac >> FRAC_PHASE_BITDIFF;
        pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF));
#undef FRAC_PHASE_BITDIFF
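        /* Worked example, assuming FRACTIONBITS == 12 and BSINC_PHASE_BITS ==
         * 4 as defined elsewhere in the tree: FRAC_PHASE_BITDIFF is then 8,
         * so frac == 0x5a3 yields phase index pi == 0x5 and phase factor
         * pf == 0xa3/256.0f. */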

        offset = m*pi*4;
        fil = ASSUME_ALIGNED(filter + offset, 16); offset += m;
        scd = ASSUME_ALIGNED(filter + offset, 16); offset += m;
        phd = ASSUME_ALIGNED(filter + offset, 16); offset += m;
        spd = ASSUME_ALIGNED(filter + offset, 16);

        // Apply the scale and phase interpolated filter.
        r4 = vdupq_n_f32(0.0f);
        {
            const float32x4_t pf4 = vdupq_n_f32(pf);
            for(j = 0;j < m;j+=4,fil++,scd++,phd++,spd++)
            {
                /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
                const float32x4_t f4 = vmlaq_f32(
                    vmlaq_f32(*fil, sf4, *scd),
                    pf4, vmlaq_f32(*phd, sf4, *spd)
                );
                /* r += f*src */
                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
            }
        }
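        /* Fold the four per-lane partial sums in r4 into one horizontal
         * total: adding the lane-reversed halves pairs opposite lanes, and
         * the final vadd_f32 below leaves the full sum in lane 0. */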
        r4 = vaddq_f32(r4, vcombine_f32(vrev64_f32(vget_high_f32(r4)),
                                        vrev64_f32(vget_low_f32(r4))));
        dst[i] = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);

        frac += increment;
        src  += frac>>FRACTIONBITS;
        frac &= FRACTIONMASK;
    }

    return dst;
}
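
/* Sketch for reference, not upstream code: the scalar form of the filter
 * evaluation in the j loop above, producing one output sample from the four
 * m-length filter tables. The helper name is hypothetical; kept compiled out.
 */
#if 0
static ALfloat bsinc_filter_scalar(const ALfloat *fil, const ALfloat *scd,
                                   const ALfloat *phd, const ALfloat *spd,
                                   const ALfloat *restrict src, ALsizei m,
                                   ALfloat sf, ALfloat pf)
{
    ALfloat r = 0.0f;
    ALsizei j;
    for(j = 0;j < m;j++)
        r += ((fil[j] + sf*scd[j]) + pf*(phd[j] + sf*spd[j])) * src[j];
    return r;
}
#endif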

static inline void ApplyCoeffs(ALsizei Offset, ALfloat (*restrict Values)[2],
                               const ALsizei IrSize,
                               const ALfloat (*restrict Coeffs)[2],
                               ALfloat left, ALfloat right)
{
    ALsizei c;
    float32x4_t leftright4;
    {
        float32x2_t leftright2 = vdup_n_f32(0.0f);
        leftright2 = vset_lane_f32(left, leftright2, 0);
        leftright2 = vset_lane_f32(right, leftright2, 1);
        leftright4 = vcombine_f32(leftright2, leftright2);
    }

    Values = ASSUME_ALIGNED(Values, 16);
    Coeffs = ASSUME_ALIGNED(Coeffs, 16);
    for(c = 0;c < IrSize;c += 2)
    {
        const ALsizei o0 = (Offset+c)&HRIR_MASK;
        const ALsizei o1 = (o0+1)&HRIR_MASK;
        float32x4_t vals = vcombine_f32(vld1_f32((float32_t*)&Values[o0][0]),
                                        vld1_f32((float32_t*)&Values[o1][0]));
        float32x4_t coefs = vld1q_f32((float32_t*)&Coeffs[c][0]);

        vals = vmlaq_f32(vals, coefs, leftright4);

        vst1_f32((float32_t*)&Values[o0][0], vget_low_f32(vals));
        vst1_f32((float32_t*)&Values[o1][0], vget_high_f32(vals));
    }
}
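
/* For reference, each iteration of the loop above is the vector form of:
 *   Values[o0][0] += Coeffs[c  ][0]*left;  Values[o0][1] += Coeffs[c  ][1]*right;
 *   Values[o1][0] += Coeffs[c+1][0]*left;  Values[o1][1] += Coeffs[c+1][1]*right;
 */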

#define MixHrtf MixHrtf_Neon
#define MixHrtfBlend MixHrtfBlend_Neon
#define MixDirectHrtf MixDirectHrtf_Neon
#include "hrtf_inc.c"
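
/* hrtf_inc.c supplies the generic MixHrtf/MixHrtfBlend/MixDirectHrtf bodies;
 * the defines above stamp them out under NEON-specific names, built on the
 * ApplyCoeffs() defined in this file. */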

void Mix_Neon(const ALfloat *data, ALsizei OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
              ALfloat *CurrentGains, const ALfloat *TargetGains, ALsizei Counter, ALsizei OutPos,
              ALsizei BufferSize)
{
    ALfloat gain, delta, step;
    float32x4_t gain4;
    ALsizei c;

    ASSUME(OutChans > 0);
    ASSUME(BufferSize > 0);
    data = ASSUME_ALIGNED(data, 16);
    OutBuffer = ASSUME_ALIGNED(OutBuffer, 16);

    delta = (Counter > 0) ? 1.0f/(ALfloat)Counter : 0.0f;

    for(c = 0;c < OutChans;c++)
    {
        ALsizei pos = 0;
        gain = CurrentGains[c];
        step = (TargetGains[c] - gain) * delta;
        if(fabsf(step) > FLT_EPSILON)
        {
            ALsizei minsize = mini(BufferSize, Counter);
            /* Mix, applying gain steps, in aligned multiples of 4. */
            if(minsize-pos > 3)
            {
                float32x4_t step4;
                gain4 = vsetq_lane_f32(gain, gain4, 0);
                gain4 = vsetq_lane_f32(gain + step, gain4, 1);
                gain4 = vsetq_lane_f32(gain + step + step, gain4, 2);
                gain4 = vsetq_lane_f32(gain + step + step + step, gain4, 3);
                step4 = vdupq_n_f32(step + step + step + step);
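                /* gain4 now holds the four-lane gain ramp {gain, gain+step,
                 * gain+2*step, gain+3*step}; step4 advances every lane by
                 * four steps per vector iteration. */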
                do {
                    const float32x4_t val4 = vld1q_f32(&data[pos]);
                    float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);
                    dry4 = vmlaq_f32(dry4, val4, gain4);
                    gain4 = vaddq_f32(gain4, step4);
                    vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);
                    pos += 4;
                } while(minsize-pos > 3);
                /* NOTE: gain4 now represents the next four gains after the
                 * last four mixed samples, so the lowest element represents
                 * the next gain to apply.
                 */
                gain = vgetq_lane_f32(gain4, 0);
            }
            /* Mix the leftover samples, applying the remaining gain steps
             * that aren't an aligned multiple of 4.
             */
            for(;pos < minsize;pos++)
            {
                OutBuffer[c][OutPos+pos] += data[pos]*gain;
                gain += step;
            }
            if(pos == Counter)
                gain = TargetGains[c];
            CurrentGains[c] = gain;

            /* Mix until pos is aligned with 4 or the mix is done. */
            minsize = mini(BufferSize, (pos+3)&~3);
            for(;pos < minsize;pos++)
                OutBuffer[c][OutPos+pos] += data[pos]*gain;
        }

        if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
            continue;
        gain4 = vdupq_n_f32(gain);
        for(;BufferSize-pos > 3;pos += 4)
        {
            const float32x4_t val4 = vld1q_f32(&data[pos]);
            float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);
            dry4 = vmlaq_f32(dry4, val4, gain4);
            vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);
        }
        for(;pos < BufferSize;pos++)
            OutBuffer[c][OutPos+pos] += data[pos]*gain;
    }
}
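
/* Worked example of Mix_Neon's gain stepping (illustrative numbers only):
 * fading a channel from 0.0 to 1.0 over Counter == 64 samples gives
 * delta == 1/64 and step == (1.0 - 0.0)*delta == 1/64; gain4 starts as
 * {0, 1, 2, 3}*step and each vector iteration advances it by 4*step. */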

void MixRow_Neon(ALfloat *OutBuffer, const ALfloat *Gains,
                 const ALfloat (*restrict data)[BUFFERSIZE], ALsizei InChans,
                 ALsizei InPos, ALsizei BufferSize)
{
    float32x4_t gain4;
    ALsizei c;

    ASSUME(InChans > 0);
    ASSUME(BufferSize > 0);
    data = ASSUME_ALIGNED(data, 16);
    OutBuffer = ASSUME_ALIGNED(OutBuffer, 16);

    for(c = 0;c < InChans;c++)
    {
        ALsizei pos = 0;
        ALfloat gain = Gains[c];
        if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
            continue;

        gain4 = vdupq_n_f32(gain);
        for(;BufferSize-pos > 3;pos += 4)
        {
            const float32x4_t val4 = vld1q_f32(&data[c][InPos+pos]);
            float32x4_t dry4 = vld1q_f32(&OutBuffer[pos]);
            dry4 = vmlaq_f32(dry4, val4, gain4);
            vst1q_f32(&OutBuffer[pos], dry4);
        }
        for(;pos < BufferSize;pos++)
            OutBuffer[pos] += data[c][InPos+pos]*gain;
    }
}
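
/* Usage sketch (hypothetical wrapper, not upstream code): accumulate two
 * input rows into one output line with fixed gains. Kept compiled out.
 */
#if 0
static void mixrow_example(const ALfloat (*restrict in)[BUFFERSIZE],
                           ALfloat *restrict out)
{
    static const ALfloat gains[2] = { 0.7f, 0.3f };
    MixRow_Neon(out, gains, in, 2, 0, BUFFERSIZE);
}
#endif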