Avoid building redundant mixers
[openal-soft/openal-hmr.git] / Alc / mixer_sse.c
blob6a53115adbdca27b7b38d70c849899deaba03a8c
1 #include "config.h"
3 #ifdef HAVE_XMMINTRIN_H
4 #include <xmmintrin.h>
5 #endif
7 #include "AL/al.h"
8 #include "AL/alc.h"
9 #include "alMain.h"
10 #include "alu.h"
12 #include "alSource.h"
13 #include "alAuxEffectSlot.h"
14 #include "mixer_defs.h"
16 static __inline ALfloat lerp32(const ALfloat *vals, ALint step, ALuint frac)
17 { return lerp(vals[0], vals[step], frac * (1.0f/FRACTIONONE)); }
19 void Resample_lerp32_SSE(const ALfloat *data, ALuint frac,
20 ALuint increment, ALuint NumChannels, ALfloat *RESTRICT OutBuffer,
21 ALuint BufferSize)
23 ALIGN(16) float value[3][4];
24 ALuint pos = 0;
25 ALuint i, j;
27 for(i = 0;i < BufferSize+1-3;i+=4)
29 __m128 x, y, a;
30 for(j = 0;j < 4;j++)
32 value[0][j] = data[(pos )*NumChannels];
33 value[1][j] = data[(pos+1)*NumChannels];
34 value[2][j] = frac * (1.0f/FRACTIONONE);
36 frac += increment;
37 pos += frac>>FRACTIONBITS;
38 frac &= FRACTIONMASK;
41 x = _mm_load_ps(value[0]);
42 y = _mm_load_ps(value[1]);
43 y = _mm_sub_ps(y, x);
45 a = _mm_load_ps(value[2]);
46 y = _mm_mul_ps(y, a);
48 x = _mm_add_ps(x, y);
50 _mm_store_ps(&OutBuffer[i], x);
52 for(;i < BufferSize+1;i++)
54 OutBuffer[i] = lerp32(data + pos*NumChannels, NumChannels, frac);
56 frac += increment;
57 pos += frac>>FRACTIONBITS;
58 frac &= FRACTIONMASK;
62 void Resample_cubic32_SSE(const ALfloat *data, ALuint frac,
63 ALuint increment, ALuint NumChannels, ALfloat *RESTRICT OutBuffer,
64 ALuint BufferSize)
66 /* Cubic interpolation mainly consists of a matrix4 * vector4 operation,
67 * followed by scalars being applied to the resulting elements before all
68 * four are added together for the final sample. */
69 static const __m128 matrix[4] = {
70 { -0.5, 1.0f, -0.5f, 0.0f },
71 { 1.5, -2.5f, 0.0f, 1.0f },
72 { -1.5, 2.0f, 0.5f, 0.0f },
73 { 0.5, -0.5f, 0.0f, 0.0f },
75 ALIGN(16) float value[4];
76 ALuint pos = 0;
77 ALuint i, j;
79 for(i = 0;i < BufferSize+1-3;i+=4)
81 __m128 result, final[4];
83 for(j = 0;j < 4;j++)
85 __m128 val4, s;
86 ALfloat mu;
88 val4 = _mm_set_ps(data[(pos-1)*NumChannels],
89 data[(pos )*NumChannels],
90 data[(pos+1)*NumChannels],
91 data[(pos+2)*NumChannels]);
92 mu = frac * (1.0f/FRACTIONONE);
93 s = _mm_set_ps(1.0f, mu, mu*mu, mu*mu*mu);
95 /* result = matrix * val4 */
96 result = _mm_mul_ps(val4, matrix[0]) ;
97 result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[1]));
98 result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[2]));
99 result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[3]));
101 /* final[j] = result * { mu^0, mu^1, mu^2, mu^3 } */
102 final[j] = _mm_mul_ps(result, s);
104 frac += increment;
105 pos += frac>>FRACTIONBITS;
106 frac &= FRACTIONMASK;
108 /* Transpose the final "matrix" so adding the rows will give the four
109 * samples. TODO: Is this faster than doing..
110 * _mm_store_ps(value, result);
111 * OutBuffer[i] = value[0] + value[1] + value[2] + value[3];
112 * ..for each sample?
114 _MM_TRANSPOSE4_PS(final[0], final[1], final[2], final[3]);
115 result = _mm_add_ps(_mm_add_ps(final[0], final[1]),
116 _mm_add_ps(final[2], final[3]));
118 _mm_store_ps(&OutBuffer[i], result);
120 for(;i < BufferSize+1;i++)
122 __m128 val4, s, result;
123 ALfloat mu;
125 val4 = _mm_set_ps(data[(pos-1)*NumChannels],
126 data[(pos )*NumChannels],
127 data[(pos+1)*NumChannels],
128 data[(pos+2)*NumChannels]);
129 mu = frac * (1.0f/FRACTIONONE);
130 s = _mm_set_ps(1.0f, mu, mu*mu, mu*mu*mu);
132 /* result = matrix * val4 */
133 result = _mm_mul_ps(val4, matrix[0]) ;
134 result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[1]));
135 result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[2]));
136 result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[3]));
138 /* value = result * { mu^0, mu^1, mu^2, mu^3 } */
139 _mm_store_ps(value, _mm_mul_ps(result, s));
141 OutBuffer[i] = value[0] + value[1] + value[2] + value[3];
143 frac += increment;
144 pos += frac>>FRACTIONBITS;
145 frac &= FRACTIONMASK;
150 static __inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*RESTRICT Values)[2],
151 const ALuint IrSize,
152 ALfloat (*RESTRICT Coeffs)[2],
153 ALfloat (*RESTRICT CoeffStep)[2],
154 ALfloat left, ALfloat right)
156 const __m128 lrlr = { left, right, left, right };
157 __m128 coeffs, deltas, imp0, imp1;
158 __m128 vals = _mm_setzero_ps();
159 ALuint i;
161 if((Offset&1))
163 const ALuint o0 = Offset&HRIR_MASK;
164 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
166 coeffs = _mm_load_ps(&Coeffs[0][0]);
167 deltas = _mm_load_ps(&CoeffStep[0][0]);
168 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
169 imp0 = _mm_mul_ps(lrlr, coeffs);
170 coeffs = _mm_add_ps(coeffs, deltas);
171 vals = _mm_add_ps(imp0, vals);
172 _mm_store_ps(&Coeffs[0][0], coeffs);
173 _mm_storel_pi((__m64*)&Values[o0][0], vals);
174 for(i = 1;i < IrSize-1;i += 2)
176 const ALuint o2 = (Offset+i)&HRIR_MASK;
178 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
179 deltas = _mm_load_ps(&CoeffStep[i+1][0]);
180 vals = _mm_load_ps(&Values[o2][0]);
181 imp1 = _mm_mul_ps(lrlr, coeffs);
182 coeffs = _mm_add_ps(coeffs, deltas);
183 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
184 vals = _mm_add_ps(imp0, vals);
185 _mm_store_ps(&Coeffs[i+1][0], coeffs);
186 _mm_store_ps(&Values[o2][0], vals);
187 imp0 = imp1;
189 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
190 imp0 = _mm_movehl_ps(imp0, imp0);
191 vals = _mm_add_ps(imp0, vals);
192 _mm_storel_pi((__m64*)&Values[o1][0], vals);
194 else
196 for(i = 0;i < IrSize;i += 2)
198 const ALuint o = (Offset + i)&HRIR_MASK;
200 coeffs = _mm_load_ps(&Coeffs[i][0]);
201 deltas = _mm_load_ps(&CoeffStep[i][0]);
202 vals = _mm_load_ps(&Values[o][0]);
203 imp0 = _mm_mul_ps(lrlr, coeffs);
204 coeffs = _mm_add_ps(coeffs, deltas);
205 vals = _mm_add_ps(imp0, vals);
206 _mm_store_ps(&Coeffs[i][0], coeffs);
207 _mm_store_ps(&Values[o][0], vals);
212 static __inline void ApplyCoeffs(ALuint Offset, ALfloat (*RESTRICT Values)[2],
213 const ALuint IrSize,
214 ALfloat (*RESTRICT Coeffs)[2],
215 ALfloat left, ALfloat right)
217 const __m128 lrlr = { left, right, left, right };
218 __m128 vals = _mm_setzero_ps();
219 __m128 coeffs;
220 ALuint i;
222 if((Offset&1))
224 const ALuint o0 = Offset&HRIR_MASK;
225 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
226 __m128 imp0, imp1;
228 coeffs = _mm_load_ps(&Coeffs[0][0]);
229 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
230 imp0 = _mm_mul_ps(lrlr, coeffs);
231 vals = _mm_add_ps(imp0, vals);
232 _mm_storel_pi((__m64*)&Values[o0][0], vals);
233 for(i = 1;i < IrSize-1;i += 2)
235 const ALuint o2 = (Offset+i)&HRIR_MASK;
237 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
238 vals = _mm_load_ps(&Values[o2][0]);
239 imp1 = _mm_mul_ps(lrlr, coeffs);
240 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
241 vals = _mm_add_ps(imp0, vals);
242 _mm_store_ps(&Values[o2][0], vals);
243 imp0 = imp1;
245 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
246 imp0 = _mm_movehl_ps(imp0, imp0);
247 vals = _mm_add_ps(imp0, vals);
248 _mm_storel_pi((__m64*)&Values[o1][0], vals);
250 else
252 for(i = 0;i < IrSize;i += 2)
254 const ALuint o = (Offset + i)&HRIR_MASK;
256 coeffs = _mm_load_ps(&Coeffs[i][0]);
257 vals = _mm_load_ps(&Values[o][0]);
258 vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
259 _mm_store_ps(&Values[o][0], vals);
264 #define SUFFIX SSE
265 #include "mixer_inc.c"
266 #undef SUFFIX
269 void MixDirect_SSE(ALsource *Source, ALCdevice *Device, DirectParams *params,
270 const ALfloat *RESTRICT data, ALuint srcchan,
271 ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
273 ALfloat (*RESTRICT DryBuffer)[BUFFERSIZE] = Device->DryBuffer;
274 ALfloat *RESTRICT ClickRemoval = Device->ClickRemoval;
275 ALfloat *RESTRICT PendingClicks = Device->PendingClicks;
276 ALfloat DrySend[MaxChannels];
277 ALuint pos;
278 ALuint c;
279 (void)Source;
281 for(c = 0;c < MaxChannels;c++)
282 DrySend[c] = params->Gains[srcchan][c];
284 pos = 0;
285 if(OutPos == 0)
287 for(c = 0;c < MaxChannels;c++)
288 ClickRemoval[c] -= data[pos]*DrySend[c];
290 for(c = 0;c < MaxChannels;c++)
292 const __m128 gain = _mm_set1_ps(DrySend[c]);
293 for(pos = 0;pos < BufferSize-3;pos += 4)
295 const __m128 val4 = _mm_load_ps(&data[pos]);
296 __m128 dry4 = _mm_load_ps(&DryBuffer[c][OutPos+pos]);
297 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain));
298 _mm_store_ps(&DryBuffer[c][OutPos+pos], dry4);
301 if(pos < BufferSize)
303 ALuint oldpos = pos;
304 for(c = 0;c < MaxChannels;c++)
306 pos = oldpos;
307 for(;pos < BufferSize;pos++)
308 DryBuffer[c][OutPos+pos] += data[pos]*DrySend[c];
311 if(OutPos+pos == SamplesToDo)
313 for(c = 0;c < MaxChannels;c++)
314 PendingClicks[c] += data[pos]*DrySend[c];
319 void MixSend_SSE(SendParams *params, const ALfloat *RESTRICT data,
320 ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
322 ALeffectslot *Slot = params->Slot;
323 ALfloat *RESTRICT WetBuffer = Slot->WetBuffer;
324 ALfloat *RESTRICT WetClickRemoval = Slot->ClickRemoval;
325 ALfloat *RESTRICT WetPendingClicks = Slot->PendingClicks;
326 const ALfloat WetGain = params->Gain;
327 const __m128 gain = _mm_set1_ps(WetGain);
328 ALuint pos;
330 pos = 0;
331 if(OutPos == 0)
332 WetClickRemoval[0] -= data[pos] * WetGain;
333 for(pos = 0;pos < BufferSize-3;pos+=4)
335 const __m128 val4 = _mm_load_ps(&data[pos]);
336 __m128 wet4 = _mm_load_ps(&WetBuffer[OutPos+pos]);
337 wet4 = _mm_add_ps(wet4, _mm_mul_ps(val4, gain));
338 _mm_store_ps(&WetBuffer[OutPos+pos], wet4);
340 for(;pos < BufferSize;pos++)
341 WetBuffer[OutPos+pos] += data[pos] * WetGain;
342 if(OutPos == SamplesToDo)
343 WetPendingClicks[0] += data[pos] * WetGain;