Fix matrix multiply used by the SSE cubic resampler
[openal-soft.git] / Alc / mixer_sse.c
blob22a7db40125b3d41d6a1f9dd548190c2581dfcf7
1 #include "config.h"
3 #ifdef HAVE_XMMINTRIN_H
4 #include <xmmintrin.h>
5 #endif
7 #include "AL/al.h"
8 #include "AL/alc.h"
9 #include "alMain.h"
10 #include "alu.h"
12 #include "alSource.h"
13 #include "alAuxEffectSlot.h"
14 #include "mixer_defs.h"
16 static __inline ALfloat lerp32(const ALfloat *vals, ALint step, ALuint frac)
17 { return lerp(vals[0], vals[step], frac * (1.0f/FRACTIONONE)); }
19 void Resample_lerp32_SSE(const ALfloat *data, ALuint frac,
20 ALuint increment, ALuint NumChannels, ALfloat *RESTRICT OutBuffer,
21 ALuint BufferSize)
23 ALIGN(16) float value[3][4];
24 ALuint pos = 0;
25 ALuint i, j;
27 for(i = 0;i < BufferSize+1-3;i+=4)
29 __m128 x, y, a;
30 for(j = 0;j < 4;j++)
32 value[0][j] = data[(pos )*NumChannels];
33 value[1][j] = data[(pos+1)*NumChannels];
34 value[2][j] = frac * (1.0f/FRACTIONONE);
36 frac += increment;
37 pos += frac>>FRACTIONBITS;
38 frac &= FRACTIONMASK;
41 x = _mm_load_ps(value[0]);
42 y = _mm_load_ps(value[1]);
43 y = _mm_sub_ps(y, x);
45 a = _mm_load_ps(value[2]);
46 y = _mm_mul_ps(y, a);
48 x = _mm_add_ps(x, y);
50 _mm_store_ps(&OutBuffer[i], x);
52 for(;i < BufferSize+1;i++)
54 OutBuffer[i] = lerp32(data + pos*NumChannels, NumChannels, frac);
56 frac += increment;
57 pos += frac>>FRACTIONBITS;
58 frac &= FRACTIONMASK;
62 void Resample_cubic32_SSE(const ALfloat *data, ALuint frac,
63 ALuint increment, ALuint channels, ALfloat *RESTRICT OutBuffer,
64 ALuint BufferSize)
66 /* Cubic interpolation mainly consists of a matrix4 * vector4 operation,
67 * followed by scalars being applied to the resulting elements before all
68 * four are added together for the final sample. */
69 static const __m128 matrix[4] = {
70 { -0.5f, 1.0f, -0.5f, 0.0f },
71 { 1.5f, -2.5f, 0.0f, 1.0f },
72 { -1.5f, 2.0f, 0.5f, 0.0f },
73 { 0.5f, -0.5f, 0.0f, 0.0f },
75 ALIGN(16) float value[4];
76 ALuint pos = 0;
77 ALuint i;
79 for(i = 0;i < BufferSize+1;i++)
81 __m128 res1, res2;
82 ALfloat mu;
84 /* matrix * { samples } */
85 res1 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(data[(pos-1)*channels]), matrix[0]),
86 _mm_mul_ps(_mm_set1_ps(data[(pos )*channels]), matrix[1]));
87 res2 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(data[(pos+1)*channels]), matrix[2]),
88 _mm_mul_ps(_mm_set1_ps(data[(pos+2)*channels]), matrix[3]));
89 res1 = _mm_add_ps(res1, res2);
91 /* res1 * { mu^3, mu^2, mu^1, mu^0 } */
92 mu = frac * (1.0f/FRACTIONONE);
93 value[0] = mu*mu*mu;
94 value[1] = mu*mu;
95 value[2] = mu;
96 value[3] = 1.0f;
97 res1 = _mm_mul_ps(res1, _mm_load_ps(value));
99 _mm_store_ps(value, res1);
100 OutBuffer[i] = value[0] + value[1] + value[2] + value[3];
102 frac += increment;
103 pos += frac>>FRACTIONBITS;
104 frac &= FRACTIONMASK;
109 static __inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*RESTRICT Values)[2],
110 const ALuint IrSize,
111 ALfloat (*RESTRICT Coeffs)[2],
112 ALfloat (*RESTRICT CoeffStep)[2],
113 ALfloat left, ALfloat right)
115 const __m128 lrlr = { left, right, left, right };
116 __m128 coeffs, deltas, imp0, imp1;
117 __m128 vals = _mm_setzero_ps();
118 ALuint i;
120 if((Offset&1))
122 const ALuint o0 = Offset&HRIR_MASK;
123 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
125 coeffs = _mm_load_ps(&Coeffs[0][0]);
126 deltas = _mm_load_ps(&CoeffStep[0][0]);
127 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
128 imp0 = _mm_mul_ps(lrlr, coeffs);
129 coeffs = _mm_add_ps(coeffs, deltas);
130 vals = _mm_add_ps(imp0, vals);
131 _mm_store_ps(&Coeffs[0][0], coeffs);
132 _mm_storel_pi((__m64*)&Values[o0][0], vals);
133 for(i = 1;i < IrSize-1;i += 2)
135 const ALuint o2 = (Offset+i)&HRIR_MASK;
137 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
138 deltas = _mm_load_ps(&CoeffStep[i+1][0]);
139 vals = _mm_load_ps(&Values[o2][0]);
140 imp1 = _mm_mul_ps(lrlr, coeffs);
141 coeffs = _mm_add_ps(coeffs, deltas);
142 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
143 vals = _mm_add_ps(imp0, vals);
144 _mm_store_ps(&Coeffs[i+1][0], coeffs);
145 _mm_store_ps(&Values[o2][0], vals);
146 imp0 = imp1;
148 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
149 imp0 = _mm_movehl_ps(imp0, imp0);
150 vals = _mm_add_ps(imp0, vals);
151 _mm_storel_pi((__m64*)&Values[o1][0], vals);
153 else
155 for(i = 0;i < IrSize;i += 2)
157 const ALuint o = (Offset + i)&HRIR_MASK;
159 coeffs = _mm_load_ps(&Coeffs[i][0]);
160 deltas = _mm_load_ps(&CoeffStep[i][0]);
161 vals = _mm_load_ps(&Values[o][0]);
162 imp0 = _mm_mul_ps(lrlr, coeffs);
163 coeffs = _mm_add_ps(coeffs, deltas);
164 vals = _mm_add_ps(imp0, vals);
165 _mm_store_ps(&Coeffs[i][0], coeffs);
166 _mm_store_ps(&Values[o][0], vals);
171 static __inline void ApplyCoeffs(ALuint Offset, ALfloat (*RESTRICT Values)[2],
172 const ALuint IrSize,
173 ALfloat (*RESTRICT Coeffs)[2],
174 ALfloat left, ALfloat right)
176 const __m128 lrlr = { left, right, left, right };
177 __m128 vals = _mm_setzero_ps();
178 __m128 coeffs;
179 ALuint i;
181 if((Offset&1))
183 const ALuint o0 = Offset&HRIR_MASK;
184 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
185 __m128 imp0, imp1;
187 coeffs = _mm_load_ps(&Coeffs[0][0]);
188 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
189 imp0 = _mm_mul_ps(lrlr, coeffs);
190 vals = _mm_add_ps(imp0, vals);
191 _mm_storel_pi((__m64*)&Values[o0][0], vals);
192 for(i = 1;i < IrSize-1;i += 2)
194 const ALuint o2 = (Offset+i)&HRIR_MASK;
196 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
197 vals = _mm_load_ps(&Values[o2][0]);
198 imp1 = _mm_mul_ps(lrlr, coeffs);
199 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
200 vals = _mm_add_ps(imp0, vals);
201 _mm_store_ps(&Values[o2][0], vals);
202 imp0 = imp1;
204 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
205 imp0 = _mm_movehl_ps(imp0, imp0);
206 vals = _mm_add_ps(imp0, vals);
207 _mm_storel_pi((__m64*)&Values[o1][0], vals);
209 else
211 for(i = 0;i < IrSize;i += 2)
213 const ALuint o = (Offset + i)&HRIR_MASK;
215 coeffs = _mm_load_ps(&Coeffs[i][0]);
216 vals = _mm_load_ps(&Values[o][0]);
217 vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
218 _mm_store_ps(&Values[o][0], vals);
223 #define SUFFIX SSE
224 #include "mixer_inc.c"
225 #undef SUFFIX
228 void MixDirect_SSE(ALsource *Source, ALCdevice *Device, DirectParams *params,
229 const ALfloat *RESTRICT data, ALuint srcchan,
230 ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
232 ALfloat (*RESTRICT DryBuffer)[BUFFERSIZE] = Device->DryBuffer;
233 ALfloat *RESTRICT ClickRemoval = Device->ClickRemoval;
234 ALfloat *RESTRICT PendingClicks = Device->PendingClicks;
235 ALfloat DrySend[MaxChannels];
236 ALuint pos;
237 ALuint c;
238 (void)Source;
240 for(c = 0;c < MaxChannels;c++)
241 DrySend[c] = params->Gains[srcchan][c];
243 pos = 0;
244 if(OutPos == 0)
246 for(c = 0;c < MaxChannels;c++)
247 ClickRemoval[c] -= data[pos]*DrySend[c];
249 for(c = 0;c < MaxChannels;c++)
251 const __m128 gain = _mm_set1_ps(DrySend[c]);
252 for(pos = 0;pos < BufferSize-3;pos += 4)
254 const __m128 val4 = _mm_load_ps(&data[pos]);
255 __m128 dry4 = _mm_load_ps(&DryBuffer[c][OutPos+pos]);
256 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain));
257 _mm_store_ps(&DryBuffer[c][OutPos+pos], dry4);
260 if(pos < BufferSize)
262 ALuint oldpos = pos;
263 for(c = 0;c < MaxChannels;c++)
265 pos = oldpos;
266 for(;pos < BufferSize;pos++)
267 DryBuffer[c][OutPos+pos] += data[pos]*DrySend[c];
270 if(OutPos+pos == SamplesToDo)
272 for(c = 0;c < MaxChannels;c++)
273 PendingClicks[c] += data[pos]*DrySend[c];
278 void MixSend_SSE(SendParams *params, const ALfloat *RESTRICT data,
279 ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
281 ALeffectslot *Slot = params->Slot;
282 ALfloat (*RESTRICT WetBuffer)[BUFFERSIZE] = Slot->WetBuffer;
283 ALfloat *RESTRICT WetClickRemoval = Slot->ClickRemoval;
284 ALfloat *RESTRICT WetPendingClicks = Slot->PendingClicks;
285 const ALfloat WetGain = params->Gain;
286 const __m128 gain = _mm_set1_ps(WetGain);
287 ALuint pos;
289 pos = 0;
290 if(OutPos == 0)
291 WetClickRemoval[0] -= data[pos] * WetGain;
292 for(pos = 0;pos < BufferSize-3;pos+=4)
294 const __m128 val4 = _mm_load_ps(&data[pos]);
295 __m128 wet4 = _mm_load_ps(&WetBuffer[0][OutPos+pos]);
296 wet4 = _mm_add_ps(wet4, _mm_mul_ps(val4, gain));
297 _mm_store_ps(&WetBuffer[0][OutPos+pos], wet4);
299 for(;pos < BufferSize;pos++)
300 WetBuffer[0][OutPos+pos] += data[pos] * WetGain;
301 if(OutPos+pos == SamplesToDo)
302 WetPendingClicks[0] += data[pos] * WetGain;