3 #ifdef HAVE_XMMINTRIN_H
13 #include "alAuxEffectSlot.h"
14 #include "mixer_defs.h"
16 static __inline ALfloat
lerp32(const ALfloat
*vals
, ALint step
, ALuint frac
)
17 { return lerp(vals
[0], vals
[step
], frac
* (1.0f
/FRACTIONONE
)); }
19 void Resample_lerp32_SSE(const ALfloat
*data
, ALuint frac
,
20 ALuint increment
, ALuint NumChannels
, ALfloat
*RESTRICT OutBuffer
,
23 ALIGN(16) float value
[3][4];
27 for(i
= 0;i
< BufferSize
+1-3;i
+=4)
32 value
[0][j
] = data
[(pos
)*NumChannels
];
33 value
[1][j
] = data
[(pos
+1)*NumChannels
];
34 value
[2][j
] = frac
* (1.0f
/FRACTIONONE
);
37 pos
+= frac
>>FRACTIONBITS
;
41 x
= _mm_load_ps(value
[0]);
42 y
= _mm_load_ps(value
[1]);
45 a
= _mm_load_ps(value
[2]);
50 _mm_store_ps(&OutBuffer
[i
], x
);
52 for(;i
< BufferSize
+1;i
++)
54 OutBuffer
[i
] = lerp32(data
+ pos
*NumChannels
, NumChannels
, frac
);
57 pos
+= frac
>>FRACTIONBITS
;
62 void Resample_cubic32_SSE(const ALfloat
*data
, ALuint frac
,
63 ALuint increment
, ALuint NumChannels
, ALfloat
*RESTRICT OutBuffer
,
66 /* Cubic interpolation mainly consists of a matrix4 * vector4 operation,
67 * followed by scalars being applied to the resulting elements before all
68 * four are added together for the final sample. */
69 static const __m128 matrix
[4] = {
70 { -0.5, 1.0f
, -0.5f
, 0.0f
},
71 { 1.5, -2.5f
, 0.0f
, 1.0f
},
72 { -1.5, 2.0f
, 0.5f
, 0.0f
},
73 { 0.5, -0.5f
, 0.0f
, 0.0f
},
75 ALIGN(16) float value
[4];
79 for(i
= 0;i
< BufferSize
+1-3;i
+=4)
81 __m128 result
, final
[4];
88 val4
= _mm_set_ps(data
[(pos
-1)*NumChannels
],
89 data
[(pos
)*NumChannels
],
90 data
[(pos
+1)*NumChannels
],
91 data
[(pos
+2)*NumChannels
]);
92 mu
= frac
* (1.0f
/FRACTIONONE
);
93 s
= _mm_set_ps(1.0f
, mu
, mu
*mu
, mu
*mu
*mu
);
95 /* result = matrix * val4 */
96 result
= _mm_mul_ps(val4
, matrix
[0]) ;
97 result
= _mm_add_ps(result
, _mm_mul_ps(val4
, matrix
[1]));
98 result
= _mm_add_ps(result
, _mm_mul_ps(val4
, matrix
[2]));
99 result
= _mm_add_ps(result
, _mm_mul_ps(val4
, matrix
[3]));
101 /* final[j] = result * { mu^0, mu^1, mu^2, mu^3 } */
102 final
[j
] = _mm_mul_ps(result
, s
);
105 pos
+= frac
>>FRACTIONBITS
;
106 frac
&= FRACTIONMASK
;
108 /* Transpose the final "matrix" so adding the rows will give the four
109 * samples. TODO: Is this faster than doing..
110 * _mm_store_ps(value, result);
111 * OutBuffer[i] = value[0] + value[1] + value[2] + value[3];
114 _MM_TRANSPOSE4_PS(final
[0], final
[1], final
[2], final
[3]);
115 result
= _mm_add_ps(_mm_add_ps(final
[0], final
[1]),
116 _mm_add_ps(final
[2], final
[3]));
118 _mm_store_ps(&OutBuffer
[i
], result
);
120 for(;i
< BufferSize
+1;i
++)
122 __m128 val4
, s
, result
;
125 val4
= _mm_set_ps(data
[(pos
-1)*NumChannels
],
126 data
[(pos
)*NumChannels
],
127 data
[(pos
+1)*NumChannels
],
128 data
[(pos
+2)*NumChannels
]);
129 mu
= frac
* (1.0f
/FRACTIONONE
);
130 s
= _mm_set_ps(1.0f
, mu
, mu
*mu
, mu
*mu
*mu
);
132 /* result = matrix * val4 */
133 result
= _mm_mul_ps(val4
, matrix
[0]) ;
134 result
= _mm_add_ps(result
, _mm_mul_ps(val4
, matrix
[1]));
135 result
= _mm_add_ps(result
, _mm_mul_ps(val4
, matrix
[2]));
136 result
= _mm_add_ps(result
, _mm_mul_ps(val4
, matrix
[3]));
138 /* value = result * { mu^0, mu^1, mu^2, mu^3 } */
139 _mm_store_ps(value
, _mm_mul_ps(result
, s
));
141 OutBuffer
[i
] = value
[0] + value
[1] + value
[2] + value
[3];
144 pos
+= frac
>>FRACTIONBITS
;
145 frac
&= FRACTIONMASK
;
150 static __inline
void ApplyCoeffsStep(ALuint Offset
, ALfloat (*RESTRICT Values
)[2],
152 ALfloat (*RESTRICT Coeffs
)[2],
153 ALfloat (*RESTRICT CoeffStep
)[2],
154 ALfloat left
, ALfloat right
)
156 const __m128 lrlr
= { left
, right
, left
, right
};
157 __m128 coeffs
, deltas
, imp0
, imp1
;
158 __m128 vals
= _mm_setzero_ps();
163 const ALuint o0
= Offset
&HRIR_MASK
;
164 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
166 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
167 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
168 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
169 imp0
= _mm_mul_ps(lrlr
, coeffs
);
170 coeffs
= _mm_add_ps(coeffs
, deltas
);
171 vals
= _mm_add_ps(imp0
, vals
);
172 _mm_store_ps(&Coeffs
[0][0], coeffs
);
173 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
174 for(i
= 1;i
< IrSize
-1;i
+= 2)
176 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
178 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
179 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
180 vals
= _mm_load_ps(&Values
[o2
][0]);
181 imp1
= _mm_mul_ps(lrlr
, coeffs
);
182 coeffs
= _mm_add_ps(coeffs
, deltas
);
183 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
184 vals
= _mm_add_ps(imp0
, vals
);
185 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
186 _mm_store_ps(&Values
[o2
][0], vals
);
189 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
190 imp0
= _mm_movehl_ps(imp0
, imp0
);
191 vals
= _mm_add_ps(imp0
, vals
);
192 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
196 for(i
= 0;i
< IrSize
;i
+= 2)
198 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
200 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
201 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
202 vals
= _mm_load_ps(&Values
[o
][0]);
203 imp0
= _mm_mul_ps(lrlr
, coeffs
);
204 coeffs
= _mm_add_ps(coeffs
, deltas
);
205 vals
= _mm_add_ps(imp0
, vals
);
206 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
207 _mm_store_ps(&Values
[o
][0], vals
);
212 static __inline
void ApplyCoeffs(ALuint Offset
, ALfloat (*RESTRICT Values
)[2],
214 ALfloat (*RESTRICT Coeffs
)[2],
215 ALfloat left
, ALfloat right
)
217 const __m128 lrlr
= { left
, right
, left
, right
};
218 __m128 vals
= _mm_setzero_ps();
224 const ALuint o0
= Offset
&HRIR_MASK
;
225 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
228 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
229 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
230 imp0
= _mm_mul_ps(lrlr
, coeffs
);
231 vals
= _mm_add_ps(imp0
, vals
);
232 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
233 for(i
= 1;i
< IrSize
-1;i
+= 2)
235 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
237 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
238 vals
= _mm_load_ps(&Values
[o2
][0]);
239 imp1
= _mm_mul_ps(lrlr
, coeffs
);
240 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
241 vals
= _mm_add_ps(imp0
, vals
);
242 _mm_store_ps(&Values
[o2
][0], vals
);
245 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
246 imp0
= _mm_movehl_ps(imp0
, imp0
);
247 vals
= _mm_add_ps(imp0
, vals
);
248 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
252 for(i
= 0;i
< IrSize
;i
+= 2)
254 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
256 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
257 vals
= _mm_load_ps(&Values
[o
][0]);
258 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
259 _mm_store_ps(&Values
[o
][0], vals
);
265 #include "mixer_inc.c"
269 void MixDirect_SSE(ALsource
*Source
, ALCdevice
*Device
, DirectParams
*params
,
270 const ALfloat
*RESTRICT data
, ALuint srcchan
,
271 ALuint OutPos
, ALuint SamplesToDo
, ALuint BufferSize
)
273 ALfloat (*RESTRICT DryBuffer
)[BUFFERSIZE
] = Device
->DryBuffer
;
274 ALfloat
*RESTRICT ClickRemoval
= Device
->ClickRemoval
;
275 ALfloat
*RESTRICT PendingClicks
= Device
->PendingClicks
;
276 ALfloat DrySend
[MaxChannels
];
281 for(c
= 0;c
< MaxChannels
;c
++)
282 DrySend
[c
] = params
->Gains
[srcchan
][c
];
287 for(c
= 0;c
< MaxChannels
;c
++)
288 ClickRemoval
[c
] -= data
[pos
]*DrySend
[c
];
290 for(c
= 0;c
< MaxChannels
;c
++)
292 const __m128 gain
= _mm_set1_ps(DrySend
[c
]);
293 for(pos
= 0;pos
< BufferSize
-3;pos
+= 4)
295 const __m128 val4
= _mm_load_ps(&data
[pos
]);
296 __m128 dry4
= _mm_load_ps(&DryBuffer
[c
][OutPos
+pos
]);
297 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain
));
298 _mm_store_ps(&DryBuffer
[c
][OutPos
+pos
], dry4
);
304 for(c
= 0;c
< MaxChannels
;c
++)
307 for(;pos
< BufferSize
;pos
++)
308 DryBuffer
[c
][OutPos
+pos
] += data
[pos
]*DrySend
[c
];
311 if(OutPos
+pos
== SamplesToDo
)
313 for(c
= 0;c
< MaxChannels
;c
++)
314 PendingClicks
[c
] += data
[pos
]*DrySend
[c
];
319 void MixSend_SSE(SendParams
*params
, const ALfloat
*RESTRICT data
,
320 ALuint OutPos
, ALuint SamplesToDo
, ALuint BufferSize
)
322 ALeffectslot
*Slot
= params
->Slot
;
323 ALfloat
*RESTRICT WetBuffer
= Slot
->WetBuffer
;
324 ALfloat
*RESTRICT WetClickRemoval
= Slot
->ClickRemoval
;
325 ALfloat
*RESTRICT WetPendingClicks
= Slot
->PendingClicks
;
326 const ALfloat WetGain
= params
->Gain
;
327 const __m128 gain
= _mm_set1_ps(WetGain
);
332 WetClickRemoval
[0] -= data
[pos
] * WetGain
;
333 for(pos
= 0;pos
< BufferSize
-3;pos
+=4)
335 const __m128 val4
= _mm_load_ps(&data
[pos
]);
336 __m128 wet4
= _mm_load_ps(&WetBuffer
[OutPos
+pos
]);
337 wet4
= _mm_add_ps(wet4
, _mm_mul_ps(val4
, gain
));
338 _mm_store_ps(&WetBuffer
[OutPos
+pos
], wet4
);
340 for(;pos
< BufferSize
;pos
++)
341 WetBuffer
[OutPos
+pos
] += data
[pos
] * WetGain
;
342 if(OutPos
== SamplesToDo
)
343 WetPendingClicks
[0] += data
[pos
] * WetGain
;