3 #ifdef HAVE_XMMINTRIN_H
13 #include "alAuxEffectSlot.h"
14 #include "mixer_defs.h"
16 static __inline ALfloat
lerp32(const ALfloat
*vals
, ALint step
, ALuint frac
)
17 { return lerp(vals
[0], vals
[step
], frac
* (1.0f
/FRACTIONONE
)); }
19 void Resample_lerp32_SSE(const ALfloat
*data
, ALuint frac
,
20 ALuint increment
, ALuint NumChannels
, ALfloat
*RESTRICT OutBuffer
,
23 ALIGN(16) float value
[3][4];
27 for(i
= 0;i
< BufferSize
+1-3;i
+=4)
32 value
[0][j
] = data
[(pos
)*NumChannels
];
33 value
[1][j
] = data
[(pos
+1)*NumChannels
];
34 value
[2][j
] = frac
* (1.0f
/FRACTIONONE
);
37 pos
+= frac
>>FRACTIONBITS
;
41 x
= _mm_load_ps(value
[0]);
42 y
= _mm_load_ps(value
[1]);
45 a
= _mm_load_ps(value
[2]);
50 _mm_store_ps(&OutBuffer
[i
], x
);
52 for(;i
< BufferSize
+1;i
++)
54 OutBuffer
[i
] = lerp32(data
+ pos
*NumChannels
, NumChannels
, frac
);
57 pos
+= frac
>>FRACTIONBITS
;
62 void Resample_cubic32_SSE(const ALfloat
*data
, ALuint frac
,
63 ALuint increment
, ALuint channels
, ALfloat
*RESTRICT OutBuffer
,
66 /* Cubic interpolation mainly consists of a matrix4 * vector4 operation,
67 * followed by scalars being applied to the resulting elements before all
68 * four are added together for the final sample. */
69 static const __m128 matrix
[4] = {
70 { -0.5f
, 1.0f
, -0.5f
, 0.0f
},
71 { 1.5f
, -2.5f
, 0.0f
, 1.0f
},
72 { -1.5f
, 2.0f
, 0.5f
, 0.0f
},
73 { 0.5f
, -0.5f
, 0.0f
, 0.0f
},
75 ALIGN(16) float value
[4];
79 for(i
= 0;i
< BufferSize
+1;i
++)
84 /* matrix * { samples } */
85 res1
= _mm_add_ps(_mm_mul_ps(_mm_set1_ps(data
[(pos
-1)*channels
]), matrix
[0]),
86 _mm_mul_ps(_mm_set1_ps(data
[(pos
)*channels
]), matrix
[1]));
87 res2
= _mm_add_ps(_mm_mul_ps(_mm_set1_ps(data
[(pos
+1)*channels
]), matrix
[2]),
88 _mm_mul_ps(_mm_set1_ps(data
[(pos
+2)*channels
]), matrix
[3]));
89 res1
= _mm_add_ps(res1
, res2
);
91 /* res1 * { mu^3, mu^2, mu^1, mu^0 } */
92 mu
= frac
* (1.0f
/FRACTIONONE
);
97 res1
= _mm_mul_ps(res1
, _mm_load_ps(value
));
99 _mm_store_ps(value
, res1
);
100 OutBuffer
[i
] = value
[0] + value
[1] + value
[2] + value
[3];
103 pos
+= frac
>>FRACTIONBITS
;
104 frac
&= FRACTIONMASK
;
109 static __inline
void ApplyCoeffsStep(ALuint Offset
, ALfloat (*RESTRICT Values
)[2],
111 ALfloat (*RESTRICT Coeffs
)[2],
112 ALfloat (*RESTRICT CoeffStep
)[2],
113 ALfloat left
, ALfloat right
)
115 const __m128 lrlr
= { left
, right
, left
, right
};
116 __m128 coeffs
, deltas
, imp0
, imp1
;
117 __m128 vals
= _mm_setzero_ps();
122 const ALuint o0
= Offset
&HRIR_MASK
;
123 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
125 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
126 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
127 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
128 imp0
= _mm_mul_ps(lrlr
, coeffs
);
129 coeffs
= _mm_add_ps(coeffs
, deltas
);
130 vals
= _mm_add_ps(imp0
, vals
);
131 _mm_store_ps(&Coeffs
[0][0], coeffs
);
132 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
133 for(i
= 1;i
< IrSize
-1;i
+= 2)
135 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
137 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
138 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
139 vals
= _mm_load_ps(&Values
[o2
][0]);
140 imp1
= _mm_mul_ps(lrlr
, coeffs
);
141 coeffs
= _mm_add_ps(coeffs
, deltas
);
142 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
143 vals
= _mm_add_ps(imp0
, vals
);
144 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
145 _mm_store_ps(&Values
[o2
][0], vals
);
148 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
149 imp0
= _mm_movehl_ps(imp0
, imp0
);
150 vals
= _mm_add_ps(imp0
, vals
);
151 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
155 for(i
= 0;i
< IrSize
;i
+= 2)
157 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
159 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
160 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
161 vals
= _mm_load_ps(&Values
[o
][0]);
162 imp0
= _mm_mul_ps(lrlr
, coeffs
);
163 coeffs
= _mm_add_ps(coeffs
, deltas
);
164 vals
= _mm_add_ps(imp0
, vals
);
165 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
166 _mm_store_ps(&Values
[o
][0], vals
);
171 static __inline
void ApplyCoeffs(ALuint Offset
, ALfloat (*RESTRICT Values
)[2],
173 ALfloat (*RESTRICT Coeffs
)[2],
174 ALfloat left
, ALfloat right
)
176 const __m128 lrlr
= { left
, right
, left
, right
};
177 __m128 vals
= _mm_setzero_ps();
183 const ALuint o0
= Offset
&HRIR_MASK
;
184 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
187 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
188 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
189 imp0
= _mm_mul_ps(lrlr
, coeffs
);
190 vals
= _mm_add_ps(imp0
, vals
);
191 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
192 for(i
= 1;i
< IrSize
-1;i
+= 2)
194 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
196 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
197 vals
= _mm_load_ps(&Values
[o2
][0]);
198 imp1
= _mm_mul_ps(lrlr
, coeffs
);
199 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
200 vals
= _mm_add_ps(imp0
, vals
);
201 _mm_store_ps(&Values
[o2
][0], vals
);
204 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
205 imp0
= _mm_movehl_ps(imp0
, imp0
);
206 vals
= _mm_add_ps(imp0
, vals
);
207 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
211 for(i
= 0;i
< IrSize
;i
+= 2)
213 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
215 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
216 vals
= _mm_load_ps(&Values
[o
][0]);
217 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
218 _mm_store_ps(&Values
[o
][0], vals
);
224 #include "mixer_inc.c"
228 void MixDirect_SSE(ALsource
*Source
, ALCdevice
*Device
, DirectParams
*params
,
229 const ALfloat
*RESTRICT data
, ALuint srcchan
,
230 ALuint OutPos
, ALuint SamplesToDo
, ALuint BufferSize
)
232 ALfloat (*RESTRICT DryBuffer
)[BUFFERSIZE
] = Device
->DryBuffer
;
233 ALfloat
*RESTRICT ClickRemoval
= Device
->ClickRemoval
;
234 ALfloat
*RESTRICT PendingClicks
= Device
->PendingClicks
;
235 ALfloat DrySend
[MaxChannels
];
240 for(c
= 0;c
< MaxChannels
;c
++)
241 DrySend
[c
] = params
->Gains
[srcchan
][c
];
246 for(c
= 0;c
< MaxChannels
;c
++)
247 ClickRemoval
[c
] -= data
[pos
]*DrySend
[c
];
249 for(c
= 0;c
< MaxChannels
;c
++)
251 const __m128 gain
= _mm_set1_ps(DrySend
[c
]);
252 for(pos
= 0;pos
< BufferSize
-3;pos
+= 4)
254 const __m128 val4
= _mm_load_ps(&data
[pos
]);
255 __m128 dry4
= _mm_load_ps(&DryBuffer
[c
][OutPos
+pos
]);
256 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain
));
257 _mm_store_ps(&DryBuffer
[c
][OutPos
+pos
], dry4
);
263 for(c
= 0;c
< MaxChannels
;c
++)
266 for(;pos
< BufferSize
;pos
++)
267 DryBuffer
[c
][OutPos
+pos
] += data
[pos
]*DrySend
[c
];
270 if(OutPos
+pos
== SamplesToDo
)
272 for(c
= 0;c
< MaxChannels
;c
++)
273 PendingClicks
[c
] += data
[pos
]*DrySend
[c
];
278 void MixSend_SSE(SendParams
*params
, const ALfloat
*RESTRICT data
,
279 ALuint OutPos
, ALuint SamplesToDo
, ALuint BufferSize
)
281 ALeffectslot
*Slot
= params
->Slot
;
282 ALfloat (*RESTRICT WetBuffer
)[BUFFERSIZE
] = Slot
->WetBuffer
;
283 ALfloat
*RESTRICT WetClickRemoval
= Slot
->ClickRemoval
;
284 ALfloat
*RESTRICT WetPendingClicks
= Slot
->PendingClicks
;
285 const ALfloat WetGain
= params
->Gain
;
286 const __m128 gain
= _mm_set1_ps(WetGain
);
291 WetClickRemoval
[0] -= data
[pos
] * WetGain
;
292 for(pos
= 0;pos
< BufferSize
-3;pos
+=4)
294 const __m128 val4
= _mm_load_ps(&data
[pos
]);
295 __m128 wet4
= _mm_load_ps(&WetBuffer
[0][OutPos
+pos
]);
296 wet4
= _mm_add_ps(wet4
, _mm_mul_ps(val4
, gain
));
297 _mm_store_ps(&WetBuffer
[0][OutPos
+pos
], wet4
);
299 for(;pos
< BufferSize
;pos
++)
300 WetBuffer
[0][OutPos
+pos
] += data
[pos
] * WetGain
;
301 if(OutPos
+pos
== SamplesToDo
)
302 WetPendingClicks
[0] += data
[pos
] * WetGain
;