Use _mm_set_ps() to set an __m128 instead of {}
[openal-soft.git] / Alc / mixer_sse.c
blobc17a7e08fd328966053b0784d15a34e1656fbc67
1 #include "config.h"
3 #ifdef HAVE_XMMINTRIN_H
4 #ifdef IN_IDE_PARSER
5 /* KDevelop's parser won't recognize these defines that get added by the -msse
6 * switch used to compile this source. Without them, xmmintrin.h fails to
7 * declare anything. */
8 #define __MMX__
9 #define __SSE__
10 #endif
11 #include <xmmintrin.h>
12 #endif
14 #include "AL/al.h"
15 #include "AL/alc.h"
16 #include "alMain.h"
17 #include "alu.h"
19 #include "alSource.h"
20 #include "alAuxEffectSlot.h"
21 #include "mixer_defs.h"
24 static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
25 const ALuint IrSize,
26 ALfloat (*restrict Coeffs)[2],
27 const ALfloat (*restrict CoeffStep)[2],
28 ALfloat left, ALfloat right)
30 const __m128 lrlr = _mm_set_ps(left, right, left, right);
31 __m128 coeffs, deltas, imp0, imp1;
32 __m128 vals = _mm_setzero_ps();
33 ALuint i;
35 if((Offset&1))
37 const ALuint o0 = Offset&HRIR_MASK;
38 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
40 coeffs = _mm_load_ps(&Coeffs[0][0]);
41 deltas = _mm_load_ps(&CoeffStep[0][0]);
42 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
43 imp0 = _mm_mul_ps(lrlr, coeffs);
44 coeffs = _mm_add_ps(coeffs, deltas);
45 vals = _mm_add_ps(imp0, vals);
46 _mm_store_ps(&Coeffs[0][0], coeffs);
47 _mm_storel_pi((__m64*)&Values[o0][0], vals);
48 for(i = 1;i < IrSize-1;i += 2)
50 const ALuint o2 = (Offset+i)&HRIR_MASK;
52 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
53 deltas = _mm_load_ps(&CoeffStep[i+1][0]);
54 vals = _mm_load_ps(&Values[o2][0]);
55 imp1 = _mm_mul_ps(lrlr, coeffs);
56 coeffs = _mm_add_ps(coeffs, deltas);
57 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
58 vals = _mm_add_ps(imp0, vals);
59 _mm_store_ps(&Coeffs[i+1][0], coeffs);
60 _mm_store_ps(&Values[o2][0], vals);
61 imp0 = imp1;
63 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
64 imp0 = _mm_movehl_ps(imp0, imp0);
65 vals = _mm_add_ps(imp0, vals);
66 _mm_storel_pi((__m64*)&Values[o1][0], vals);
68 else
70 for(i = 0;i < IrSize;i += 2)
72 const ALuint o = (Offset + i)&HRIR_MASK;
74 coeffs = _mm_load_ps(&Coeffs[i][0]);
75 deltas = _mm_load_ps(&CoeffStep[i][0]);
76 vals = _mm_load_ps(&Values[o][0]);
77 imp0 = _mm_mul_ps(lrlr, coeffs);
78 coeffs = _mm_add_ps(coeffs, deltas);
79 vals = _mm_add_ps(imp0, vals);
80 _mm_store_ps(&Coeffs[i][0], coeffs);
81 _mm_store_ps(&Values[o][0], vals);
86 static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
87 const ALuint IrSize,
88 ALfloat (*restrict Coeffs)[2],
89 ALfloat left, ALfloat right)
91 const __m128 lrlr = _mm_set_ps(left, right, left, right);
92 __m128 vals = _mm_setzero_ps();
93 __m128 coeffs;
94 ALuint i;
96 if((Offset&1))
98 const ALuint o0 = Offset&HRIR_MASK;
99 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
100 __m128 imp0, imp1;
102 coeffs = _mm_load_ps(&Coeffs[0][0]);
103 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
104 imp0 = _mm_mul_ps(lrlr, coeffs);
105 vals = _mm_add_ps(imp0, vals);
106 _mm_storel_pi((__m64*)&Values[o0][0], vals);
107 for(i = 1;i < IrSize-1;i += 2)
109 const ALuint o2 = (Offset+i)&HRIR_MASK;
111 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
112 vals = _mm_load_ps(&Values[o2][0]);
113 imp1 = _mm_mul_ps(lrlr, coeffs);
114 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
115 vals = _mm_add_ps(imp0, vals);
116 _mm_store_ps(&Values[o2][0], vals);
117 imp0 = imp1;
119 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
120 imp0 = _mm_movehl_ps(imp0, imp0);
121 vals = _mm_add_ps(imp0, vals);
122 _mm_storel_pi((__m64*)&Values[o1][0], vals);
124 else
126 for(i = 0;i < IrSize;i += 2)
128 const ALuint o = (Offset + i)&HRIR_MASK;
130 coeffs = _mm_load_ps(&Coeffs[i][0]);
131 vals = _mm_load_ps(&Values[o][0]);
132 vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
133 _mm_store_ps(&Values[o][0], vals);
138 #define SUFFIX SSE
139 #include "mixer_inc.c"
140 #undef SUFFIX
143 void MixDirect_SSE(DirectParams *params, const ALfloat *restrict data, ALuint srcchan,
144 ALuint OutPos, ALuint BufferSize)
146 ALfloat (*restrict OutBuffer)[BUFFERSIZE] = params->OutBuffer;
147 ALuint Counter = maxu(params->Counter, OutPos) - OutPos;
148 ALfloat DrySend, Step;
149 __m128 gain, step;
150 ALuint c;
152 for(c = 0;c < MaxChannels;c++)
154 ALuint pos = 0;
155 Step = params->Mix.Gains.Step[srcchan][c];
156 if(Step != 1.0f && Counter > 0)
158 DrySend = params->Mix.Gains.Current[srcchan][c];
159 if(BufferSize-pos > 3 && Counter-pos > 3)
161 gain = _mm_set_ps(
162 DrySend,
163 DrySend * Step,
164 DrySend * Step * Step,
165 DrySend * Step * Step * Step
167 step = _mm_set1_ps(Step * Step * Step * Step);
168 do {
169 const __m128 val4 = _mm_load_ps(&data[pos]);
170 __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
171 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain));
172 gain = _mm_mul_ps(gain, step);
173 _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
174 pos += 4;
175 } while(BufferSize-pos > 3 && Counter-pos > 3);
176 DrySend = _mm_cvtss_f32(_mm_shuffle_ps(gain, gain, _MM_SHUFFLE(3, 3, 3, 3)));
178 if(!(BufferSize-pos > 3))
180 for(;pos < BufferSize && pos < Counter;pos++)
182 OutBuffer[c][OutPos+pos] += data[pos]*DrySend;
183 DrySend *= Step;
186 params->Mix.Gains.Current[srcchan][c] = DrySend;
189 DrySend = params->Mix.Gains.Target[srcchan][c];
190 if(!(DrySend > GAIN_SILENCE_THRESHOLD))
191 continue;
192 gain = _mm_set1_ps(DrySend);
193 for(;BufferSize-pos > 3;pos += 4)
195 const __m128 val4 = _mm_load_ps(&data[pos]);
196 __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
197 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain));
198 _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
200 for(;pos < BufferSize;pos++)
201 OutBuffer[c][OutPos+pos] += data[pos]*DrySend;
206 void MixSend_SSE(SendParams *params, const ALfloat *restrict data,
207 ALuint OutPos, ALuint BufferSize)
209 ALfloat (*restrict OutBuffer)[BUFFERSIZE] = params->OutBuffer;
210 ALuint Counter = maxu(params->Counter, OutPos) - OutPos;
211 ALfloat WetGain, Step;
212 __m128 gain, step;
215 ALuint pos = 0;
217 Step = params->Gain.Step;
218 if(Step != 1.0f && Counter > 0)
220 WetGain = params->Gain.Current;
221 if(BufferSize-pos > 3 && Counter-pos > 3)
223 gain = _mm_set_ps(
224 WetGain,
225 WetGain * Step,
226 WetGain * Step * Step,
227 WetGain * Step * Step * Step
229 step = _mm_set1_ps(Step * Step * Step * Step);
230 do {
231 const __m128 val4 = _mm_load_ps(&data[pos]);
232 __m128 dry4 = _mm_load_ps(&OutBuffer[0][OutPos+pos]);
233 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain));
234 gain = _mm_mul_ps(gain, step);
235 _mm_store_ps(&OutBuffer[0][OutPos+pos], dry4);
236 pos += 4;
237 } while(BufferSize-pos > 3 && Counter-pos > 3);
238 WetGain = _mm_cvtss_f32(_mm_shuffle_ps(gain, gain, _MM_SHUFFLE(3, 3, 3, 3)));
240 if(!(BufferSize-pos > 3))
242 for(;pos < BufferSize && pos < Counter;pos++)
244 OutBuffer[0][OutPos+pos] += data[pos]*WetGain;
245 WetGain *= Step;
248 params->Gain.Current = WetGain;
251 WetGain = params->Gain.Target;
252 if(!(WetGain > GAIN_SILENCE_THRESHOLD))
253 return;
254 gain = _mm_set1_ps(WetGain);
255 for(;BufferSize-pos > 3;pos += 4)
257 const __m128 val4 = _mm_load_ps(&data[pos]);
258 __m128 wet4 = _mm_load_ps(&OutBuffer[0][OutPos+pos]);
259 wet4 = _mm_add_ps(wet4, _mm_mul_ps(val4, gain));
260 _mm_store_ps(&OutBuffer[0][OutPos+pos], wet4);
262 for(;pos < BufferSize;pos++)
263 OutBuffer[0][OutPos+pos] += data[pos] * WetGain;