Use a union to combine HRTF and non-HRTF mixer params
[openal-soft.git] / Alc / mixer_sse.c
blob9b04e8b509de637fd4bae110d87571dc23cc3596
1 #include "config.h"
3 #ifdef HAVE_XMMINTRIN_H
4 #ifdef IN_IDE_PARSER
5 /* KDevelop's parser won't recognize these defines that get added by the -msse
6 * switch used to compile this source. Without them, xmmintrin.h fails to
7 * declare anything. */
8 #define __MMX__
9 #define __SSE__
10 #endif
11 #include <xmmintrin.h>
12 #endif
14 #include "AL/al.h"
15 #include "AL/alc.h"
16 #include "alMain.h"
17 #include "alu.h"
19 #include "alSource.h"
20 #include "alAuxEffectSlot.h"
21 #include "mixer_defs.h"
24 static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
25 const ALuint IrSize,
26 ALfloat (*restrict Coeffs)[2],
27 const ALfloat (*restrict CoeffStep)[2],
28 ALfloat left, ALfloat right)
30 const __m128 lrlr = { left, right, left, right };
31 __m128 coeffs, deltas, imp0, imp1;
32 __m128 vals = _mm_setzero_ps();
33 ALuint i;
35 if((Offset&1))
37 const ALuint o0 = Offset&HRIR_MASK;
38 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
40 coeffs = _mm_load_ps(&Coeffs[0][0]);
41 deltas = _mm_load_ps(&CoeffStep[0][0]);
42 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
43 imp0 = _mm_mul_ps(lrlr, coeffs);
44 coeffs = _mm_add_ps(coeffs, deltas);
45 vals = _mm_add_ps(imp0, vals);
46 _mm_store_ps(&Coeffs[0][0], coeffs);
47 _mm_storel_pi((__m64*)&Values[o0][0], vals);
48 for(i = 1;i < IrSize-1;i += 2)
50 const ALuint o2 = (Offset+i)&HRIR_MASK;
52 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
53 deltas = _mm_load_ps(&CoeffStep[i+1][0]);
54 vals = _mm_load_ps(&Values[o2][0]);
55 imp1 = _mm_mul_ps(lrlr, coeffs);
56 coeffs = _mm_add_ps(coeffs, deltas);
57 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
58 vals = _mm_add_ps(imp0, vals);
59 _mm_store_ps(&Coeffs[i+1][0], coeffs);
60 _mm_store_ps(&Values[o2][0], vals);
61 imp0 = imp1;
63 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
64 imp0 = _mm_movehl_ps(imp0, imp0);
65 vals = _mm_add_ps(imp0, vals);
66 _mm_storel_pi((__m64*)&Values[o1][0], vals);
68 else
70 for(i = 0;i < IrSize;i += 2)
72 const ALuint o = (Offset + i)&HRIR_MASK;
74 coeffs = _mm_load_ps(&Coeffs[i][0]);
75 deltas = _mm_load_ps(&CoeffStep[i][0]);
76 vals = _mm_load_ps(&Values[o][0]);
77 imp0 = _mm_mul_ps(lrlr, coeffs);
78 coeffs = _mm_add_ps(coeffs, deltas);
79 vals = _mm_add_ps(imp0, vals);
80 _mm_store_ps(&Coeffs[i][0], coeffs);
81 _mm_store_ps(&Values[o][0], vals);
86 static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
87 const ALuint IrSize,
88 ALfloat (*restrict Coeffs)[2],
89 ALfloat left, ALfloat right)
91 const __m128 lrlr = { left, right, left, right };
92 __m128 vals = _mm_setzero_ps();
93 __m128 coeffs;
94 ALuint i;
96 if((Offset&1))
98 const ALuint o0 = Offset&HRIR_MASK;
99 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
100 __m128 imp0, imp1;
102 coeffs = _mm_load_ps(&Coeffs[0][0]);
103 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
104 imp0 = _mm_mul_ps(lrlr, coeffs);
105 vals = _mm_add_ps(imp0, vals);
106 _mm_storel_pi((__m64*)&Values[o0][0], vals);
107 for(i = 1;i < IrSize-1;i += 2)
109 const ALuint o2 = (Offset+i)&HRIR_MASK;
111 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
112 vals = _mm_load_ps(&Values[o2][0]);
113 imp1 = _mm_mul_ps(lrlr, coeffs);
114 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
115 vals = _mm_add_ps(imp0, vals);
116 _mm_store_ps(&Values[o2][0], vals);
117 imp0 = imp1;
119 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
120 imp0 = _mm_movehl_ps(imp0, imp0);
121 vals = _mm_add_ps(imp0, vals);
122 _mm_storel_pi((__m64*)&Values[o1][0], vals);
124 else
126 for(i = 0;i < IrSize;i += 2)
128 const ALuint o = (Offset + i)&HRIR_MASK;
130 coeffs = _mm_load_ps(&Coeffs[i][0]);
131 vals = _mm_load_ps(&Values[o][0]);
132 vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
133 _mm_store_ps(&Values[o][0], vals);
138 #define SUFFIX SSE
139 #include "mixer_inc.c"
140 #undef SUFFIX
143 void MixDirect_SSE(const DirectParams *params, const ALfloat *restrict data, ALuint srcchan,
144 ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
146 ALfloat (*restrict OutBuffer)[BUFFERSIZE] = params->OutBuffer;
147 ALfloat *restrict ClickRemoval = params->ClickRemoval;
148 ALfloat *restrict PendingClicks = params->PendingClicks;
149 ALfloat DrySend;
150 __m128 gain;
151 ALuint pos;
152 ALuint c;
154 for(c = 0;c < MaxChannels;c++)
156 DrySend = params->Mix.Gains[srcchan][c];
157 if(!(DrySend > GAIN_SILENCE_THRESHOLD))
158 continue;
160 if(OutPos == 0)
161 ClickRemoval[c] -= data[0]*DrySend;
163 gain = _mm_set1_ps(DrySend);
164 for(pos = 0;BufferSize-pos > 3;pos += 4)
166 const __m128 val4 = _mm_load_ps(&data[pos]);
167 __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
168 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain));
169 _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
171 for(;pos < BufferSize;pos++)
172 OutBuffer[c][OutPos+pos] += data[pos]*DrySend;
174 if(OutPos+pos == SamplesToDo)
175 PendingClicks[c] += data[pos]*DrySend;
180 void MixSend_SSE(const SendParams *params, const ALfloat *restrict data,
181 ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
183 ALfloat (*restrict OutBuffer)[BUFFERSIZE] = params->OutBuffer;
184 ALfloat *restrict ClickRemoval = params->ClickRemoval;
185 ALfloat *restrict PendingClicks = params->PendingClicks;
186 ALfloat WetGain;
187 __m128 gain;
188 ALuint pos;
190 WetGain = params->Gain;
191 if(!(WetGain > GAIN_SILENCE_THRESHOLD))
192 return;
194 if(OutPos == 0)
195 ClickRemoval[0] -= data[0] * WetGain;
197 gain = _mm_set1_ps(WetGain);
198 for(pos = 0;BufferSize-pos > 3;pos += 4)
200 const __m128 val4 = _mm_load_ps(&data[pos]);
201 __m128 wet4 = _mm_load_ps(&OutBuffer[0][OutPos+pos]);
202 wet4 = _mm_add_ps(wet4, _mm_mul_ps(val4, gain));
203 _mm_store_ps(&OutBuffer[0][OutPos+pos], wet4);
205 for(;pos < BufferSize;pos++)
206 OutBuffer[0][OutPos+pos] += data[pos] * WetGain;
208 if(OutPos+pos == SamplesToDo)
209 PendingClicks[0] += data[pos] * WetGain;