Do the filtering separately from the mixing
[openal-soft.git] / Alc / mixer_sse.c
blob 434857c1c836c04e3b43f8ba2dad37f26594f255
1 #include "config.h"
3 #ifdef HAVE_XMMINTRIN_H
4 #include <xmmintrin.h>
5 #endif
7 #include "AL/al.h"
8 #include "AL/alc.h"
9 #include "alMain.h"
10 #include "alu.h"
12 #include "alSource.h"
13 #include "mixer_defs.h"
16 static __inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*RESTRICT Values)[2],
17 const ALuint IrSize,
18 ALfloat (*RESTRICT Coeffs)[2],
19 ALfloat (*RESTRICT CoeffStep)[2],
20 ALfloat left, ALfloat right)
22 const __m128 lrlr = { left, right, left, right };
23 __m128 coeffs, deltas, imp0, imp1;
24 __m128 vals = _mm_setzero_ps();
25 ALuint i;
27 if((Offset&1))
29 const ALuint o0 = Offset&HRIR_MASK;
30 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
32 coeffs = _mm_load_ps(&Coeffs[0][0]);
33 deltas = _mm_load_ps(&CoeffStep[0][0]);
34 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
35 imp0 = _mm_mul_ps(lrlr, coeffs);
36 coeffs = _mm_add_ps(coeffs, deltas);
37 vals = _mm_add_ps(imp0, vals);
38 _mm_store_ps(&Coeffs[0][0], coeffs);
39 _mm_storel_pi((__m64*)&Values[o0][0], vals);
40 for(i = 1;i < IrSize-1;i += 2)
42 const ALuint o2 = (Offset+i)&HRIR_MASK;
44 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
45 deltas = _mm_load_ps(&CoeffStep[i+1][0]);
46 vals = _mm_load_ps(&Values[o2][0]);
47 imp1 = _mm_mul_ps(lrlr, coeffs);
48 coeffs = _mm_add_ps(coeffs, deltas);
49 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
50 vals = _mm_add_ps(imp0, vals);
51 _mm_store_ps(&Coeffs[i+1][0], coeffs);
52 _mm_store_ps(&Values[o2][0], vals);
53 imp0 = imp1;
55 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
56 imp0 = _mm_movehl_ps(imp0, imp0);
57 vals = _mm_add_ps(imp0, vals);
58 _mm_storel_pi((__m64*)&Values[o1][0], vals);
60 else
62 for(i = 0;i < IrSize;i += 2)
64 const ALuint o = (Offset + i)&HRIR_MASK;
66 coeffs = _mm_load_ps(&Coeffs[i][0]);
67 deltas = _mm_load_ps(&CoeffStep[i][0]);
68 vals = _mm_load_ps(&Values[o][0]);
69 imp0 = _mm_mul_ps(lrlr, coeffs);
70 coeffs = _mm_add_ps(coeffs, deltas);
71 vals = _mm_add_ps(imp0, vals);
72 _mm_store_ps(&Coeffs[i][0], coeffs);
73 _mm_store_ps(&Values[o][0], vals);
78 static __inline void ApplyCoeffs(ALuint Offset, ALfloat (*RESTRICT Values)[2],
79 const ALuint IrSize,
80 ALfloat (*RESTRICT Coeffs)[2],
81 ALfloat left, ALfloat right)
83 const __m128 lrlr = { left, right, left, right };
84 __m128 vals = _mm_setzero_ps();
85 __m128 coeffs;
86 ALuint i;
88 if((Offset&1))
90 const ALuint o0 = Offset&HRIR_MASK;
91 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
92 __m128 imp0, imp1;
94 coeffs = _mm_load_ps(&Coeffs[0][0]);
95 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
96 imp0 = _mm_mul_ps(lrlr, coeffs);
97 vals = _mm_add_ps(imp0, vals);
98 _mm_storel_pi((__m64*)&Values[o0][0], vals);
99 for(i = 1;i < IrSize-1;i += 2)
101 const ALuint o2 = (Offset+i)&HRIR_MASK;
103 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
104 vals = _mm_load_ps(&Values[o2][0]);
105 imp1 = _mm_mul_ps(lrlr, coeffs);
106 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
107 vals = _mm_add_ps(imp0, vals);
108 _mm_store_ps(&Values[o2][0], vals);
109 imp0 = imp1;
111 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
112 imp0 = _mm_movehl_ps(imp0, imp0);
113 vals = _mm_add_ps(imp0, vals);
114 _mm_storel_pi((__m64*)&Values[o1][0], vals);
116 else
118 for(i = 0;i < IrSize;i += 2)
120 const ALuint o = (Offset + i)&HRIR_MASK;
122 coeffs = _mm_load_ps(&Coeffs[i][0]);
123 vals = _mm_load_ps(&Values[o][0]);
124 vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
125 _mm_store_ps(&Values[o][0], vals);
131 void MixDirect_SSE(ALsource *Source, ALCdevice *Device, DirectParams *params,
132 const ALfloat *RESTRICT data, ALuint srcchan,
133 ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
135 ALfloat (*RESTRICT DryBuffer)[MaxChannels];
136 ALfloat *RESTRICT ClickRemoval, *RESTRICT PendingClicks;
137 ALfloat DrySend[MaxChannels];
138 ALIGN(16) ALfloat value[4];
139 ALuint pos;
140 ALuint c;
141 (void)Source;
143 DryBuffer = Device->DryBuffer;
144 ClickRemoval = Device->ClickRemoval;
145 PendingClicks = Device->PendingClicks;
147 for(c = 0;c < MaxChannels;c++)
148 DrySend[c] = params->Gains[srcchan][c];
150 pos = 0;
151 if(OutPos == 0)
153 for(c = 0;c < MaxChannels;c++)
154 ClickRemoval[c] -= data[pos]*DrySend[c];
156 for(pos = 0;pos < BufferSize-3;pos += 4)
158 const __m128 val4 = _mm_load_ps(&data[pos]);
159 for(c = 0;c < MaxChannels;c++)
161 const __m128 gain = _mm_set1_ps(DrySend[c]);
162 __m128 dry4;
164 value[0] = DryBuffer[OutPos ][c];
165 value[1] = DryBuffer[OutPos+1][c];
166 value[2] = DryBuffer[OutPos+2][c];
167 value[3] = DryBuffer[OutPos+3][c];
168 dry4 = _mm_load_ps(value);
170 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain));
172 _mm_store_ps(value, dry4);
173 DryBuffer[OutPos ][c] = value[0];
174 DryBuffer[OutPos+1][c] = value[1];
175 DryBuffer[OutPos+2][c] = value[2];
176 DryBuffer[OutPos+3][c] = value[3];
179 OutPos += 4;
181 for(;pos < BufferSize;pos++)
183 for(c = 0;c < MaxChannels;c++)
184 DryBuffer[OutPos][c] += data[pos]*DrySend[c];
185 OutPos++;
187 if(OutPos == SamplesToDo)
189 for(c = 0;c < MaxChannels;c++)
190 PendingClicks[c] += data[pos]*DrySend[c];
193 #define NO_MIXDIRECT
196 #define SUFFIX SSE
197 #include "mixer_inc.c"
198 #undef SUFFIX