Update HRTF code
[openal-soft/openal-hmr.git] / Alc / mixer_sse.c
blob 27b550e09c4e5decb403aac3d8dc3483f9b4c0aa
1 #include "config.h"
3 #ifdef HAVE_XMMINTRIN_H
4 #include <xmmintrin.h>
5 #endif
7 #include "AL/al.h"
8 #include "AL/alc.h"
9 #include "alMain.h"
10 #include "alu.h"
12 #include "alSource.h"
13 #include "mixer_defs.h"
16 static __inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*RESTRICT Values)[2],
17 const ALuint IrSize,
18 ALfloat (*RESTRICT Coeffs)[2],
19 ALfloat (*RESTRICT CoeffStep)[2],
20 ALfloat left, ALfloat right)
22 const __m128 lrlr = { left, right, left, right };
23 __m128 coeffs, deltas, imp0, imp1;
24 __m128 vals = _mm_setzero_ps();
25 ALuint i;
27 if((Offset&1))
29 const ALuint o0 = Offset&HRIR_MASK;
30 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
32 coeffs = _mm_load_ps(&Coeffs[0][0]);
33 deltas = _mm_load_ps(&CoeffStep[0][0]);
34 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
35 imp0 = _mm_mul_ps(lrlr, coeffs);
36 coeffs = _mm_add_ps(coeffs, deltas);
37 vals = _mm_add_ps(imp0, vals);
38 _mm_store_ps(&Coeffs[0][0], coeffs);
39 _mm_storel_pi((__m64*)&Values[o0][0], vals);
40 for(i = 1;i < IrSize-1;i += 2)
42 const ALuint o2 = (Offset+i)&HRIR_MASK;
44 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
45 deltas = _mm_load_ps(&CoeffStep[i+1][0]);
46 vals = _mm_load_ps(&Values[o2][0]);
47 imp1 = _mm_mul_ps(lrlr, coeffs);
48 coeffs = _mm_add_ps(coeffs, deltas);
49 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
50 vals = _mm_add_ps(imp0, vals);
51 _mm_store_ps(&Coeffs[i+1][0], coeffs);
52 _mm_store_ps(&Values[o2][0], vals);
53 imp0 = imp1;
55 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
56 imp0 = _mm_movehl_ps(imp0, imp0);
57 vals = _mm_add_ps(imp0, vals);
58 _mm_storel_pi((__m64*)&Values[o1][0], vals);
60 else
62 for(i = 0;i < IrSize;i += 2)
64 const ALuint o = (Offset + i)&HRIR_MASK;
66 coeffs = _mm_load_ps(&Coeffs[i][0]);
67 deltas = _mm_load_ps(&CoeffStep[i][0]);
68 vals = _mm_load_ps(&Values[o][0]);
69 imp0 = _mm_mul_ps(lrlr, coeffs);
70 coeffs = _mm_add_ps(coeffs, deltas);
71 vals = _mm_add_ps(imp0, vals);
72 _mm_store_ps(&Coeffs[i][0], coeffs);
73 _mm_store_ps(&Values[o][0], vals);
78 static __inline void ApplyCoeffs(ALuint Offset, ALfloat (*RESTRICT Values)[2],
79 const ALuint IrSize,
80 ALfloat (*RESTRICT Coeffs)[2],
81 ALfloat left, ALfloat right)
83 const __m128 lrlr = { left, right, left, right };
84 __m128 vals = _mm_setzero_ps();
85 __m128 coeffs;
86 ALuint i;
88 if((Offset&1))
90 const ALuint o0 = Offset&HRIR_MASK;
91 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
92 __m128 imp0, imp1;
94 coeffs = _mm_load_ps(&Coeffs[0][0]);
95 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
96 imp0 = _mm_mul_ps(lrlr, coeffs);
97 vals = _mm_add_ps(imp0, vals);
98 _mm_storel_pi((__m64*)&Values[o0][0], vals);
99 for(i = 1;i < IrSize-1;i += 2)
101 const ALuint o2 = (Offset+i)&HRIR_MASK;
103 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
104 vals = _mm_load_ps(&Values[o2][0]);
105 imp1 = _mm_mul_ps(lrlr, coeffs);
106 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
107 vals = _mm_add_ps(imp0, vals);
108 _mm_store_ps(&Values[o2][0], vals);
109 imp0 = imp1;
111 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
112 imp0 = _mm_movehl_ps(imp0, imp0);
113 vals = _mm_add_ps(imp0, vals);
114 _mm_storel_pi((__m64*)&Values[o1][0], vals);
116 else
118 for(i = 0;i < IrSize;i += 2)
120 const ALuint o = (Offset + i)&HRIR_MASK;
122 coeffs = _mm_load_ps(&Coeffs[i][0]);
123 vals = _mm_load_ps(&Values[o][0]);
124 vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
125 _mm_store_ps(&Values[o][0], vals);
131 void MixDirect_SSE(ALsource *Source, ALCdevice *Device, DirectParams *params,
132 const ALfloat *RESTRICT data, ALuint srcchan,
133 ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
135 ALfloat (*RESTRICT DryBuffer)[MaxChannels];
136 ALfloat *RESTRICT ClickRemoval, *RESTRICT PendingClicks;
137 ALIGN(16) ALfloat DrySend[MaxChannels];
138 ALIGN(16) ALfloat value[4];
139 FILTER *DryFilter;
140 ALuint pos;
141 ALuint c;
142 (void)Source;
144 DryBuffer = Device->DryBuffer;
145 ClickRemoval = Device->ClickRemoval;
146 PendingClicks = Device->PendingClicks;
147 DryFilter = &params->iirFilter;
149 for(c = 0;c < MaxChannels;c++)
150 DrySend[c] = params->Gains[srcchan][c];
152 pos = 0;
153 if(OutPos == 0)
155 value[0] = lpFilter2PC(DryFilter, srcchan, data[pos]);
156 for(c = 0;c < MaxChannels;c++)
157 ClickRemoval[c] -= value[0]*DrySend[c];
159 for(pos = 0;pos < BufferSize-3;pos += 4)
161 __m128 val4;
163 value[0] = lpFilter2P(DryFilter, srcchan, data[pos ]);
164 value[1] = lpFilter2P(DryFilter, srcchan, data[pos+1]);
165 value[2] = lpFilter2P(DryFilter, srcchan, data[pos+2]);
166 value[3] = lpFilter2P(DryFilter, srcchan, data[pos+3]);
167 val4 = _mm_load_ps(value);
169 for(c = 0;c < MaxChannels;c++)
171 const __m128 gain = _mm_set1_ps(DrySend[c]);
172 __m128 dry4;
174 value[0] = DryBuffer[OutPos ][c];
175 value[1] = DryBuffer[OutPos+1][c];
176 value[2] = DryBuffer[OutPos+2][c];
177 value[3] = DryBuffer[OutPos+3][c];
178 dry4 = _mm_load_ps(value);
180 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain));
182 _mm_store_ps(value, dry4);
183 DryBuffer[OutPos ][c] = value[0];
184 DryBuffer[OutPos+1][c] = value[1];
185 DryBuffer[OutPos+2][c] = value[2];
186 DryBuffer[OutPos+3][c] = value[3];
189 OutPos += 4;
191 for(;pos < BufferSize;pos++)
193 value[0] = lpFilter2P(DryFilter, srcchan, data[pos]);
194 for(c = 0;c < MaxChannels;c++)
195 DryBuffer[OutPos][c] += value[0]*DrySend[c];
196 OutPos++;
198 if(OutPos == SamplesToDo)
200 value[0] = lpFilter2PC(DryFilter, srcchan, data[pos]);
201 for(c = 0;c < MaxChannels;c++)
202 PendingClicks[c] += value[0]*DrySend[c];
/* NOTE(review): presumably tells mixer_inc.c not to emit its own MixDirect
 * variant, since this file defines MixDirect_SSE by hand -- confirm against
 * mixer_inc.c. */
205 #define NO_MIXDIRECT
/* SUFFIX is defined as SSE only for the duration of the mixer_inc.c include
 * and undefined again afterwards. NOTE(review): mixer_inc.c presumably uses
 * it to name the functions it generates (e.g. *_SSE) -- confirm. */
208 #define SUFFIX SSE
209 #include "mixer_inc.c"
210 #undef SUFFIX