Minor cleanups for variable declarations
[openal-soft/openal-hmr.git] / Alc / mixer_sse.c
blob 792fead6060649e8d4555b152ecda824da8579b0
1 #include "config.h"
3 #ifdef HAVE_XMMINTRIN_H
4 #include <xmmintrin.h>
5 #endif
7 #include "AL/al.h"
8 #include "AL/alc.h"
9 #include "alMain.h"
10 #include "alu.h"
12 #include "alSource.h"
13 #include "mixer_defs.h"
16 static __inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*RESTRICT Values)[2],
17 const ALuint IrSize,
18 ALfloat (*RESTRICT Coeffs)[2],
19 ALfloat (*RESTRICT CoeffStep)[2],
20 ALfloat left, ALfloat right)
22 const __m128 lrlr = { left, right, left, right };
23 __m128 coeffs, deltas, imp0, imp1;
24 __m128 vals = _mm_setzero_ps();
25 ALuint i;
27 if((Offset&1))
29 const ALuint o0 = Offset&HRIR_MASK;
30 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
32 coeffs = _mm_load_ps(&Coeffs[0][0]);
33 deltas = _mm_load_ps(&CoeffStep[0][0]);
34 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
35 imp0 = _mm_mul_ps(lrlr, coeffs);
36 coeffs = _mm_add_ps(coeffs, deltas);
37 vals = _mm_add_ps(imp0, vals);
38 _mm_store_ps(&Coeffs[0][0], coeffs);
39 _mm_storel_pi((__m64*)&Values[o0][0], vals);
40 for(i = 1;i < IrSize-1;i += 2)
42 const ALuint o2 = (Offset+i)&HRIR_MASK;
44 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
45 deltas = _mm_load_ps(&CoeffStep[i+1][0]);
46 vals = _mm_load_ps(&Values[o2][0]);
47 imp1 = _mm_mul_ps(lrlr, coeffs);
48 coeffs = _mm_add_ps(coeffs, deltas);
49 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
50 vals = _mm_add_ps(imp0, vals);
51 _mm_store_ps(&Coeffs[i+1][0], coeffs);
52 _mm_store_ps(&Values[o2][0], vals);
53 imp0 = imp1;
55 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
56 imp0 = _mm_movehl_ps(imp0, imp0);
57 vals = _mm_add_ps(imp0, vals);
58 _mm_storel_pi((__m64*)&Values[o1][0], vals);
60 else
62 for(i = 0;i < IrSize;i += 2)
64 const ALuint o = (Offset + i)&HRIR_MASK;
66 coeffs = _mm_load_ps(&Coeffs[i][0]);
67 deltas = _mm_load_ps(&CoeffStep[i][0]);
68 vals = _mm_load_ps(&Values[o][0]);
69 imp0 = _mm_mul_ps(lrlr, coeffs);
70 coeffs = _mm_add_ps(coeffs, deltas);
71 vals = _mm_add_ps(imp0, vals);
72 _mm_store_ps(&Coeffs[i][0], coeffs);
73 _mm_store_ps(&Values[o][0], vals);
78 static __inline void ApplyCoeffs(ALuint Offset, ALfloat (*RESTRICT Values)[2],
79 const ALuint IrSize,
80 ALfloat (*RESTRICT Coeffs)[2],
81 ALfloat left, ALfloat right)
83 const __m128 lrlr = { left, right, left, right };
84 __m128 vals = _mm_setzero_ps();
85 __m128 coeffs;
86 ALuint i;
88 if((Offset&1))
90 const ALuint o0 = Offset&HRIR_MASK;
91 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
92 __m128 imp0, imp1;
94 coeffs = _mm_load_ps(&Coeffs[0][0]);
95 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
96 imp0 = _mm_mul_ps(lrlr, coeffs);
97 vals = _mm_add_ps(imp0, vals);
98 _mm_storel_pi((__m64*)&Values[o0][0], vals);
99 for(i = 1;i < IrSize-1;i += 2)
101 const ALuint o2 = (Offset+i)&HRIR_MASK;
103 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
104 vals = _mm_load_ps(&Values[o2][0]);
105 imp1 = _mm_mul_ps(lrlr, coeffs);
106 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
107 vals = _mm_add_ps(imp0, vals);
108 _mm_store_ps(&Values[o2][0], vals);
109 imp0 = imp1;
111 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
112 imp0 = _mm_movehl_ps(imp0, imp0);
113 vals = _mm_add_ps(imp0, vals);
114 _mm_storel_pi((__m64*)&Values[o1][0], vals);
116 else
118 for(i = 0;i < IrSize;i += 2)
120 const ALuint o = (Offset + i)&HRIR_MASK;
122 coeffs = _mm_load_ps(&Coeffs[i][0]);
123 vals = _mm_load_ps(&Values[o][0]);
124 vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
125 _mm_store_ps(&Values[o][0], vals);
131 void MixDirect_SSE(ALsource *Source, ALCdevice *Device, DirectParams *params,
132 const ALfloat *RESTRICT data, ALuint srcchan,
133 ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
135 ALfloat (*RESTRICT DryBuffer)[BUFFERSIZE] = Device->DryBuffer;
136 ALfloat *RESTRICT ClickRemoval = Device->ClickRemoval;
137 ALfloat *RESTRICT PendingClicks = Device->PendingClicks;
138 ALfloat DrySend[MaxChannels];
139 ALuint pos;
140 ALuint c;
141 (void)Source;
143 for(c = 0;c < MaxChannels;c++)
144 DrySend[c] = params->Gains[srcchan][c];
146 pos = 0;
147 if(OutPos == 0)
149 for(c = 0;c < MaxChannels;c++)
150 ClickRemoval[c] -= data[pos]*DrySend[c];
152 for(c = 0;c < MaxChannels;c++)
154 const __m128 gain = _mm_set1_ps(DrySend[c]);
155 for(pos = 0;pos < BufferSize-3;pos += 4)
157 const __m128 val4 = _mm_load_ps(&data[pos]);
158 __m128 dry4 = _mm_load_ps(&DryBuffer[c][OutPos+pos]);
159 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain));
160 _mm_store_ps(&DryBuffer[c][OutPos+pos], dry4);
163 if(pos < BufferSize)
165 ALuint oldpos = pos;
166 for(c = 0;c < MaxChannels;c++)
168 pos = oldpos;
169 for(;pos < BufferSize;pos++)
170 DryBuffer[c][OutPos+pos] += data[pos]*DrySend[c];
173 if(OutPos+pos == SamplesToDo)
175 for(c = 0;c < MaxChannels;c++)
176 PendingClicks[c] += data[pos]*DrySend[c];
179 #define NO_MIXDIRECT
182 #define SUFFIX SSE
183 #include "mixer_inc.c"
184 #undef SUFFIX