Shorten a warning
[openal-soft.git] / Alc / mixer_sse.c
blob aff0152b736c196346b570060d3341cd65a9b0e7
1 #include "config.h"
3 #ifdef HAVE_XMMINTRIN_H
4 #include <xmmintrin.h>
5 #endif
7 #include "AL/al.h"
8 #include "AL/alc.h"
9 #include "alMain.h"
10 #include "alu.h"
12 #include "alSource.h"
13 #include "mixer_defs.h"
15 static __inline ALfloat lerp32(const ALfloat *vals, ALint step, ALuint frac)
16 { return lerp(vals[0], vals[step], frac * (1.0f/FRACTIONONE)); }
18 void Resample_lerp32_SSE(const ALfloat *data, ALuint frac,
19 ALuint increment, ALuint NumChannels, ALfloat *RESTRICT OutBuffer,
20 ALuint BufferSize)
22 ALIGN(16) float value[3][4];
23 ALuint pos = 0;
24 ALuint i, j;
26 for(i = 0;i < BufferSize+1-3;i+=4)
28 __m128 x, y, a;
29 for(j = 0;j < 4;j++)
31 value[0][j] = data[(pos )*NumChannels];
32 value[1][j] = data[(pos+1)*NumChannels];
33 value[2][j] = frac * (1.0f/FRACTIONONE);
35 frac += increment;
36 pos += frac>>FRACTIONBITS;
37 frac &= FRACTIONMASK;
40 x = _mm_load_ps(value[0]);
41 y = _mm_load_ps(value[1]);
42 y = _mm_sub_ps(y, x);
44 a = _mm_load_ps(value[2]);
45 y = _mm_mul_ps(y, a);
47 x = _mm_add_ps(x, y);
49 _mm_store_ps(&OutBuffer[i], x);
51 for(;i < BufferSize+1;i++)
53 OutBuffer[i] = lerp32(data + pos*NumChannels, NumChannels, frac);
55 frac += increment;
56 pos += frac>>FRACTIONBITS;
57 frac &= FRACTIONMASK;
63 static __inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*RESTRICT Values)[2],
64 const ALuint IrSize,
65 ALfloat (*RESTRICT Coeffs)[2],
66 ALfloat (*RESTRICT CoeffStep)[2],
67 ALfloat left, ALfloat right)
69 const __m128 lrlr = { left, right, left, right };
70 __m128 coeffs, deltas, imp0, imp1;
71 __m128 vals = _mm_setzero_ps();
72 ALuint i;
74 if((Offset&1))
76 const ALuint o0 = Offset&HRIR_MASK;
77 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
79 coeffs = _mm_load_ps(&Coeffs[0][0]);
80 deltas = _mm_load_ps(&CoeffStep[0][0]);
81 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
82 imp0 = _mm_mul_ps(lrlr, coeffs);
83 coeffs = _mm_add_ps(coeffs, deltas);
84 vals = _mm_add_ps(imp0, vals);
85 _mm_store_ps(&Coeffs[0][0], coeffs);
86 _mm_storel_pi((__m64*)&Values[o0][0], vals);
87 for(i = 1;i < IrSize-1;i += 2)
89 const ALuint o2 = (Offset+i)&HRIR_MASK;
91 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
92 deltas = _mm_load_ps(&CoeffStep[i+1][0]);
93 vals = _mm_load_ps(&Values[o2][0]);
94 imp1 = _mm_mul_ps(lrlr, coeffs);
95 coeffs = _mm_add_ps(coeffs, deltas);
96 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
97 vals = _mm_add_ps(imp0, vals);
98 _mm_store_ps(&Coeffs[i+1][0], coeffs);
99 _mm_store_ps(&Values[o2][0], vals);
100 imp0 = imp1;
102 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
103 imp0 = _mm_movehl_ps(imp0, imp0);
104 vals = _mm_add_ps(imp0, vals);
105 _mm_storel_pi((__m64*)&Values[o1][0], vals);
107 else
109 for(i = 0;i < IrSize;i += 2)
111 const ALuint o = (Offset + i)&HRIR_MASK;
113 coeffs = _mm_load_ps(&Coeffs[i][0]);
114 deltas = _mm_load_ps(&CoeffStep[i][0]);
115 vals = _mm_load_ps(&Values[o][0]);
116 imp0 = _mm_mul_ps(lrlr, coeffs);
117 coeffs = _mm_add_ps(coeffs, deltas);
118 vals = _mm_add_ps(imp0, vals);
119 _mm_store_ps(&Coeffs[i][0], coeffs);
120 _mm_store_ps(&Values[o][0], vals);
125 static __inline void ApplyCoeffs(ALuint Offset, ALfloat (*RESTRICT Values)[2],
126 const ALuint IrSize,
127 ALfloat (*RESTRICT Coeffs)[2],
128 ALfloat left, ALfloat right)
130 const __m128 lrlr = { left, right, left, right };
131 __m128 vals = _mm_setzero_ps();
132 __m128 coeffs;
133 ALuint i;
135 if((Offset&1))
137 const ALuint o0 = Offset&HRIR_MASK;
138 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
139 __m128 imp0, imp1;
141 coeffs = _mm_load_ps(&Coeffs[0][0]);
142 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
143 imp0 = _mm_mul_ps(lrlr, coeffs);
144 vals = _mm_add_ps(imp0, vals);
145 _mm_storel_pi((__m64*)&Values[o0][0], vals);
146 for(i = 1;i < IrSize-1;i += 2)
148 const ALuint o2 = (Offset+i)&HRIR_MASK;
150 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
151 vals = _mm_load_ps(&Values[o2][0]);
152 imp1 = _mm_mul_ps(lrlr, coeffs);
153 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
154 vals = _mm_add_ps(imp0, vals);
155 _mm_store_ps(&Values[o2][0], vals);
156 imp0 = imp1;
158 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
159 imp0 = _mm_movehl_ps(imp0, imp0);
160 vals = _mm_add_ps(imp0, vals);
161 _mm_storel_pi((__m64*)&Values[o1][0], vals);
163 else
165 for(i = 0;i < IrSize;i += 2)
167 const ALuint o = (Offset + i)&HRIR_MASK;
169 coeffs = _mm_load_ps(&Coeffs[i][0]);
170 vals = _mm_load_ps(&Values[o][0]);
171 vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
172 _mm_store_ps(&Values[o][0], vals);
178 void MixDirect_SSE(ALsource *Source, ALCdevice *Device, DirectParams *params,
179 const ALfloat *RESTRICT data, ALuint srcchan,
180 ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
182 ALfloat (*RESTRICT DryBuffer)[BUFFERSIZE] = Device->DryBuffer;
183 ALfloat *RESTRICT ClickRemoval = Device->ClickRemoval;
184 ALfloat *RESTRICT PendingClicks = Device->PendingClicks;
185 ALfloat DrySend[MaxChannels];
186 ALuint pos;
187 ALuint c;
188 (void)Source;
190 for(c = 0;c < MaxChannels;c++)
191 DrySend[c] = params->Gains[srcchan][c];
193 pos = 0;
194 if(OutPos == 0)
196 for(c = 0;c < MaxChannels;c++)
197 ClickRemoval[c] -= data[pos]*DrySend[c];
199 for(c = 0;c < MaxChannels;c++)
201 const __m128 gain = _mm_set1_ps(DrySend[c]);
202 for(pos = 0;pos < BufferSize-3;pos += 4)
204 const __m128 val4 = _mm_load_ps(&data[pos]);
205 __m128 dry4 = _mm_load_ps(&DryBuffer[c][OutPos+pos]);
206 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain));
207 _mm_store_ps(&DryBuffer[c][OutPos+pos], dry4);
210 if(pos < BufferSize)
212 ALuint oldpos = pos;
213 for(c = 0;c < MaxChannels;c++)
215 pos = oldpos;
216 for(;pos < BufferSize;pos++)
217 DryBuffer[c][OutPos+pos] += data[pos]*DrySend[c];
220 if(OutPos+pos == SamplesToDo)
222 for(c = 0;c < MaxChannels;c++)
223 PendingClicks[c] += data[pos]*DrySend[c];
226 #define NO_MIXDIRECT
229 #define SUFFIX SSE
230 #include "mixer_inc.c"
231 #undef SUFFIX