/*
 * OpenAL soft - Alc/mixer_sse.c
 * SSE-optimized sample mixing and bsinc resampling routines.
 */
1 #include "config.h"
3 #include <xmmintrin.h>
5 #include "AL/al.h"
6 #include "AL/alc.h"
7 #include "alMain.h"
8 #include "alu.h"
10 #include "alSource.h"
11 #include "alAuxEffectSlot.h"
12 #include "mixer_defs.h"
15 const ALfloat *Resample_bsinc32_SSE(const BsincState *state, const ALfloat *src, ALuint frac,
16 ALuint increment, ALfloat *restrict dst, ALuint dstlen)
18 const __m128 sf4 = _mm_set1_ps(state->sf);
19 const ALuint m = state->m;
20 const ALint l = state->l;
21 const ALfloat *fil, *scd, *phd, *spd;
22 ALuint pi, j_f, i;
23 ALfloat pf;
24 ALint j_s;
25 __m128 r4;
27 for(i = 0;i < dstlen;i++)
29 // Calculate the phase index and factor.
30 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
31 pi = frac >> FRAC_PHASE_BITDIFF;
32 pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF));
33 #undef FRAC_PHASE_BITDIFF
35 fil = state->coeffs[pi].filter;
36 scd = state->coeffs[pi].scDelta;
37 phd = state->coeffs[pi].phDelta;
38 spd = state->coeffs[pi].spDelta;
40 // Apply the scale and phase interpolated filter.
41 r4 = _mm_setzero_ps();
43 const __m128 pf4 = _mm_set1_ps(pf);
44 for(j_f = 0,j_s = l;j_f < m;j_f+=4,j_s+=4)
46 const __m128 f4 = _mm_add_ps(
47 _mm_add_ps(
48 _mm_load_ps(&fil[j_f]),
49 _mm_mul_ps(sf4, _mm_load_ps(&scd[j_f]))
51 _mm_mul_ps(
52 pf4,
53 _mm_add_ps(
54 _mm_load_ps(&phd[j_f]),
55 _mm_mul_ps(sf4, _mm_load_ps(&spd[j_f]))
59 r4 = _mm_add_ps(r4, _mm_mul_ps(f4, _mm_loadu_ps(&src[j_s])));
62 r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
63 r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
64 dst[i] = _mm_cvtss_f32(r4);
66 frac += increment;
67 src += frac>>FRACTIONBITS;
68 frac &= FRACTIONMASK;
70 return dst;
74 static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
75 const ALuint IrSize,
76 ALfloat (*restrict Coeffs)[2],
77 const ALfloat (*restrict CoeffStep)[2],
78 ALfloat left, ALfloat right)
80 const __m128 lrlr = _mm_setr_ps(left, right, left, right);
81 __m128 coeffs, deltas, imp0, imp1;
82 __m128 vals = _mm_setzero_ps();
83 ALuint i;
85 if((Offset&1))
87 const ALuint o0 = Offset&HRIR_MASK;
88 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
90 coeffs = _mm_load_ps(&Coeffs[0][0]);
91 deltas = _mm_load_ps(&CoeffStep[0][0]);
92 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
93 imp0 = _mm_mul_ps(lrlr, coeffs);
94 coeffs = _mm_add_ps(coeffs, deltas);
95 vals = _mm_add_ps(imp0, vals);
96 _mm_store_ps(&Coeffs[0][0], coeffs);
97 _mm_storel_pi((__m64*)&Values[o0][0], vals);
98 for(i = 1;i < IrSize-1;i += 2)
100 const ALuint o2 = (Offset+i)&HRIR_MASK;
102 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
103 deltas = _mm_load_ps(&CoeffStep[i+1][0]);
104 vals = _mm_load_ps(&Values[o2][0]);
105 imp1 = _mm_mul_ps(lrlr, coeffs);
106 coeffs = _mm_add_ps(coeffs, deltas);
107 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
108 vals = _mm_add_ps(imp0, vals);
109 _mm_store_ps(&Coeffs[i+1][0], coeffs);
110 _mm_store_ps(&Values[o2][0], vals);
111 imp0 = imp1;
113 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
114 imp0 = _mm_movehl_ps(imp0, imp0);
115 vals = _mm_add_ps(imp0, vals);
116 _mm_storel_pi((__m64*)&Values[o1][0], vals);
118 else
120 for(i = 0;i < IrSize;i += 2)
122 const ALuint o = (Offset + i)&HRIR_MASK;
124 coeffs = _mm_load_ps(&Coeffs[i][0]);
125 deltas = _mm_load_ps(&CoeffStep[i][0]);
126 vals = _mm_load_ps(&Values[o][0]);
127 imp0 = _mm_mul_ps(lrlr, coeffs);
128 coeffs = _mm_add_ps(coeffs, deltas);
129 vals = _mm_add_ps(imp0, vals);
130 _mm_store_ps(&Coeffs[i][0], coeffs);
131 _mm_store_ps(&Values[o][0], vals);
136 static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
137 const ALuint IrSize,
138 ALfloat (*restrict Coeffs)[2],
139 ALfloat left, ALfloat right)
141 const __m128 lrlr = _mm_setr_ps(left, right, left, right);
142 __m128 vals = _mm_setzero_ps();
143 __m128 coeffs;
144 ALuint i;
146 if((Offset&1))
148 const ALuint o0 = Offset&HRIR_MASK;
149 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
150 __m128 imp0, imp1;
152 coeffs = _mm_load_ps(&Coeffs[0][0]);
153 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
154 imp0 = _mm_mul_ps(lrlr, coeffs);
155 vals = _mm_add_ps(imp0, vals);
156 _mm_storel_pi((__m64*)&Values[o0][0], vals);
157 for(i = 1;i < IrSize-1;i += 2)
159 const ALuint o2 = (Offset+i)&HRIR_MASK;
161 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
162 vals = _mm_load_ps(&Values[o2][0]);
163 imp1 = _mm_mul_ps(lrlr, coeffs);
164 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
165 vals = _mm_add_ps(imp0, vals);
166 _mm_store_ps(&Values[o2][0], vals);
167 imp0 = imp1;
169 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
170 imp0 = _mm_movehl_ps(imp0, imp0);
171 vals = _mm_add_ps(imp0, vals);
172 _mm_storel_pi((__m64*)&Values[o1][0], vals);
174 else
176 for(i = 0;i < IrSize;i += 2)
178 const ALuint o = (Offset + i)&HRIR_MASK;
180 coeffs = _mm_load_ps(&Coeffs[i][0]);
181 vals = _mm_load_ps(&Values[o][0]);
182 vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
183 _mm_store_ps(&Values[o][0], vals);
188 #define MixHrtf MixHrtf_SSE
189 #define MixDirectHrtf MixDirectHrtf_SSE
190 #include "mixer_inc.c"
191 #undef MixHrtf
194 void Mix_SSE(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
195 MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize)
197 ALfloat gain, step;
198 __m128 gain4;
199 ALuint c;
201 for(c = 0;c < OutChans;c++)
203 ALuint pos = 0;
204 gain = Gains[c].Current;
205 step = Gains[c].Step;
206 if(step != 0.0f && Counter > 0)
208 ALuint minsize = minu(BufferSize, Counter);
209 /* Mix with applying gain steps in aligned multiples of 4. */
210 if(minsize-pos > 3)
212 __m128 step4;
213 gain4 = _mm_setr_ps(
214 gain,
215 gain + step,
216 gain + step + step,
217 gain + step + step + step
219 step4 = _mm_set1_ps(step + step + step + step);
220 do {
221 const __m128 val4 = _mm_load_ps(&data[pos]);
222 __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
223 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
224 gain4 = _mm_add_ps(gain4, step4);
225 _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
226 pos += 4;
227 } while(minsize-pos > 3);
228 /* NOTE: gain4 now represents the next four gains after the
229 * last four mixed samples, so the lowest element represents
230 * the next gain to apply.
232 gain = _mm_cvtss_f32(gain4);
234 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
235 for(;pos < minsize;pos++)
237 OutBuffer[c][OutPos+pos] += data[pos]*gain;
238 gain += step;
240 if(pos == Counter)
241 gain = Gains[c].Target;
242 Gains[c].Current = gain;
244 /* Mix until pos is aligned with 4 or the mix is done. */
245 minsize = minu(BufferSize, (pos+3)&~3);
246 for(;pos < minsize;pos++)
247 OutBuffer[c][OutPos+pos] += data[pos]*gain;
250 if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
251 continue;
252 gain4 = _mm_set1_ps(gain);
253 for(;BufferSize-pos > 3;pos += 4)
255 const __m128 val4 = _mm_load_ps(&data[pos]);
256 __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
257 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
258 _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
260 for(;pos < BufferSize;pos++)
261 OutBuffer[c][OutPos+pos] += data[pos]*gain;
265 void MixRow_SSE(ALfloat *OutBuffer, const ALfloat *Gains, const ALfloat (*restrict data)[BUFFERSIZE], ALuint InChans, ALuint InPos, ALuint BufferSize)
267 __m128 gain4;
268 ALuint c;
270 for(c = 0;c < InChans;c++)
272 ALuint pos = 0;
273 ALfloat gain = Gains[c];
274 if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
275 continue;
277 gain4 = _mm_set1_ps(gain);
278 for(;BufferSize-pos > 3;pos += 4)
280 const __m128 val4 = _mm_load_ps(&data[c][InPos+pos]);
281 __m128 dry4 = _mm_load_ps(&OutBuffer[pos]);
282 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
283 _mm_store_ps(&OutBuffer[pos], dry4);
285 for(;pos < BufferSize;pos++)
286 OutBuffer[pos] += data[c][InPos+pos]*gain;