Use 2-channel UHJ for stereo output
[openal-soft.git] / Alc / mixer_sse.c
blob942e0453c6497a2c2e27af802985cbca35742b60
1 #include "config.h"
3 #include <xmmintrin.h>
5 #include "AL/al.h"
6 #include "AL/alc.h"
7 #include "alMain.h"
8 #include "alu.h"
10 #include "alSource.h"
11 #include "alAuxEffectSlot.h"
12 #include "mixer_defs.h"
15 const ALfloat *Resample_bsinc32_SSE(const BsincState *state, const ALfloat *src, ALuint frac,
16 ALuint increment, ALfloat *restrict dst, ALuint dstlen)
18 const __m128 sf4 = _mm_set1_ps(state->sf);
19 const ALuint m = state->m;
20 const ALint l = state->l;
21 const ALfloat *fil, *scd, *phd, *spd;
22 ALuint pi, j_f, i;
23 ALfloat pf;
24 ALint j_s;
25 __m128 r4;
27 for(i = 0;i < dstlen;i++)
29 // Calculate the phase index and factor.
30 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
31 pi = frac >> FRAC_PHASE_BITDIFF;
32 pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF));
33 #undef FRAC_PHASE_BITDIFF
35 fil = state->coeffs[pi].filter;
36 scd = state->coeffs[pi].scDelta;
37 phd = state->coeffs[pi].phDelta;
38 spd = state->coeffs[pi].spDelta;
40 // Apply the scale and phase interpolated filter.
41 r4 = _mm_setzero_ps();
43 const __m128 pf4 = _mm_set1_ps(pf);
44 for(j_f = 0,j_s = l;j_f < m;j_f+=4,j_s+=4)
46 const __m128 f4 = _mm_add_ps(
47 _mm_add_ps(
48 _mm_load_ps(&fil[j_f]),
49 _mm_mul_ps(sf4, _mm_load_ps(&scd[j_f]))
51 _mm_mul_ps(
52 pf4,
53 _mm_add_ps(
54 _mm_load_ps(&phd[j_f]),
55 _mm_mul_ps(sf4, _mm_load_ps(&spd[j_f]))
59 r4 = _mm_add_ps(r4, _mm_mul_ps(f4, _mm_loadu_ps(&src[j_s])));
62 r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
63 r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
64 dst[i] = _mm_cvtss_f32(r4);
66 frac += increment;
67 src += frac>>FRACTIONBITS;
68 frac &= FRACTIONMASK;
70 return dst;
74 static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
75 const ALuint IrSize,
76 ALfloat (*restrict Coeffs)[2],
77 const ALfloat (*restrict CoeffStep)[2],
78 ALfloat left, ALfloat right)
80 const __m128 lrlr = _mm_setr_ps(left, right, left, right);
81 __m128 coeffs, deltas, imp0, imp1;
82 __m128 vals = _mm_setzero_ps();
83 ALuint i;
85 if((Offset&1))
87 const ALuint o0 = Offset&HRIR_MASK;
88 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
90 coeffs = _mm_load_ps(&Coeffs[0][0]);
91 deltas = _mm_load_ps(&CoeffStep[0][0]);
92 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
93 imp0 = _mm_mul_ps(lrlr, coeffs);
94 coeffs = _mm_add_ps(coeffs, deltas);
95 vals = _mm_add_ps(imp0, vals);
96 _mm_store_ps(&Coeffs[0][0], coeffs);
97 _mm_storel_pi((__m64*)&Values[o0][0], vals);
98 for(i = 1;i < IrSize-1;i += 2)
100 const ALuint o2 = (Offset+i)&HRIR_MASK;
102 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
103 deltas = _mm_load_ps(&CoeffStep[i+1][0]);
104 vals = _mm_load_ps(&Values[o2][0]);
105 imp1 = _mm_mul_ps(lrlr, coeffs);
106 coeffs = _mm_add_ps(coeffs, deltas);
107 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
108 vals = _mm_add_ps(imp0, vals);
109 _mm_store_ps(&Coeffs[i+1][0], coeffs);
110 _mm_store_ps(&Values[o2][0], vals);
111 imp0 = imp1;
113 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
114 imp0 = _mm_movehl_ps(imp0, imp0);
115 vals = _mm_add_ps(imp0, vals);
116 _mm_storel_pi((__m64*)&Values[o1][0], vals);
118 else
120 for(i = 0;i < IrSize;i += 2)
122 const ALuint o = (Offset + i)&HRIR_MASK;
124 coeffs = _mm_load_ps(&Coeffs[i][0]);
125 deltas = _mm_load_ps(&CoeffStep[i][0]);
126 vals = _mm_load_ps(&Values[o][0]);
127 imp0 = _mm_mul_ps(lrlr, coeffs);
128 coeffs = _mm_add_ps(coeffs, deltas);
129 vals = _mm_add_ps(imp0, vals);
130 _mm_store_ps(&Coeffs[i][0], coeffs);
131 _mm_store_ps(&Values[o][0], vals);
136 static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
137 const ALuint IrSize,
138 ALfloat (*restrict Coeffs)[2],
139 ALfloat left, ALfloat right)
141 const __m128 lrlr = _mm_setr_ps(left, right, left, right);
142 __m128 vals = _mm_setzero_ps();
143 __m128 coeffs;
144 ALuint i;
146 if((Offset&1))
148 const ALuint o0 = Offset&HRIR_MASK;
149 const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
150 __m128 imp0, imp1;
152 coeffs = _mm_load_ps(&Coeffs[0][0]);
153 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
154 imp0 = _mm_mul_ps(lrlr, coeffs);
155 vals = _mm_add_ps(imp0, vals);
156 _mm_storel_pi((__m64*)&Values[o0][0], vals);
157 for(i = 1;i < IrSize-1;i += 2)
159 const ALuint o2 = (Offset+i)&HRIR_MASK;
161 coeffs = _mm_load_ps(&Coeffs[i+1][0]);
162 vals = _mm_load_ps(&Values[o2][0]);
163 imp1 = _mm_mul_ps(lrlr, coeffs);
164 imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
165 vals = _mm_add_ps(imp0, vals);
166 _mm_store_ps(&Values[o2][0], vals);
167 imp0 = imp1;
169 vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
170 imp0 = _mm_movehl_ps(imp0, imp0);
171 vals = _mm_add_ps(imp0, vals);
172 _mm_storel_pi((__m64*)&Values[o1][0], vals);
174 else
176 for(i = 0;i < IrSize;i += 2)
178 const ALuint o = (Offset + i)&HRIR_MASK;
180 coeffs = _mm_load_ps(&Coeffs[i][0]);
181 vals = _mm_load_ps(&Values[o][0]);
182 vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
183 _mm_store_ps(&Values[o][0], vals);
188 #define MixHrtf MixHrtf_SSE
189 #include "mixer_inc.c"
190 #undef MixHrtf
193 void Mix_SSE(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
194 MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize)
196 ALfloat gain, step;
197 __m128 gain4;
198 ALuint c;
200 for(c = 0;c < OutChans;c++)
202 ALuint pos = 0;
203 gain = Gains[c].Current;
204 step = Gains[c].Step;
205 if(step != 0.0f && Counter > 0)
207 ALuint minsize = minu(BufferSize, Counter);
208 /* Mix with applying gain steps in aligned multiples of 4. */
209 if(minsize-pos > 3)
211 __m128 step4;
212 gain4 = _mm_setr_ps(
213 gain,
214 gain + step,
215 gain + step + step,
216 gain + step + step + step
218 step4 = _mm_set1_ps(step + step + step + step);
219 do {
220 const __m128 val4 = _mm_load_ps(&data[pos]);
221 __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
222 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
223 gain4 = _mm_add_ps(gain4, step4);
224 _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
225 pos += 4;
226 } while(minsize-pos > 3);
227 /* NOTE: gain4 now represents the next four gains after the
228 * last four mixed samples, so the lowest element represents
229 * the next gain to apply.
231 gain = _mm_cvtss_f32(gain4);
233 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
234 for(;pos < minsize;pos++)
236 OutBuffer[c][OutPos+pos] += data[pos]*gain;
237 gain += step;
239 if(pos == Counter)
240 gain = Gains[c].Target;
241 Gains[c].Current = gain;
243 /* Mix until pos is aligned with 4 or the mix is done. */
244 minsize = minu(BufferSize, (pos+3)&~3);
245 for(;pos < minsize;pos++)
246 OutBuffer[c][OutPos+pos] += data[pos]*gain;
249 if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
250 continue;
251 gain4 = _mm_set1_ps(gain);
252 for(;BufferSize-pos > 3;pos += 4)
254 const __m128 val4 = _mm_load_ps(&data[pos]);
255 __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
256 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
257 _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
259 for(;pos < BufferSize;pos++)
260 OutBuffer[c][OutPos+pos] += data[pos]*gain;