11 #include "alAuxEffectSlot.h"
12 #include "mixer_defs.h"
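
/* SSE implementations of the band-limited sinc resampler, the HRTF
 * coefficient setup and convolution helpers, and the plain gain-stepping
 * mixer. */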

const ALfloat *Resample_bsinc32_SSE(const BsincState *state, const ALfloat *src, ALuint frac,
                                    ALuint increment, ALfloat *restrict dst, ALuint dstlen)
{
    const __m128 sf4 = _mm_set1_ps(state->sf);
    const ALuint m = state->m;
    const ALint l = state->l;
    const ALfloat *fil, *scd, *phd, *spd;
    ALuint pi, i, j_f;
    ALint j_s;
    ALfloat pf;
    __m128 r4;

    for(i = 0;i < dstlen;i++)
    {
        // Calculate the phase index and factor.
#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
        pi = frac >> FRAC_PHASE_BITDIFF;
        pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF));
#undef FRAC_PHASE_BITDIFF
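        /* For illustration (assumed values, not taken from the headers): with
         * FRACTIONBITS == 12 and BSINC_PHASE_BITS == 4, FRAC_PHASE_BITDIFF is
         * 8, so frac == 0x6a3 gives pi == 6 (the coefficient table row) and
         * pf == 0xa3/256.0f (the fractional position toward the next row). */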

        fil = state->coeffs[pi].filter;
        scd = state->coeffs[pi].scDelta;
        phd = state->coeffs[pi].phDelta;
        spd = state->coeffs[pi].spDelta;
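
        /* The filter applied below is interpolated from the coefficient table
         * as
         *   f[j] = filter[j] + sf*scDelta[j] + pf*(phDelta[j] + sf*spDelta[j])
         * blending across the scale (sf) and phase (pf) dimensions. */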
        // Apply the scale and phase interpolated filter.
        r4 = _mm_setzero_ps();
        {
            const __m128 pf4 = _mm_set1_ps(pf);
            for(j_f = 0,j_s = l;j_f < m;j_f+=4,j_s+=4)
            {
                const __m128 f4 = _mm_add_ps(
                    _mm_add_ps(
                        _mm_load_ps(&fil[j_f]),
                        _mm_mul_ps(sf4, _mm_load_ps(&scd[j_f]))
                    ),
                    _mm_mul_ps(pf4, _mm_add_ps(
                        _mm_load_ps(&phd[j_f]),
                        _mm_mul_ps(sf4, _mm_load_ps(&spd[j_f]))
                    ))
                );

                r4 = _mm_add_ps(r4, _mm_mul_ps(f4, _mm_loadu_ps(&src[j_s])));
            }
        }
        // Sum the lanes: reverse-add, then fold the high pair onto the low.
        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
        dst[i] = _mm_cvtss_f32(r4);

        frac += increment;
        src  += frac>>FRACTIONBITS;
        frac &= FRACTIONMASK;
    }
    return dst;
}
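

/* Computes the starting HRIR coefficients by stepping the stored target
 * coefficients back by Counter increments of CoeffStep, so that applying the
 * step over the next Counter samples converges on the stored values. */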
static inline void SetupCoeffs(ALfloat (*restrict OutCoeffs)[2],
                               const HrtfParams *hrtfparams,
                               ALuint IrSize, ALuint Counter)
{
    const __m128 counter4 = _mm_set1_ps((float)Counter);
    __m128 coeffs, step4;
    ALuint i;

    /* Two coefficient pairs (four floats) per iteration. */
    for(i = 0;i < IrSize;i += 2)
    {
        step4 = _mm_load_ps(&hrtfparams->CoeffStep[i][0]);
        coeffs = _mm_load_ps(&hrtfparams->Coeffs[i][0]);
        coeffs = _mm_sub_ps(coeffs, _mm_mul_ps(step4, counter4));
        _mm_store_ps(&OutCoeffs[i][0], coeffs);
    }
}
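

/* Accumulates one left/right input sample through the HRIR into the Values
 * buffer, advancing each coefficient pair by its CoeffStep as it goes. An odd
 * Offset leaves the stereo pairs straddling 16-byte boundaries, so that case
 * is handled separately from the aligned (even Offset) case. */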
static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
                                   const ALuint IrSize,
                                   ALfloat (*restrict Coeffs)[2],
                                   const ALfloat (*restrict CoeffStep)[2],
                                   ALfloat left, ALfloat right)
{
    const __m128 lrlr = _mm_setr_ps(left, right, left, right);
    __m128 coeffs, deltas, imp0, imp1;
    __m128 vals = _mm_setzero_ps();
    ALuint i;

    if((Offset&1))
    {
        const ALuint o0 = Offset&HRIR_MASK;
        const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
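
        /* With an odd Offset, the pair at o0 (and o1) is only 8-byte aligned,
         * so it is handled with 64-bit loads/stores. Each aligned store in the
         * loop then covers the high half of one impulse and the low half of
         * the next; the shuffle below builds that combination. */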
        coeffs = _mm_load_ps(&Coeffs[0][0]);
        deltas = _mm_load_ps(&CoeffStep[0][0]);
        vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
        imp0 = _mm_mul_ps(lrlr, coeffs);
        coeffs = _mm_add_ps(coeffs, deltas);
        vals = _mm_add_ps(imp0, vals);
        _mm_store_ps(&Coeffs[0][0], coeffs);
        _mm_storel_pi((__m64*)&Values[o0][0], vals);
        for(i = 1;i < IrSize-1;i += 2)
        {
            const ALuint o2 = (Offset+i)&HRIR_MASK;

            coeffs = _mm_load_ps(&Coeffs[i+1][0]);
            deltas = _mm_load_ps(&CoeffStep[i+1][0]);
            vals = _mm_load_ps(&Values[o2][0]);
            imp1 = _mm_mul_ps(lrlr, coeffs);
            coeffs = _mm_add_ps(coeffs, deltas);
            imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
            vals = _mm_add_ps(imp0, vals);
            _mm_store_ps(&Coeffs[i+1][0], coeffs);
            _mm_store_ps(&Values[o2][0], vals);
            imp0 = imp1;
        }
        vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
        imp0 = _mm_movehl_ps(imp0, imp0);
        vals = _mm_add_ps(imp0, vals);
        _mm_storel_pi((__m64*)&Values[o1][0], vals);
    }
    else
    {
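        /* Even Offset: every left/right pair is 16-byte aligned, so two HRIR
         * taps can be mixed per iteration with aligned loads and stores. */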
        for(i = 0;i < IrSize;i += 2)
        {
            const ALuint o = (Offset + i)&HRIR_MASK;

            coeffs = _mm_load_ps(&Coeffs[i][0]);
            deltas = _mm_load_ps(&CoeffStep[i][0]);
            vals = _mm_load_ps(&Values[o][0]);
            imp0 = _mm_mul_ps(lrlr, coeffs);
            coeffs = _mm_add_ps(coeffs, deltas);
            vals = _mm_add_ps(imp0, vals);
            _mm_store_ps(&Coeffs[i][0], coeffs);
            _mm_store_ps(&Values[o][0], vals);
        }
    }
}
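

/* Same as ApplyCoeffsStep, but for when the coefficients have already reached
 * their targets, so no CoeffStep is applied. */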
static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
                               const ALuint IrSize,
                               ALfloat (*restrict Coeffs)[2],
                               ALfloat left, ALfloat right)
{
    const __m128 lrlr = _mm_setr_ps(left, right, left, right);
    __m128 vals = _mm_setzero_ps();
    __m128 coeffs, imp0, imp1;
    ALuint i;

    if((Offset&1))
    {
        const ALuint o0 = Offset&HRIR_MASK;
        const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;

        coeffs = _mm_load_ps(&Coeffs[0][0]);
        vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
        imp0 = _mm_mul_ps(lrlr, coeffs);
        vals = _mm_add_ps(imp0, vals);
        _mm_storel_pi((__m64*)&Values[o0][0], vals);
        for(i = 1;i < IrSize-1;i += 2)
        {
            const ALuint o2 = (Offset+i)&HRIR_MASK;

            coeffs = _mm_load_ps(&Coeffs[i+1][0]);
            vals = _mm_load_ps(&Values[o2][0]);
            imp1 = _mm_mul_ps(lrlr, coeffs);
            imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
            vals = _mm_add_ps(imp0, vals);
            _mm_store_ps(&Values[o2][0], vals);
            imp0 = imp1;
        }
        vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
        imp0 = _mm_movehl_ps(imp0, imp0);
        vals = _mm_add_ps(imp0, vals);
        _mm_storel_pi((__m64*)&Values[o1][0], vals);
    }
    else
    {
        for(i = 0;i < IrSize;i += 2)
        {
            const ALuint o = (Offset + i)&HRIR_MASK;

            coeffs = _mm_load_ps(&Coeffs[i][0]);
            vals = _mm_load_ps(&Values[o][0]);
            vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
            _mm_store_ps(&Values[o][0], vals);
        }
    }
}

#define MixHrtf MixHrtf_SSE
#include "mixer_inc.c"
#undef MixHrtf
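

/* Adds the mono input to each output channel, stepping that channel's gain
 * from Current toward Target over the first Counter samples, then mixing the
 * remainder at the settled gain. Channels whose settled gain is below the
 * silence threshold are skipped. */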
void Mix_SSE(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
             MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize)
{
    ALfloat gain, step;
    __m128 gain4, step4;
    ALuint c;

    for(c = 0;c < OutChans;c++)
    {
        ALuint pos = 0;
        gain = Gains[c].Current;
        step = Gains[c].Step;
        if(step != 0.0f && Counter > 0)
        {
            ALuint minsize = minu(BufferSize, Counter);
            /* Mix with applying gain steps in aligned multiples of 4. */
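            /* gain4 holds the gains for four consecutive samples,
             * {gain, gain+step, gain+2*step, gain+3*step}, and step4 advances
             * all four lanes by 4*step per iteration. */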
            if(minsize-pos > 3)
            {
                gain4 = _mm_setr_ps(
                    gain,
                    gain + step,
                    gain + step + step,
                    gain + step + step + step
                );
                step4 = _mm_set1_ps(step + step + step + step);
                do {
                    const __m128 val4 = _mm_load_ps(&data[pos]);
                    __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
                    dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
                    gain4 = _mm_add_ps(gain4, step4);
                    _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
                    pos += 4;
                } while(minsize-pos > 3);
                /* NOTE: gain4 now represents the next four gains after the
                 * last four mixed samples, so the lowest element represents
                 * the next gain to apply.
                 */
                gain = _mm_cvtss_f32(gain4);
            }
            /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
            for(;pos < minsize;pos++)
            {
                OutBuffer[c][OutPos+pos] += data[pos]*gain;
                gain += step;
            }
            if(pos == Counter)
                gain = Gains[c].Target;
            Gains[c].Current = gain;

            /* Mix until pos is aligned with 4 or the mix is done. */
            minsize = minu(BufferSize, (pos+3)&~3);
            for(;pos < minsize;pos++)
                OutBuffer[c][OutPos+pos] += data[pos]*gain;
        }

        /* The negated comparison also treats a NaN gain as silence. */
        if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
            continue;
        gain4 = _mm_set1_ps(gain);
        for(;BufferSize-pos > 3;pos += 4)
        {
            const __m128 val4 = _mm_load_ps(&data[pos]);
            __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
            dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
            _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
        }
        for(;pos < BufferSize;pos++)
            OutBuffer[c][OutPos+pos] += data[pos]*gain;
    }
}