11 #include "alAuxEffectSlot.h"
12 #include "mixer_defs.h"
15 const ALfloat
*Resample_bsinc32_SSE(const BsincState
*state
, const ALfloat
*restrict src
,
16 ALuint frac
, ALuint increment
, ALfloat
*restrict dst
,
19 const __m128 sf4
= _mm_set1_ps(state
->sf
);
20 const ALuint m
= state
->m
;
21 const ALint l
= state
->l
;
22 const ALfloat
*fil
, *scd
, *phd
, *spd
;
28 for(i
= 0;i
< dstlen
;i
++)
30 // Calculate the phase index and factor.
31 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
32 pi
= frac
>> FRAC_PHASE_BITDIFF
;
33 pf
= (frac
& ((1<<FRAC_PHASE_BITDIFF
)-1)) * (1.0f
/(1<<FRAC_PHASE_BITDIFF
));
34 #undef FRAC_PHASE_BITDIFF
36 fil
= state
->coeffs
[pi
].filter
;
37 scd
= state
->coeffs
[pi
].scDelta
;
38 phd
= state
->coeffs
[pi
].phDelta
;
39 spd
= state
->coeffs
[pi
].spDelta
;
41 // Apply the scale and phase interpolated filter.
42 r4
= _mm_setzero_ps();
44 const __m128 pf4
= _mm_set1_ps(pf
);
45 for(j_f
= 0,j_s
= l
;j_f
< m
;j_f
+=4,j_s
+=4)
47 const __m128 f4
= _mm_add_ps(
49 _mm_load_ps(&fil
[j_f
]),
50 _mm_mul_ps(sf4
, _mm_load_ps(&scd
[j_f
]))
55 _mm_load_ps(&phd
[j_f
]),
56 _mm_mul_ps(sf4
, _mm_load_ps(&spd
[j_f
]))
60 r4
= _mm_add_ps(r4
, _mm_mul_ps(f4
, _mm_loadu_ps(&src
[j_s
])));
63 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
64 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
65 dst
[i
] = _mm_cvtss_f32(r4
);
68 src
+= frac
>>FRACTIONBITS
;
75 static inline void ApplyCoeffsStep(ALuint Offset
, ALfloat (*restrict Values
)[2],
77 ALfloat (*restrict Coeffs
)[2],
78 const ALfloat (*restrict CoeffStep
)[2],
79 ALfloat left
, ALfloat right
)
81 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
82 __m128 coeffs
, deltas
, imp0
, imp1
;
83 __m128 vals
= _mm_setzero_ps();
88 const ALuint o0
= Offset
&HRIR_MASK
;
89 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
91 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
92 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
93 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
94 imp0
= _mm_mul_ps(lrlr
, coeffs
);
95 coeffs
= _mm_add_ps(coeffs
, deltas
);
96 vals
= _mm_add_ps(imp0
, vals
);
97 _mm_store_ps(&Coeffs
[0][0], coeffs
);
98 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
99 for(i
= 1;i
< IrSize
-1;i
+= 2)
101 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
103 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
104 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
105 vals
= _mm_load_ps(&Values
[o2
][0]);
106 imp1
= _mm_mul_ps(lrlr
, coeffs
);
107 coeffs
= _mm_add_ps(coeffs
, deltas
);
108 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
109 vals
= _mm_add_ps(imp0
, vals
);
110 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
111 _mm_store_ps(&Values
[o2
][0], vals
);
114 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
115 imp0
= _mm_movehl_ps(imp0
, imp0
);
116 vals
= _mm_add_ps(imp0
, vals
);
117 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
121 for(i
= 0;i
< IrSize
;i
+= 2)
123 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
125 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
126 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
127 vals
= _mm_load_ps(&Values
[o
][0]);
128 imp0
= _mm_mul_ps(lrlr
, coeffs
);
129 coeffs
= _mm_add_ps(coeffs
, deltas
);
130 vals
= _mm_add_ps(imp0
, vals
);
131 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
132 _mm_store_ps(&Values
[o
][0], vals
);
137 static inline void ApplyCoeffs(ALuint Offset
, ALfloat (*restrict Values
)[2],
139 ALfloat (*restrict Coeffs
)[2],
140 ALfloat left
, ALfloat right
)
142 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
143 __m128 vals
= _mm_setzero_ps();
149 const ALuint o0
= Offset
&HRIR_MASK
;
150 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
153 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
154 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
155 imp0
= _mm_mul_ps(lrlr
, coeffs
);
156 vals
= _mm_add_ps(imp0
, vals
);
157 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
158 for(i
= 1;i
< IrSize
-1;i
+= 2)
160 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
162 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
163 vals
= _mm_load_ps(&Values
[o2
][0]);
164 imp1
= _mm_mul_ps(lrlr
, coeffs
);
165 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
166 vals
= _mm_add_ps(imp0
, vals
);
167 _mm_store_ps(&Values
[o2
][0], vals
);
170 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
171 imp0
= _mm_movehl_ps(imp0
, imp0
);
172 vals
= _mm_add_ps(imp0
, vals
);
173 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
177 for(i
= 0;i
< IrSize
;i
+= 2)
179 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
181 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
182 vals
= _mm_load_ps(&Values
[o
][0]);
183 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
184 _mm_store_ps(&Values
[o
][0], vals
);
189 #define MixHrtf MixHrtf_SSE
190 #define MixDirectHrtf MixDirectHrtf_SSE
191 #include "mixer_inc.c"
195 void Mix_SSE(const ALfloat
*data
, ALuint OutChans
, ALfloat (*restrict OutBuffer
)[BUFFERSIZE
],
196 ALfloat
*CurrentGains
, const ALfloat
*TargetGains
, ALuint Counter
, ALuint OutPos
,
199 ALfloat gain
, delta
, step
;
203 delta
= (Counter
> 0) ? 1.0f
/(ALfloat
)Counter
: 0.0f
;
205 for(c
= 0;c
< OutChans
;c
++)
208 gain
= CurrentGains
[c
];
209 step
= (TargetGains
[c
] - gain
) * delta
;
210 if(fabsf(step
) > FLT_EPSILON
)
212 ALuint minsize
= minu(BufferSize
, Counter
);
213 /* Mix with applying gain steps in aligned multiples of 4. */
221 gain
+ step
+ step
+ step
223 step4
= _mm_set1_ps(step
+ step
+ step
+ step
);
225 const __m128 val4
= _mm_load_ps(&data
[pos
]);
226 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
227 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
228 gain4
= _mm_add_ps(gain4
, step4
);
229 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
231 } while(minsize
-pos
> 3);
232 /* NOTE: gain4 now represents the next four gains after the
233 * last four mixed samples, so the lowest element represents
234 * the next gain to apply.
236 gain
= _mm_cvtss_f32(gain4
);
238 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
239 for(;pos
< minsize
;pos
++)
241 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
245 gain
= TargetGains
[c
];
246 CurrentGains
[c
] = gain
;
248 /* Mix until pos is aligned with 4 or the mix is done. */
249 minsize
= minu(BufferSize
, (pos
+3)&~3);
250 for(;pos
< minsize
;pos
++)
251 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
254 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
256 gain4
= _mm_set1_ps(gain
);
257 for(;BufferSize
-pos
> 3;pos
+= 4)
259 const __m128 val4
= _mm_load_ps(&data
[pos
]);
260 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
261 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
262 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
264 for(;pos
< BufferSize
;pos
++)
265 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
269 void MixRow_SSE(ALfloat
*OutBuffer
, const ALfloat
*Gains
, const ALfloat (*restrict data
)[BUFFERSIZE
], ALuint InChans
, ALuint InPos
, ALuint BufferSize
)
274 for(c
= 0;c
< InChans
;c
++)
277 ALfloat gain
= Gains
[c
];
278 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
281 gain4
= _mm_set1_ps(gain
);
282 for(;BufferSize
-pos
> 3;pos
+= 4)
284 const __m128 val4
= _mm_load_ps(&data
[c
][InPos
+pos
]);
285 __m128 dry4
= _mm_load_ps(&OutBuffer
[pos
]);
286 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
287 _mm_store_ps(&OutBuffer
[pos
], dry4
);
289 for(;pos
< BufferSize
;pos
++)
290 OutBuffer
[pos
] += data
[c
][InPos
+pos
]*gain
;