11 #include "alAuxEffectSlot.h"
12 #include "mixer_defs.h"
15 const ALfloat
*Resample_bsinc32_SSE(const BsincState
*state
, const ALfloat
*src
, ALuint frac
,
16 ALuint increment
, ALfloat
*restrict dst
, ALuint dstlen
)
18 const __m128 sf4
= _mm_set1_ps(state
->sf
);
19 const ALuint m
= state
->m
;
20 const ALint l
= state
->l
;
21 const ALfloat
*fil
, *scd
, *phd
, *spd
;
27 for(i
= 0;i
< dstlen
;i
++)
29 // Calculate the phase index and factor.
30 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
31 pi
= frac
>> FRAC_PHASE_BITDIFF
;
32 pf
= (frac
& ((1<<FRAC_PHASE_BITDIFF
)-1)) * (1.0f
/(1<<FRAC_PHASE_BITDIFF
));
33 #undef FRAC_PHASE_BITDIFF
35 fil
= state
->coeffs
[pi
].filter
;
36 scd
= state
->coeffs
[pi
].scDelta
;
37 phd
= state
->coeffs
[pi
].phDelta
;
38 spd
= state
->coeffs
[pi
].spDelta
;
40 // Apply the scale and phase interpolated filter.
41 r4
= _mm_setzero_ps();
43 const __m128 pf4
= _mm_set1_ps(pf
);
44 for(j_f
= 0,j_s
= l
;j_f
< m
;j_f
+=4,j_s
+=4)
46 const __m128 f4
= _mm_add_ps(
48 _mm_load_ps(&fil
[j_f
]),
49 _mm_mul_ps(sf4
, _mm_load_ps(&scd
[j_f
]))
54 _mm_load_ps(&phd
[j_f
]),
55 _mm_mul_ps(sf4
, _mm_load_ps(&spd
[j_f
]))
59 r4
= _mm_add_ps(r4
, _mm_mul_ps(f4
, _mm_loadu_ps(&src
[j_s
])));
62 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
63 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
64 dst
[i
] = _mm_cvtss_f32(r4
);
67 src
+= frac
>>FRACTIONBITS
;
74 static inline void ApplyCoeffsStep(ALuint Offset
, ALfloat (*restrict Values
)[2],
76 ALfloat (*restrict Coeffs
)[2],
77 const ALfloat (*restrict CoeffStep
)[2],
78 ALfloat left
, ALfloat right
)
80 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
81 __m128 coeffs
, deltas
, imp0
, imp1
;
82 __m128 vals
= _mm_setzero_ps();
87 const ALuint o0
= Offset
&HRIR_MASK
;
88 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
90 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
91 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
92 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
93 imp0
= _mm_mul_ps(lrlr
, coeffs
);
94 coeffs
= _mm_add_ps(coeffs
, deltas
);
95 vals
= _mm_add_ps(imp0
, vals
);
96 _mm_store_ps(&Coeffs
[0][0], coeffs
);
97 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
98 for(i
= 1;i
< IrSize
-1;i
+= 2)
100 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
102 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
103 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
104 vals
= _mm_load_ps(&Values
[o2
][0]);
105 imp1
= _mm_mul_ps(lrlr
, coeffs
);
106 coeffs
= _mm_add_ps(coeffs
, deltas
);
107 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
108 vals
= _mm_add_ps(imp0
, vals
);
109 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
110 _mm_store_ps(&Values
[o2
][0], vals
);
113 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
114 imp0
= _mm_movehl_ps(imp0
, imp0
);
115 vals
= _mm_add_ps(imp0
, vals
);
116 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
120 for(i
= 0;i
< IrSize
;i
+= 2)
122 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
124 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
125 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
126 vals
= _mm_load_ps(&Values
[o
][0]);
127 imp0
= _mm_mul_ps(lrlr
, coeffs
);
128 coeffs
= _mm_add_ps(coeffs
, deltas
);
129 vals
= _mm_add_ps(imp0
, vals
);
130 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
131 _mm_store_ps(&Values
[o
][0], vals
);
136 static inline void ApplyCoeffs(ALuint Offset
, ALfloat (*restrict Values
)[2],
138 ALfloat (*restrict Coeffs
)[2],
139 ALfloat left
, ALfloat right
)
141 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
142 __m128 vals
= _mm_setzero_ps();
148 const ALuint o0
= Offset
&HRIR_MASK
;
149 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
152 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
153 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
154 imp0
= _mm_mul_ps(lrlr
, coeffs
);
155 vals
= _mm_add_ps(imp0
, vals
);
156 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
157 for(i
= 1;i
< IrSize
-1;i
+= 2)
159 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
161 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
162 vals
= _mm_load_ps(&Values
[o2
][0]);
163 imp1
= _mm_mul_ps(lrlr
, coeffs
);
164 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
165 vals
= _mm_add_ps(imp0
, vals
);
166 _mm_store_ps(&Values
[o2
][0], vals
);
169 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
170 imp0
= _mm_movehl_ps(imp0
, imp0
);
171 vals
= _mm_add_ps(imp0
, vals
);
172 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
176 for(i
= 0;i
< IrSize
;i
+= 2)
178 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
180 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
181 vals
= _mm_load_ps(&Values
[o
][0]);
182 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
183 _mm_store_ps(&Values
[o
][0], vals
);
/* Instantiate the shared HRTF mixer template for this translation unit,
 * renaming the functions with the _SSE suffix.  mixer_inc.c uses the
 * ApplyCoeffs/ApplyCoeffsStep helpers defined above. */
#define MixHrtf MixHrtf_SSE
#define MixDirectHrtf MixDirectHrtf_SSE
#include "mixer_inc.c"
194 void Mix_SSE(const ALfloat
*data
, ALuint OutChans
, ALfloat (*restrict OutBuffer
)[BUFFERSIZE
],
195 MixGains
*Gains
, ALuint Counter
, ALuint OutPos
, ALuint BufferSize
)
201 for(c
= 0;c
< OutChans
;c
++)
204 gain
= Gains
[c
].Current
;
205 step
= Gains
[c
].Step
;
206 if(step
!= 0.0f
&& Counter
> 0)
208 ALuint minsize
= minu(BufferSize
, Counter
);
209 /* Mix with applying gain steps in aligned multiples of 4. */
217 gain
+ step
+ step
+ step
219 step4
= _mm_set1_ps(step
+ step
+ step
+ step
);
221 const __m128 val4
= _mm_load_ps(&data
[pos
]);
222 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
223 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
224 gain4
= _mm_add_ps(gain4
, step4
);
225 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
227 } while(minsize
-pos
> 3);
228 /* NOTE: gain4 now represents the next four gains after the
229 * last four mixed samples, so the lowest element represents
230 * the next gain to apply.
232 gain
= _mm_cvtss_f32(gain4
);
234 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
235 for(;pos
< minsize
;pos
++)
237 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
241 gain
= Gains
[c
].Target
;
242 Gains
[c
].Current
= gain
;
244 /* Mix until pos is aligned with 4 or the mix is done. */
245 minsize
= minu(BufferSize
, (pos
+3)&~3);
246 for(;pos
< minsize
;pos
++)
247 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
250 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
252 gain4
= _mm_set1_ps(gain
);
253 for(;BufferSize
-pos
> 3;pos
+= 4)
255 const __m128 val4
= _mm_load_ps(&data
[pos
]);
256 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
257 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
258 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
260 for(;pos
< BufferSize
;pos
++)
261 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
265 void MixRow_SSE(ALfloat
*OutBuffer
, const ALfloat
*Gains
, const ALfloat (*restrict data
)[BUFFERSIZE
], ALuint InChans
, ALuint InPos
, ALuint BufferSize
)
270 for(c
= 0;c
< InChans
;c
++)
273 ALfloat gain
= Gains
[c
];
274 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
277 gain4
= _mm_set1_ps(gain
);
278 for(;BufferSize
-pos
> 3;pos
+= 4)
280 const __m128 val4
= _mm_load_ps(&data
[c
][InPos
+pos
]);
281 __m128 dry4
= _mm_load_ps(&OutBuffer
[pos
]);
282 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
283 _mm_store_ps(&OutBuffer
[pos
], dry4
);
285 for(;pos
< BufferSize
;pos
++)
286 OutBuffer
[pos
] += data
[c
][InPos
+pos
]*gain
;