11 #include "alAuxEffectSlot.h"
12 #include "mixer_defs.h"
/* Resample_bsinc32_SSE: SSE resampler that produces dstlen samples in dst.
 *
 * Parameters (as visible in the signature):
 *   state     - supplies sf, m, l and the coeffs[] table set used below.
 *   src       - input samples; read with unaligned loads starting at src[l].
 *   frac      - fixed-point position; its upper bits select the phase index
 *               and its lower bits give the phase factor.
 *   increment - NOTE(review): not referenced in any visible line; presumably
 *               added to frac once per output sample - confirm.
 *   dst       - output buffer, one float written per iteration.
 *   dstlen    - number of output samples to produce.
 *
 * For every output sample the visible code:
 *   1. derives pi (frac >> FRAC_PHASE_BITDIFF) and pf (the masked low bits
 *      scaled by 1/(1 << FRAC_PHASE_BITDIFF), i.e. a value in [0, 1)),
 *   2. picks the filter, scDelta, phDelta and spDelta tables of
 *      state->coeffs[pi],
 *   3. walks m coefficients four at a time, accumulating f4 * src into r4,
 *   4. reduces r4 to a single float and stores it in dst[i],
 *   5. advances src by frac >> FRACTIONBITS.
 *
 * NOTE(review): this listing is an incomplete extraction. Braces, the
 * declarations of i, pi, pf, j_f, j_s and r4, the part of the f4 expression
 * that uses pf4, and the return statement are not visible here.
 */
15 const ALfloat
*Resample_bsinc32_SSE(const BsincState
*state
, const ALfloat
*src
, ALuint frac
,
16 ALuint increment
, ALfloat
*restrict dst
, ALuint dstlen
)
/* Broadcast the scale factor so it can be applied to four taps at once. */
18 const __m128 sf4
= _mm_set1_ps(state
->sf
);
19 const ALuint m
= state
->m
;
20 const ALint l
= state
->l
;
21 const ALfloat
*fil
, *scd
, *phd
, *spd
;
27 for(i
= 0;i
< dstlen
;i
++)
29 // Calculate the phase index and factor.
30 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
31 pi
= frac
>> FRAC_PHASE_BITDIFF
;
32 pf
= (frac
& ((1<<FRAC_PHASE_BITDIFF
)-1)) * (1.0f
/(1<<FRAC_PHASE_BITDIFF
));
33 #undef FRAC_PHASE_BITDIFF
/* Select the four coefficient tables that belong to this phase index. */
35 fil
= state
->coeffs
[pi
].filter
;
36 scd
= state
->coeffs
[pi
].scDelta
;
37 phd
= state
->coeffs
[pi
].phDelta
;
38 spd
= state
->coeffs
[pi
].spDelta
;
40 // Apply the scale and phase interpolated filter.
41 r4
= _mm_setzero_ps();
43 const __m128 pf4
= _mm_set1_ps(pf
);
/* Four taps per iteration: j_f indexes the coefficient tables (loaded with
 * _mm_load_ps, which requires 16-byte alignment) and j_s indexes src,
 * starting at l (loaded with the unaligned _mm_loadu_ps). */
44 for(j_f
= 0,j_s
= l
;j_f
< m
;j_f
+=4,j_s
+=4)
46 const __m128 f4
= _mm_add_ps(
48 _mm_load_ps(&fil
[j_f
]),
49 _mm_mul_ps(sf4
, _mm_load_ps(&scd
[j_f
]))
54 _mm_load_ps(&phd
[j_f
]),
55 _mm_mul_ps(sf4
, _mm_load_ps(&spd
[j_f
]))
59 r4
= _mm_add_ps(r4
, _mm_mul_ps(f4
, _mm_loadu_ps(&src
[j_s
])));
/* Horizontal add: r4 plus its lane-reversed copy, then plus its own upper
 * half, leaves the sum of all four lanes in lane 0. */
62 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
63 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
64 dst
[i
] = _mm_cvtss_f32(r4
);
/* Step src forward by the whole-sample part of frac. */
67 src
+= frac
>>FRACTIONBITS
;
/* ApplyCoeffsStep: accumulates left/right scaled coefficient pairs into the
 * Values ring (indexed modulo HRIR_MASK) and advances every coefficient by
 * its CoeffStep delta, writing the stepped coefficients back to Coeffs.
 *
 * Parameters (as visible in the signature):
 *   Offset    - base index into Values; every access is masked with HRIR_MASK.
 *   Values    - accumulation buffer of [2]-float (L/R) pairs, updated in place.
 *   Coeffs    - coefficient pairs; read, incremented by CoeffStep, stored back.
 *   CoeffStep - per-coefficient increments; read only.
 *   left      - multiplier applied to element [0] of each pair.
 *   right     - multiplier applied to element [1] of each pair.
 *
 * Two code paths are visible:
 *   - one that touches Values[o0] and Values[o1] with 64-bit (single pair)
 *     loads/stores and the pairs in between with 128-bit accesses, carrying
 *     half of each product over to the next access via imp0/imp1;
 *   - one that processes two pairs per iteration with plain 128-bit
 *     load / multiply-add / store.
 *
 * NOTE(review): this listing is an incomplete extraction. The condition that
 * selects between the two paths, the declarations of IrSize and i, all braces
 * and possibly further statements inside the loops are not visible here.
 * Presumably the first path exists for Offset values where Values[Offset] is
 * not 16-byte aligned - confirm against the full source.
 */
74 static inline void ApplyCoeffsStep(ALuint Offset
, ALfloat (*restrict Values
)[2],
76 ALfloat (*restrict Coeffs
)[2],
77 const ALfloat (*restrict CoeffStep
)[2],
78 ALfloat left
, ALfloat right
)
/* { left, right, left, right }: one multiplier per lane of two L/R pairs. */
80 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
81 __m128 coeffs
, deltas
, imp0
, imp1
;
82 __m128 vals
= _mm_setzero_ps();
/* First path: o0 is the first pair touched, o1 the last (Offset+IrSize-1). */
87 const ALuint o0
= Offset
&HRIR_MASK
;
88 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
90 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
91 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
/* Only the low 64 bits (one L/R pair) of Values[o0] are loaded and stored. */
92 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
93 imp0
= _mm_mul_ps(lrlr
, coeffs
);
94 coeffs
= _mm_add_ps(coeffs
, deltas
);
95 vals
= _mm_add_ps(imp0
, vals
);
96 _mm_store_ps(&Coeffs
[0][0], coeffs
);
97 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
98 for(i
= 1;i
< IrSize
-1;i
+= 2)
100 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
102 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
103 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
104 vals
= _mm_load_ps(&Values
[o2
][0]);
105 imp1
= _mm_mul_ps(lrlr
, coeffs
);
106 coeffs
= _mm_add_ps(coeffs
, deltas
);
/* Result lanes: { imp0[2], imp0[3], imp1[0], imp1[1] }, i.e. the upper pair
 * of the earlier product followed by the lower pair of the new one. */
107 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
108 vals
= _mm_add_ps(imp0
, vals
);
109 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
110 _mm_store_ps(&Values
[o2
][0], vals
);
/* Last pair: move the upper half of imp0 into the low lanes and apply it to
 * Values[o1] with a 64-bit load/store. */
113 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
114 imp0
= _mm_movehl_ps(imp0
, imp0
);
115 vals
= _mm_add_ps(imp0
, vals
);
116 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Second path: two L/R pairs per iteration using full 128-bit accesses. */
120 for(i
= 0;i
< IrSize
;i
+= 2)
122 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
124 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
125 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
126 vals
= _mm_load_ps(&Values
[o
][0]);
127 imp0
= _mm_mul_ps(lrlr
, coeffs
);
128 coeffs
= _mm_add_ps(coeffs
, deltas
);
129 vals
= _mm_add_ps(imp0
, vals
);
130 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
131 _mm_store_ps(&Values
[o
][0], vals
);
/* ApplyCoeffs: accumulates left/right scaled coefficient pairs into the
 * Values ring (indexed modulo HRIR_MASK). Unlike ApplyCoeffsStep, no store
 * to Coeffs is visible: the coefficients are only read.
 *
 * Parameters (as visible in the signature):
 *   Offset - base index into Values; every access is masked with HRIR_MASK.
 *   Values - accumulation buffer of [2]-float (L/R) pairs, updated in place.
 *   Coeffs - coefficient pairs, loaded four floats (two pairs) at a time.
 *   left   - multiplier applied to element [0] of each pair.
 *   right  - multiplier applied to element [1] of each pair.
 *
 * Two code paths are visible:
 *   - one that touches Values[o0] and Values[o1] with 64-bit (single pair)
 *     loads/stores and the pairs in between with 128-bit accesses;
 *   - one that processes two pairs per iteration with plain 128-bit
 *     load / multiply-add / store.
 *
 * NOTE(review): this listing is an incomplete extraction. The condition that
 * selects between the two paths, the declarations of IrSize, i, coeffs, imp0
 * and imp1, all braces and possibly further statements inside the loops are
 * not visible here.
 */
136 static inline void ApplyCoeffs(ALuint Offset
, ALfloat (*restrict Values
)[2],
138 ALfloat (*restrict Coeffs
)[2],
139 ALfloat left
, ALfloat right
)
/* { left, right, left, right }: one multiplier per lane of two L/R pairs. */
141 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
142 __m128 vals
= _mm_setzero_ps();
/* First path: o0 is the first pair touched, o1 the last (Offset+IrSize-1). */
148 const ALuint o0
= Offset
&HRIR_MASK
;
149 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
152 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
/* Only the low 64 bits (one L/R pair) of Values[o0] are loaded and stored. */
153 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
154 imp0
= _mm_mul_ps(lrlr
, coeffs
);
155 vals
= _mm_add_ps(imp0
, vals
);
156 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
157 for(i
= 1;i
< IrSize
-1;i
+= 2)
159 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
161 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
162 vals
= _mm_load_ps(&Values
[o2
][0]);
163 imp1
= _mm_mul_ps(lrlr
, coeffs
);
/* Result lanes: { imp0[2], imp0[3], imp1[0], imp1[1] }, i.e. the upper pair
 * of the earlier product followed by the lower pair of the new one. */
164 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
165 vals
= _mm_add_ps(imp0
, vals
);
166 _mm_store_ps(&Values
[o2
][0], vals
);
/* Last pair: move the upper half of imp0 into the low lanes and apply it to
 * Values[o1] with a 64-bit load/store. */
169 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
170 imp0
= _mm_movehl_ps(imp0
, imp0
);
171 vals
= _mm_add_ps(imp0
, vals
);
172 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Second path: two L/R pairs per iteration using full 128-bit accesses. */
176 for(i
= 0;i
< IrSize
;i
+= 2)
178 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
180 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
181 vals
= _mm_load_ps(&Values
[o
][0]);
182 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
183 _mm_store_ps(&Values
[o
][0], vals
);
188 #define MixHrtf MixHrtf_SSE
189 #include "mixer_inc.c"
/* Mix_SSE: adds the samples in data, scaled by a per-channel gain, into
 * OutBuffer[c][OutPos + pos] for each of the OutChans channels.
 *
 * Parameters (as visible in the signature):
 *   data       - input samples; read four at a time with _mm_load_ps in the
 *                vector loops, so 16-byte alignment of data[pos] is required
 *                there.
 *   OutChans   - number of output channels to mix into.
 *   OutBuffer  - per-channel output rows of BUFFERSIZE floats, accumulated in
 *                place.
 *   Gains      - per-channel gain state; Current, Step and Target are read
 *                and Current is written back.
 *   Counter    - together with BufferSize bounds the stepped-gain section
 *                (minsize = minu(BufferSize, Counter)).
 *   OutPos     - offset added to pos when indexing the output row.
 *   BufferSize - number of samples to mix per channel.
 *
 * Visible flow per channel:
 *   1. If Step is non-zero and Counter > 0, mix up to minsize samples while
 *      the gain changes: a vector loop adds step4 (four steps) to gain4 per
 *      iteration, then a scalar loop handles the leftover samples. gain is
 *      then taken from Gains[c].Target and stored to Gains[c].Current, and a
 *      scalar loop mixes until pos is a multiple of four or the mix is done.
 *   2. The gain magnitude is tested against GAIN_SILENCE_THRESHOLD.
 *   3. The remaining samples are mixed with a constant gain: a vector loop
 *      for groups of four followed by a scalar tail.
 *
 * NOTE(review): this listing is an incomplete extraction. Braces, the
 * declarations of c, pos, gain, step, gain4 and step4, the construction of
 * gain4 for the stepped loop, the do-loop header and its pos update, any
 * per-sample gain update in the scalar stepped loop, any condition on the
 * Target assignment and the statement guarded by the silence test are not
 * visible here.
 */
193 void Mix_SSE(const ALfloat
*data
, ALuint OutChans
, ALfloat (*restrict OutBuffer
)[BUFFERSIZE
],
194 MixGains
*Gains
, ALuint Counter
, ALuint OutPos
, ALuint BufferSize
)
200 for(c
= 0;c
< OutChans
;c
++)
203 gain
= Gains
[c
].Current
;
204 step
= Gains
[c
].Step
;
205 if(step
!= 0.0f
&& Counter
> 0)
/* The stepped section never runs past the buffer or the step counter. */
207 ALuint minsize
= minu(BufferSize
, Counter
);
208 /* Mix with applying gain steps in aligned multiples of 4. */
216 gain
+ step
+ step
+ step
/* Each vector iteration advances all four lane gains by four steps. */
218 step4
= _mm_set1_ps(step
+ step
+ step
+ step
);
220 const __m128 val4
= _mm_load_ps(&data
[pos
]);
221 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
222 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
223 gain4
= _mm_add_ps(gain4
, step4
);
224 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
226 } while(minsize
-pos
> 3);
227 /* NOTE: gain4 now represents the next four gains after the
228 * last four mixed samples, so the lowest element represents
229 * the next gain to apply.
231 gain
= _mm_cvtss_f32(gain4
);
233 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
234 for(;pos
< minsize
;pos
++)
236 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
240 gain
= Gains
[c
].Target
;
241 Gains
[c
].Current
= gain
;
243 /* Mix until pos is aligned with 4 or the mix is done. */
244 minsize
= minu(BufferSize
, (pos
+3)&~3);
245 for(;pos
< minsize
;pos
++)
246 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* True when the gain magnitude is at or below the threshold; written as a
 * negated greater-than, so a NaN gain also takes this branch. */
249 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
/* Constant-gain section: broadcast gain and mix four samples at a time. */
251 gain4
= _mm_set1_ps(gain
);
252 for(;BufferSize
-pos
> 3;pos
+= 4)
254 const __m128 val4
= _mm_load_ps(&data
[pos
]);
255 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
256 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
257 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
/* Scalar tail for the last zero to three samples. */
259 for(;pos
< BufferSize
;pos
++)
260 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;