/* KDevelop's parser won't recognize these defines that get added by the -msse
 * switch used to compile this source. Without them, xmmintrin.h fails to
 * declare anything. */
10 #include <xmmintrin.h>
18 #include "alAuxEffectSlot.h"
19 #include "mixer_defs.h"
22 static inline void ApplyCoeffsStep(ALuint Offset
, ALfloat (*restrict Values
)[2],
24 ALfloat (*restrict Coeffs
)[2],
25 const ALfloat (*restrict CoeffStep
)[2],
26 ALfloat left
, ALfloat right
)
28 const __m128 lrlr
= _mm_set_ps(left
, right
, left
, right
);
29 __m128 coeffs
, deltas
, imp0
, imp1
;
30 __m128 vals
= _mm_setzero_ps();
35 const ALuint o0
= Offset
&HRIR_MASK
;
36 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
38 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
39 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
40 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
41 imp0
= _mm_mul_ps(lrlr
, coeffs
);
42 coeffs
= _mm_add_ps(coeffs
, deltas
);
43 vals
= _mm_add_ps(imp0
, vals
);
44 _mm_store_ps(&Coeffs
[0][0], coeffs
);
45 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
46 for(i
= 1;i
< IrSize
-1;i
+= 2)
48 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
50 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
51 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
52 vals
= _mm_load_ps(&Values
[o2
][0]);
53 imp1
= _mm_mul_ps(lrlr
, coeffs
);
54 coeffs
= _mm_add_ps(coeffs
, deltas
);
55 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
56 vals
= _mm_add_ps(imp0
, vals
);
57 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
58 _mm_store_ps(&Values
[o2
][0], vals
);
61 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
62 imp0
= _mm_movehl_ps(imp0
, imp0
);
63 vals
= _mm_add_ps(imp0
, vals
);
64 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
68 for(i
= 0;i
< IrSize
;i
+= 2)
70 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
72 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
73 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
74 vals
= _mm_load_ps(&Values
[o
][0]);
75 imp0
= _mm_mul_ps(lrlr
, coeffs
);
76 coeffs
= _mm_add_ps(coeffs
, deltas
);
77 vals
= _mm_add_ps(imp0
, vals
);
78 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
79 _mm_store_ps(&Values
[o
][0], vals
);
84 static inline void ApplyCoeffs(ALuint Offset
, ALfloat (*restrict Values
)[2],
86 ALfloat (*restrict Coeffs
)[2],
87 ALfloat left
, ALfloat right
)
89 const __m128 lrlr
= _mm_set_ps(left
, right
, left
, right
);
90 __m128 vals
= _mm_setzero_ps();
96 const ALuint o0
= Offset
&HRIR_MASK
;
97 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
100 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
101 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
102 imp0
= _mm_mul_ps(lrlr
, coeffs
);
103 vals
= _mm_add_ps(imp0
, vals
);
104 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
105 for(i
= 1;i
< IrSize
-1;i
+= 2)
107 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
109 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
110 vals
= _mm_load_ps(&Values
[o2
][0]);
111 imp1
= _mm_mul_ps(lrlr
, coeffs
);
112 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
113 vals
= _mm_add_ps(imp0
, vals
);
114 _mm_store_ps(&Values
[o2
][0], vals
);
117 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
118 imp0
= _mm_movehl_ps(imp0
, imp0
);
119 vals
= _mm_add_ps(imp0
, vals
);
120 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
124 for(i
= 0;i
< IrSize
;i
+= 2)
126 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
128 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
129 vals
= _mm_load_ps(&Values
[o
][0]);
130 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
131 _mm_store_ps(&Values
[o
][0], vals
);
137 #include "mixer_inc.c"
141 void MixDirect_SSE(DirectParams
*params
, const ALfloat
*restrict data
, ALuint srcchan
,
142 ALuint OutPos
, ALuint BufferSize
)
144 ALfloat (*restrict OutBuffer
)[BUFFERSIZE
] = params
->OutBuffer
;
145 ALuint Counter
= maxu(params
->Counter
, OutPos
) - OutPos
;
146 ALfloat DrySend
, Step
;
150 for(c
= 0;c
< MaxChannels
;c
++)
153 DrySend
= params
->Mix
.Gains
.Current
[srcchan
][c
];
154 Step
= params
->Mix
.Gains
.Step
[srcchan
][c
];
155 if(Step
!= 1.0f
&& Counter
> 0)
157 /* Mix with applying gain steps in aligned multiples of 4. */
158 if(BufferSize
-pos
> 3 && Counter
-pos
> 3)
163 DrySend
* Step
* Step
,
164 DrySend
* Step
* Step
* Step
166 step
= _mm_set1_ps(Step
* Step
* Step
* Step
);
168 const __m128 val4
= _mm_load_ps(&data
[pos
]);
169 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
170 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain
));
171 gain
= _mm_mul_ps(gain
, step
);
172 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
174 } while(BufferSize
-pos
> 3 && Counter
-pos
> 3);
175 DrySend
= _mm_cvtss_f32(_mm_shuffle_ps(gain
, gain
, _MM_SHUFFLE(3, 3, 3, 3)));
177 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
178 for(;pos
< BufferSize
&& pos
< Counter
;pos
++)
180 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*DrySend
;
184 DrySend
= params
->Mix
.Gains
.Target
[srcchan
][c
];
185 params
->Mix
.Gains
.Current
[srcchan
][c
] = DrySend
;
186 /* Mix until pos is aligned with 4 or the mix is done. */
187 for(;pos
< BufferSize
&& (pos
&3) != 0;pos
++)
188 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*DrySend
;
191 if(!(DrySend
> GAIN_SILENCE_THRESHOLD
))
193 gain
= _mm_set1_ps(DrySend
);
194 for(;BufferSize
-pos
> 3;pos
+= 4)
196 const __m128 val4
= _mm_load_ps(&data
[pos
]);
197 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
198 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain
));
199 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
201 for(;pos
< BufferSize
;pos
++)
202 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*DrySend
;
207 void MixSend_SSE(SendParams
*params
, const ALfloat
*restrict data
,
208 ALuint OutPos
, ALuint BufferSize
)
210 ALfloat (*restrict OutBuffer
)[BUFFERSIZE
] = params
->OutBuffer
;
211 ALuint Counter
= maxu(params
->Counter
, OutPos
) - OutPos
;
212 ALfloat WetGain
, Step
;
217 WetGain
= params
->Gain
.Current
;
218 Step
= params
->Gain
.Step
;
219 if(Step
!= 1.0f
&& Counter
> 0)
221 if(BufferSize
-pos
> 3 && Counter
-pos
> 3)
226 WetGain
* Step
* Step
,
227 WetGain
* Step
* Step
* Step
229 step
= _mm_set1_ps(Step
* Step
* Step
* Step
);
231 const __m128 val4
= _mm_load_ps(&data
[pos
]);
232 __m128 dry4
= _mm_load_ps(&OutBuffer
[0][OutPos
+pos
]);
233 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain
));
234 gain
= _mm_mul_ps(gain
, step
);
235 _mm_store_ps(&OutBuffer
[0][OutPos
+pos
], dry4
);
237 } while(BufferSize
-pos
> 3 && Counter
-pos
> 3);
238 WetGain
= _mm_cvtss_f32(_mm_shuffle_ps(gain
, gain
, _MM_SHUFFLE(3, 3, 3, 3)));
240 for(;pos
< BufferSize
&& pos
< Counter
;pos
++)
242 OutBuffer
[0][OutPos
+pos
] += data
[pos
]*WetGain
;
246 WetGain
= params
->Gain
.Target
;
247 params
->Gain
.Current
= WetGain
;
248 for(;pos
< BufferSize
&& (pos
&3) != 0;pos
++)
249 OutBuffer
[0][OutPos
+pos
] += data
[pos
]*WetGain
;
252 if(!(WetGain
> GAIN_SILENCE_THRESHOLD
))
254 gain
= _mm_set1_ps(WetGain
);
255 for(;BufferSize
-pos
> 3;pos
+= 4)
257 const __m128 val4
= _mm_load_ps(&data
[pos
]);
258 __m128 wet4
= _mm_load_ps(&OutBuffer
[0][OutPos
+pos
]);
259 wet4
= _mm_add_ps(wet4
, _mm_mul_ps(val4
, gain
));
260 _mm_store_ps(&OutBuffer
[0][OutPos
+pos
], wet4
);
262 for(;pos
< BufferSize
;pos
++)
263 OutBuffer
[0][OutPos
+pos
] += data
[pos
] * WetGain
;