4 /* KDevelop's parser won't recognize these defines that get added by the -msse
5 * switch used to compile this source. Without them, xmmintrin.h fails to
10 #include <xmmintrin.h>
18 #include "alAuxEffectSlot.h"
19 #include "mixer_defs.h"
/* ApplyCoeffsStep: adds left/right-scaled coefficient pairs into Values and
 * advances every coefficient it touches by the matching CoeffStep entry.
 *
 * Offset    - base index into Values; every index derived from it is wrapped
 *             with HRIR_MASK before use.
 * Values    - array of float pairs that is read, accumulated into, and
 *             written back.
 * Coeffs    - array of float pairs; read, incremented by CoeffStep, and
 *             stored back in place.
 * CoeffStep - per-coefficient increments (read only).
 * left      - multiplier applied to element [0] of each coefficient pair.
 * right     - multiplier applied to element [1] of each coefficient pair.
 *
 * NOTE(review): this listing is incomplete. Braces, the declaration of i, the
 * origin of IrSize, and whatever selects between the two code paths below are
 * not visible here -- confirm against the complete source before relying on
 * these notes.
 */
22 static inline void ApplyCoeffsStep(ALuint Offset
, ALfloat (*restrict Values
)[2],
24 ALfloat (*restrict Coeffs
)[2],
25 const ALfloat (*restrict CoeffStep
)[2],
26 ALfloat left
, ALfloat right
)
/* Lanes are {left, right, left, right}, so a single multiply scales two
 * consecutive [0]/[1] coefficient pairs at once. */
28 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
29 __m128 coeffs
, deltas
, imp0
, imp1
;
30 __m128 vals
= _mm_setzero_ps();
/* First visible path: the first and last Values entries (o0 and o1) are
 * updated as single pairs, with the entries in between handled two pairs at
 * a time.
 * NOTE(review): _mm_load_ps/_mm_store_ps need 16-byte aligned addresses, so
 * this path is presumably chosen for Offset values where &Values[o0] is not
 * aligned but &Values[o0+1] is -- the selecting condition is not visible. */
35 const ALuint o0
= Offset
&HRIR_MASK
;
36 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
38 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
39 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
/* Only the low two lanes are loaded from (and later stored to) Values[o0];
 * the upper two lanes of imp0 are consumed by the shuffle further down. */
40 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
41 imp0
= _mm_mul_ps(lrlr
, coeffs
);
42 coeffs
= _mm_add_ps(coeffs
, deltas
);
43 vals
= _mm_add_ps(imp0
, vals
);
44 _mm_store_ps(&Coeffs
[0][0], coeffs
);
45 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
46 for(i
= 1;i
< IrSize
-1;i
+= 2)
48 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
50 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
51 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
52 vals
= _mm_load_ps(&Values
[o2
][0]);
53 imp1
= _mm_mul_ps(lrlr
, coeffs
);
54 coeffs
= _mm_add_ps(coeffs
, deltas
);
/* _MM_SHUFFLE(1, 0, 3, 2) yields {imp0[2], imp0[3], imp1[0], imp1[1]}: the
 * upper pair of imp0 followed by the lower pair of imp1. */
55 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
56 vals
= _mm_add_ps(imp0
, vals
);
57 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
58 _mm_store_ps(&Values
[o2
][0], vals
);
/* Tail of the first path: move the upper pair of imp0 into the low lanes and
 * add it to the single pair at Values[o1].
 * NOTE(review): any update of imp0 at the end of the loop body above is not
 * visible in this listing. */
61 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
62 imp0
= _mm_movehl_ps(imp0
, imp0
);
63 vals
= _mm_add_ps(imp0
, vals
);
64 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Second visible path: two pairs per iteration, using full four-lane loads
 * and stores on Coeffs, CoeffStep and Values. */
68 for(i
= 0;i
< IrSize
;i
+= 2)
70 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
72 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
73 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
74 vals
= _mm_load_ps(&Values
[o
][0]);
75 imp0
= _mm_mul_ps(lrlr
, coeffs
);
76 coeffs
= _mm_add_ps(coeffs
, deltas
);
77 vals
= _mm_add_ps(imp0
, vals
);
78 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
79 _mm_store_ps(&Values
[o
][0], vals
);
/* ApplyCoeffs: adds left/right-scaled coefficient pairs into Values. Coeffs
 * is only read here; no visible statement stores to it.
 *
 * Offset - base index into Values; every index derived from it is wrapped
 *          with HRIR_MASK before use.
 * Values - array of float pairs that is read, accumulated into, and written
 *          back.
 * Coeffs - array of float pairs supplying the coefficients.
 * left   - multiplier applied to element [0] of each coefficient pair.
 * right  - multiplier applied to element [1] of each coefficient pair.
 *
 * NOTE(review): this listing is incomplete. Braces, the declarations of
 * coeffs, imp0, imp1 and i, the origin of IrSize, and whatever selects
 * between the two code paths below are not visible here -- confirm against
 * the complete source.
 */
84 static inline void ApplyCoeffs(ALuint Offset
, ALfloat (*restrict Values
)[2],
86 ALfloat (*restrict Coeffs
)[2],
87 ALfloat left
, ALfloat right
)
/* Lanes are {left, right, left, right}, so a single multiply scales two
 * consecutive [0]/[1] coefficient pairs at once. */
89 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
90 __m128 vals
= _mm_setzero_ps();
/* First visible path: Values[o0] and Values[o1] are updated as single pairs
 * (low two lanes only); the entries in between are handled two pairs at a
 * time.
 * NOTE(review): _mm_load_ps/_mm_store_ps need 16-byte aligned addresses, so
 * this path is presumably chosen for Offset values where &Values[o0] is not
 * aligned -- the selecting condition is not visible. */
96 const ALuint o0
= Offset
&HRIR_MASK
;
97 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
100 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
101 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
102 imp0
= _mm_mul_ps(lrlr
, coeffs
);
103 vals
= _mm_add_ps(imp0
, vals
);
104 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
105 for(i
= 1;i
< IrSize
-1;i
+= 2)
107 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
109 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
110 vals
= _mm_load_ps(&Values
[o2
][0]);
111 imp1
= _mm_mul_ps(lrlr
, coeffs
);
/* _MM_SHUFFLE(1, 0, 3, 2) yields {imp0[2], imp0[3], imp1[0], imp1[1]}: the
 * upper pair of imp0 followed by the lower pair of imp1. */
112 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
113 vals
= _mm_add_ps(imp0
, vals
);
114 _mm_store_ps(&Values
[o2
][0], vals
);
/* Tail of the first path: move the upper pair of imp0 into the low lanes and
 * add it to the single pair at Values[o1].
 * NOTE(review): any update of imp0 at the end of the loop body above is not
 * visible in this listing. */
117 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
118 imp0
= _mm_movehl_ps(imp0
, imp0
);
119 vals
= _mm_add_ps(imp0
, vals
);
120 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Second visible path: two pairs per iteration with full four-lane loads and
 * stores; Values[o..o+1] += lrlr * Coeffs[i..i+1]. */
124 for(i
= 0;i
< IrSize
;i
+= 2)
126 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
128 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
129 vals
= _mm_load_ps(&Values
[o
][0]);
130 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
131 _mm_store_ps(&Values
[o
][0], vals
);
137 #include "mixer_inc.c"
/* MixDirect_SSE: for each channel c below MaxChannels, adds data[] scaled by
 * that channel's gain into OutBuffer[c] starting at OutPos.
 *
 * OutBuffer  - per-channel output rows of BUFFERSIZE floats; accumulated into.
 * data       - input samples (read only).
 * Gains      - per-channel gain state; Current[c], Step[c] and Target[c] are
 *              read, and Current[c] is written back.
 * Counter    - bounds the stepped-gain loops (they run only while
 *              pos < Counter).
 * OutPos     - offset added to pos when indexing OutBuffer[c].
 * BufferSize - number of samples to mix.
 *
 * NOTE(review): this listing is incomplete. Braces, the declarations of c,
 * pos, gain and step, the call that builds the initial gain vector, the "do"
 * keyword, the advance of pos inside the do/while loop, and any per-sample
 * update of DrySend are not visible here -- confirm against the complete
 * source.
 * NOTE(review): the aligned _mm_load_ps/_mm_store_ps calls assume &data[pos]
 * and &OutBuffer[c][OutPos+pos] are 16-byte aligned whenever the vector loops
 * run -- confirm with callers.
 */
141 void MixDirect_SSE(ALfloat (*restrict OutBuffer
)[BUFFERSIZE
], const ALfloat
*data
,
142 MixGains
*Gains
, ALuint Counter
, ALuint OutPos
, ALuint BufferSize
)
144 ALfloat DrySend
, Step
;
148 for(c
= 0;c
< MaxChannels
;c
++)
151 DrySend
= Gains
->Current
[c
];
152 Step
= Gains
->Step
[c
];
/* The stepped-gain section runs only when the step is not exactly 1.0f and
 * Counter is non-zero. */
153 if(Step
!= 1.0f
&& Counter
> 0)
155 /* Mix with applying gain steps in aligned multiples of 4. */
156 if(BufferSize
-pos
> 3 && Counter
-pos
> 3)
/* NOTE(review): the two expressions below look like the third and fourth
 * lane initialisers of gain (DrySend*Step^2 and DrySend*Step^3); the call
 * that consumes them and its first two arguments are not visible. */
161 DrySend
* Step
* Step
,
162 DrySend
* Step
* Step
* Step
/* Each pass of the loop below mixes four samples, so gain is advanced by
 * Step^4 per pass. */
164 step
= _mm_set1_ps(Step
* Step
* Step
* Step
);
166 const __m128 val4
= _mm_load_ps(&data
[pos
]);
167 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
168 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain
));
169 gain
= _mm_mul_ps(gain
, step
);
170 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
172 } while(BufferSize
-pos
> 3 && Counter
-pos
> 3);
/* Lane 0 of gain becomes the scalar gain for the scalar loops that follow. */
173 DrySend
= _mm_cvtss_f32(gain
);
175 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
176 for(;pos
< BufferSize
&& pos
< Counter
;pos
++)
178 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*DrySend
;
/* NOTE(review): any condition guarding the switch to the target gain is not
 * visible in this listing. */
182 DrySend
= Gains
->Target
[c
];
183 Gains
->Current
[c
] = DrySend
;
184 /* Mix until pos is aligned with 4 or the mix is done. */
185 for(;pos
< BufferSize
&& (pos
&3) != 0;pos
++)
186 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*DrySend
;
/* Written as !(x > threshold) rather than x <= threshold, so the test is
 * also true when DrySend is NaN.
 * NOTE(review): the statement controlled by this test is not visible;
 * presumably it skips the rest of the work for this channel -- confirm. */
189 if(!(DrySend
> GAIN_SILENCE_THRESHOLD
))
/* Constant-gain mixing: four samples at a time, then the scalar remainder. */
191 gain
= _mm_set1_ps(DrySend
);
192 for(;BufferSize
-pos
> 3;pos
+= 4)
194 const __m128 val4
= _mm_load_ps(&data
[pos
]);
195 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
196 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain
));
197 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
199 for(;pos
< BufferSize
;pos
++)
200 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*DrySend
;
/* MixSend_SSE: adds data[] scaled by a single gain into OutBuffer[0]
 * starting at OutPos.
 *
 * OutBuffer  - output rows of BUFFERSIZE floats; only row 0 is accumulated
 *              into.
 * data       - input samples (read only).
 * Gain       - gain state; Current and Target are read, and Current is
 *              written back.
 * Counter    - bounds the stepped-gain loops (they run only while
 *              pos < Counter).
 * OutPos     - offset added to pos when indexing OutBuffer[0].
 * BufferSize - number of samples to mix.
 *
 * NOTE(review): this listing is incomplete. Braces, the declarations of pos,
 * gain and step, the assignment that gives Step its value, the call that
 * builds the initial gain vector, the "do" keyword, the advance of pos inside
 * the do/while loop, and any per-sample update of WetGain are not visible
 * here -- confirm against the complete source.
 * NOTE(review): the aligned _mm_load_ps/_mm_store_ps calls assume &data[pos]
 * and &OutBuffer[0][OutPos+pos] are 16-byte aligned whenever the vector loops
 * run -- confirm with callers.
 */
205 void MixSend_SSE(ALfloat (*restrict OutBuffer
)[BUFFERSIZE
], const ALfloat
*data
,
206 MixGainMono
*Gain
, ALuint Counter
, ALuint OutPos
, ALuint BufferSize
)
208 ALfloat WetGain
, Step
;
213 WetGain
= Gain
->Current
;
/* The stepped-gain section runs only when the step is not exactly 1.0f and
 * Counter is non-zero. */
215 if(Step
!= 1.0f
&& Counter
> 0)
217 if(BufferSize
-pos
> 3 && Counter
-pos
> 3)
/* NOTE(review): the two expressions below look like the third and fourth
 * lane initialisers of gain (WetGain*Step^2 and WetGain*Step^3); the call
 * that consumes them and its first two arguments are not visible. */
222 WetGain
* Step
* Step
,
223 WetGain
* Step
* Step
* Step
/* Each pass of the loop below mixes four samples, so gain is advanced by
 * Step^4 per pass. */
225 step
= _mm_set1_ps(Step
* Step
* Step
* Step
);
227 const __m128 val4
= _mm_load_ps(&data
[pos
]);
228 __m128 dry4
= _mm_load_ps(&OutBuffer
[0][OutPos
+pos
]);
229 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain
));
230 gain
= _mm_mul_ps(gain
, step
);
231 _mm_store_ps(&OutBuffer
[0][OutPos
+pos
], dry4
);
233 } while(BufferSize
-pos
> 3 && Counter
-pos
> 3);
/* Lane 0 of gain becomes the scalar gain for the scalar loops that follow. */
234 WetGain
= _mm_cvtss_f32(gain
);
/* Scalar mixing for the samples left while pos is still below both
 * BufferSize and Counter. */
236 for(;pos
< BufferSize
&& pos
< Counter
;pos
++)
238 OutBuffer
[0][OutPos
+pos
] += data
[pos
]*WetGain
;
/* NOTE(review): any condition guarding the switch to the target gain is not
 * visible in this listing. */
242 WetGain
= Gain
->Target
;
243 Gain
->Current
= WetGain
;
/* Scalar mixing until pos is a multiple of 4 or the buffer is exhausted. */
244 for(;pos
< BufferSize
&& (pos
&3) != 0;pos
++)
245 OutBuffer
[0][OutPos
+pos
] += data
[pos
]*WetGain
;
/* Written as !(x > threshold) rather than x <= threshold, so the test is
 * also true when WetGain is NaN.
 * NOTE(review): the statement controlled by this test is not visible;
 * presumably it ends the function early -- confirm. */
248 if(!(WetGain
> GAIN_SILENCE_THRESHOLD
))
/* Constant-gain mixing: four samples at a time, then the scalar remainder. */
250 gain
= _mm_set1_ps(WetGain
);
251 for(;BufferSize
-pos
> 3;pos
+= 4)
253 const __m128 val4
= _mm_load_ps(&data
[pos
]);
254 __m128 wet4
= _mm_load_ps(&OutBuffer
[0][OutPos
+pos
]);
255 wet4
= _mm_add_ps(wet4
, _mm_mul_ps(val4
, gain
));
256 _mm_store_ps(&OutBuffer
[0][OutPos
+pos
], wet4
);
258 for(;pos
< BufferSize
;pos
++)
259 OutBuffer
[0][OutPos
+pos
] += data
[pos
] * WetGain
;