/* KDevelop's parser won't recognize these defines that get added by the -msse
 * switch used to compile this source. Without them, xmmintrin.h fails to
 * declare anything. */
10 #include <xmmintrin.h>
18 #include "alAuxEffectSlot.h"
19 #include "mixer_defs.h"
/* SetupCoeffs: fills OutCoeffs from the HRTF parameters, one SSE vector (two
 * [2]-float rows, i and i+1) per iteration. Each output value is
 * Coeffs - CoeffStep * Counter, with Counter converted to float and broadcast
 * to all four lanes.
 *
 * The loads and the store are the aligned variants (_mm_load_ps and
 * _mm_store_ps), which require 16-byte-aligned addresses, so row i of
 * hrtfparams->Coeffs, hrtfparams->CoeffStep and OutCoeffs must be 16-byte
 * aligned for every even i. Because the loop advances two rows at a time, an
 * odd IrSize would also read and write row IrSize.
 *
 * NOTE(review): this listing looks like a lossy extraction -- the braces and
 * the declarations of the locals used below (i, coeffs, step4) are not
 * visible. Confirm against the complete file before relying on it.
 */
22 static inline void SetupCoeffs(ALfloat (*restrict OutCoeffs
)[2],
23 const HrtfParams
*hrtfparams
,
24 ALuint IrSize
, ALuint Counter
)
/* Broadcast the counter, as a float, to all four lanes. */
26 const __m128 counter4
= _mm_set1_ps((float)Counter
);
/* Walk the rows two at a time: one __m128 spans rows i and i+1. */
30 for(i
= 0;i
< IrSize
;i
+= 2)
32 step4
= _mm_load_ps(&hrtfparams
->CoeffStep
[i
][0]);
33 coeffs
= _mm_load_ps(&hrtfparams
->Coeffs
[i
][0]);
/* coeffs = coeffs - step4 * counter4 */
34 coeffs
= _mm_sub_ps(coeffs
, _mm_mul_ps(step4
, counter4
));
35 _mm_store_ps(&OutCoeffs
[i
][0], coeffs
);
/* ApplyCoeffsStep: accumulates the (left, right) input pair, scaled by the
 * coefficient rows, into Values, and advances each coefficient row it reads by
 * the matching CoeffStep row, storing the stepped row back into Coeffs.
 * Indices into Values are wrapped with HRIR_MASK.
 *
 * lrlr holds (left, right, left, right), so a single __m128 multiply yields
 * the contributions of two consecutive coefficient rows.
 *
 * Two code paths are visible:
 *  - The first (o0 / o1 / o2) writes a single [2]-float entry at Values[o0],
 *    then full four-float vectors starting at Values[(Offset+i)&HRIR_MASK]
 *    for odd i, then a single entry at Values[o1]. Each full vector combines
 *    the upper half of one product with the lower half of the next.
 *  - The second loads, updates and stores Coeffs[i] and
 *    Values[(Offset+i)&HRIR_MASK] directly for even i.
 *
 * All _mm_load_ps / _mm_store_ps accesses require 16-byte-aligned addresses.
 *
 * NOTE(review): this listing looks like a lossy extraction. IrSize is used
 * below but does not appear in the visible parameter list, the loop index i
 * is not declared, no braces are visible, and the condition that selects
 * between the two paths is missing (presumably a test on the parity of
 * Offset -- confirm against the complete file).
 */
39 static inline void ApplyCoeffsStep(ALuint Offset
, ALfloat (*restrict Values
)[2],
41 ALfloat (*restrict Coeffs
)[2],
42 const ALfloat (*restrict CoeffStep
)[2],
43 ALfloat left
, ALfloat right
)
/* Lane order is (left, right, left, right). */
45 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
46 __m128 coeffs
, deltas
, imp0
, imp1
;
47 __m128 vals
= _mm_setzero_ps();
/* ---- First path: first and last wrapped positions touched. ---- */
52 const ALuint o0
= Offset
&HRIR_MASK
;
53 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* Rows 0 and 1 of the coefficients and their steps. */
55 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
56 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
/* Load only Values[o0] into the lower two lanes; the upper lanes of vals keep
 * their previous contents (zero from the initialiser above). */
57 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
58 imp0
= _mm_mul_ps(lrlr
, coeffs
);
59 coeffs
= _mm_add_ps(coeffs
, deltas
);
60 vals
= _mm_add_ps(imp0
, vals
);
/* Write back the stepped rows 0 and 1. */
61 _mm_store_ps(&Coeffs
[0][0], coeffs
);
/* Only the lower two lanes (row 0's contribution) are stored to Values[o0];
 * the upper half of imp0 is consumed by the shuffle below. */
62 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
63 for(i
= 1;i
< IrSize
-1;i
+= 2)
65 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
/* Coefficient rows i+1 and i+2; Values entries o2 and o2+1. */
67 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
68 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
69 vals
= _mm_load_ps(&Values
[o2
][0]);
70 imp1
= _mm_mul_ps(lrlr
, coeffs
);
71 coeffs
= _mm_add_ps(coeffs
, deltas
);
/* Result lanes: (imp0[2], imp0[3], imp1[0], imp1[1]), i.e. the upper half of
 * imp0 followed by the lower half of imp1. */
72 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
73 vals
= _mm_add_ps(imp0
, vals
);
74 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
75 _mm_store_ps(&Values
[o2
][0], vals
);
/* NOTE(review): no statement carrying imp1 over into imp0 for the next
 * iteration is visible in this listing; confirm whether one was dropped. */
/* Tail: add the upper half of imp0 into the single entry at Values[o1]. */
78 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
/* Move the upper two lanes of imp0 down into the lower two lanes. */
79 imp0
= _mm_movehl_ps(imp0
, imp0
);
80 vals
= _mm_add_ps(imp0
, vals
);
81 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* ---- Second path: rows i, i+1 map straight onto Values[o], Values[o+1]. ---- */
85 for(i
= 0;i
< IrSize
;i
+= 2)
87 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
89 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
90 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
91 vals
= _mm_load_ps(&Values
[o
][0]);
/* The contribution uses the coefficients as loaded, before stepping. */
92 imp0
= _mm_mul_ps(lrlr
, coeffs
);
93 coeffs
= _mm_add_ps(coeffs
, deltas
);
94 vals
= _mm_add_ps(imp0
, vals
);
95 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
96 _mm_store_ps(&Values
[o
][0], vals
);
/* ApplyCoeffs: accumulates the (left, right) input pair, scaled by the
 * coefficient rows, into Values. Unlike ApplyCoeffsStep, the visible code
 * only reads Coeffs and never stores to it. Indices into Values are wrapped
 * with HRIR_MASK.
 *
 * lrlr holds (left, right, left, right), so a single __m128 multiply yields
 * the contributions of two consecutive coefficient rows.
 *
 * Two code paths are visible:
 *  - The first (o0 / o1 / o2) writes a single [2]-float entry at Values[o0],
 *    then full four-float vectors starting at Values[(Offset+i)&HRIR_MASK]
 *    for odd i, then a single entry at Values[o1]. Each full vector combines
 *    the upper half of one product with the lower half of the next.
 *  - The second loads and stores Values[(Offset+i)&HRIR_MASK] directly for
 *    even i.
 *
 * All _mm_load_ps / _mm_store_ps accesses require 16-byte-aligned addresses.
 *
 * NOTE(review): this listing looks like a lossy extraction. IrSize is used
 * below but does not appear in the visible parameter list, the locals coeffs,
 * imp0, imp1 and i are not declared, no braces are visible, and the condition
 * that selects between the two paths is missing (presumably a test on the
 * parity of Offset -- confirm against the complete file).
 */
101 static inline void ApplyCoeffs(ALuint Offset
, ALfloat (*restrict Values
)[2],
103 ALfloat (*restrict Coeffs
)[2],
104 ALfloat left
, ALfloat right
)
/* Lane order is (left, right, left, right). */
106 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
107 __m128 vals
= _mm_setzero_ps();
/* ---- First path: first and last wrapped positions touched. ---- */
113 const ALuint o0
= Offset
&HRIR_MASK
;
114 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* Coefficient rows 0 and 1. */
117 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
/* Load only Values[o0] into the lower two lanes; the upper lanes of vals keep
 * their previous contents (zero from the initialiser above). */
118 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
119 imp0
= _mm_mul_ps(lrlr
, coeffs
);
120 vals
= _mm_add_ps(imp0
, vals
);
/* Only the lower two lanes (row 0's contribution) are stored to Values[o0];
 * the upper half of imp0 is consumed by the shuffle below. */
121 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
122 for(i
= 1;i
< IrSize
-1;i
+= 2)
124 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
/* Coefficient rows i+1 and i+2; Values entries o2 and o2+1. */
126 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
127 vals
= _mm_load_ps(&Values
[o2
][0]);
128 imp1
= _mm_mul_ps(lrlr
, coeffs
);
/* Result lanes: (imp0[2], imp0[3], imp1[0], imp1[1]), i.e. the upper half of
 * imp0 followed by the lower half of imp1. */
129 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
130 vals
= _mm_add_ps(imp0
, vals
);
131 _mm_store_ps(&Values
[o2
][0], vals
);
/* NOTE(review): no statement carrying imp1 over into imp0 for the next
 * iteration is visible in this listing; confirm whether one was dropped. */
/* Tail: add the upper half of imp0 into the single entry at Values[o1]. */
134 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
/* Move the upper two lanes of imp0 down into the lower two lanes. */
135 imp0
= _mm_movehl_ps(imp0
, imp0
);
136 vals
= _mm_add_ps(imp0
, vals
);
137 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* ---- Second path: rows i, i+1 map straight onto Values[o], Values[o+1]. ---- */
141 for(i
= 0;i
< IrSize
;i
+= 2)
143 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
145 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
146 vals
= _mm_load_ps(&Values
[o
][0]);
/* vals += lrlr * coeffs */
147 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
148 _mm_store_ps(&Values
[o
][0], vals
);
154 #include "mixer_inc.c"
/* Mix_SSE: for each output channel c in [0, OutChans), adds data[pos] scaled
 * by that channel's gain into OutBuffer[c][OutPos+pos].
 *
 * Per channel, the visible code does the following:
 *  - reads the starting gain and per-sample step from Gains[c].Current and
 *    Gains[c].Step;
 *  - when step is non-zero and Counter is positive, mixes with a changing
 *    gain: four samples at a time while at least four samples remain in both
 *    BufferSize and Counter, then one sample at a time up to
 *    min(BufferSize, Counter); it then writes the gain back to
 *    Gains[c].Current and mixes scalar samples until pos is a multiple of 4;
 *  - mixes the remainder with a constant gain, four samples at a time and
 *    then one at a time.
 *
 * The vector loops use _mm_load_ps / _mm_store_ps, which require
 * 16-byte-aligned addresses, on &data[pos] and &OutBuffer[c][OutPos+pos].
 *
 * NOTE(review): this listing looks like a lossy extraction. No braces are
 * visible, the locals (gain, step, gain4, step4, c, pos) are not declared,
 * and several statements appear to be missing -- see the notes inline.
 * Confirm every detail against the complete file.
 */
158 void Mix_SSE(const ALfloat
*data
, ALuint OutChans
, ALfloat (*restrict OutBuffer
)[BUFFERSIZE
],
159 MixGains
*Gains
, ALuint Counter
, ALuint OutPos
, ALuint BufferSize
)
165 for(c
= 0;c
< OutChans
;c
++)
168 gain
= Gains
[c
].Current
;
169 step
= Gains
[c
].Step
;
/* Stepped-gain phase: entered only when the gain changes and Counter is
 * positive. */
170 if(step
!= 0.0f
&& Counter
> 0)
172 /* Mix with applying gain steps in aligned multiples of 4. */
173 if(BufferSize
-pos
> 3 && Counter
-pos
> 3)
/* NOTE(review): the next four lines look like the last operand (gain plus
 * three steps) of an expression whose beginning is not visible in this
 * listing -- presumably the per-lane initialiser of gain4; confirm. */
179 gain
+ step
+ step
+ step
/* Each vector iteration moves every lane's gain forward by four steps. */
181 step4
= _mm_set1_ps(step
+ step
+ step
+ step
);
183 const __m128 val4
= _mm_load_ps(&data
[pos
]);
184 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
/* dry4 += val4 * gain4, then advance the gains. */
185 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
186 gain4
= _mm_add_ps(gain4
, step4
);
187 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
/* NOTE(review): the statement that advances pos inside this do/while is not
 * visible in this listing. */
189 } while(BufferSize
-pos
> 3 && Counter
-pos
> 3);
/* Take the lowest lane of gain4 as the scalar gain to continue from. */
190 gain
= _mm_cvtss_f32(gain4
);
192 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
193 for(;pos
< BufferSize
&& pos
< Counter
;pos
++)
195 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* NOTE(review): no per-sample update of gain and no condition guarding the
 * assignment from Target below are visible in this listing; presumably both
 * were dropped by the extraction -- confirm. */
199 gain
= Gains
[c
].Target
;
200 Gains
[c
].Current
= gain
;
201 /* Mix until pos is aligned with 4 or the mix is done. */
202 for(;pos
< BufferSize
&& (pos
&3) != 0;pos
++)
203 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* The test is true when |gain| is at or below GAIN_SILENCE_THRESHOLD, and
 * also when gain is NaN.
 * NOTE(review): the statement this test governs is not visible in this
 * listing; presumably the rest of the channel is skipped -- confirm. */
206 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
/* Constant-gain phase: four samples per iteration while at least four
 * remain. */
208 gain4
= _mm_set1_ps(gain
);
209 for(;BufferSize
-pos
> 3;pos
+= 4)
211 const __m128 val4
= _mm_load_ps(&data
[pos
]);
212 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
213 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
214 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
/* Scalar tail for the final zero to three samples. */
216 for(;pos
< BufferSize
;pos
++)
217 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;