/* KDevelop's parser won't recognize these defines that get added by the -msse
 * switch used to compile this source. Without them, xmmintrin.h fails to
 * declare anything. */
10 #include <xmmintrin.h>
18 #include "alAuxEffectSlot.h"
19 #include "mixer_defs.h"
22 static inline void ApplyCoeffsStep(ALuint Offset
, ALfloat (*restrict Values
)[2],
24 ALfloat (*restrict Coeffs
)[2],
25 const ALfloat (*restrict CoeffStep
)[2],
26 ALfloat left
, ALfloat right
)
28 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
29 __m128 coeffs
, deltas
, imp0
, imp1
;
30 __m128 vals
= _mm_setzero_ps();
35 const ALuint o0
= Offset
&HRIR_MASK
;
36 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
38 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
39 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
40 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
41 imp0
= _mm_mul_ps(lrlr
, coeffs
);
42 coeffs
= _mm_add_ps(coeffs
, deltas
);
43 vals
= _mm_add_ps(imp0
, vals
);
44 _mm_store_ps(&Coeffs
[0][0], coeffs
);
45 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
46 for(i
= 1;i
< IrSize
-1;i
+= 2)
48 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
50 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
51 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
52 vals
= _mm_load_ps(&Values
[o2
][0]);
53 imp1
= _mm_mul_ps(lrlr
, coeffs
);
54 coeffs
= _mm_add_ps(coeffs
, deltas
);
55 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
56 vals
= _mm_add_ps(imp0
, vals
);
57 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
58 _mm_store_ps(&Values
[o2
][0], vals
);
61 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
62 imp0
= _mm_movehl_ps(imp0
, imp0
);
63 vals
= _mm_add_ps(imp0
, vals
);
64 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
68 for(i
= 0;i
< IrSize
;i
+= 2)
70 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
72 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
73 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
74 vals
= _mm_load_ps(&Values
[o
][0]);
75 imp0
= _mm_mul_ps(lrlr
, coeffs
);
76 coeffs
= _mm_add_ps(coeffs
, deltas
);
77 vals
= _mm_add_ps(imp0
, vals
);
78 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
79 _mm_store_ps(&Values
[o
][0], vals
);
84 static inline void ApplyCoeffs(ALuint Offset
, ALfloat (*restrict Values
)[2],
86 ALfloat (*restrict Coeffs
)[2],
87 ALfloat left
, ALfloat right
)
89 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
90 __m128 vals
= _mm_setzero_ps();
96 const ALuint o0
= Offset
&HRIR_MASK
;
97 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
100 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
101 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
102 imp0
= _mm_mul_ps(lrlr
, coeffs
);
103 vals
= _mm_add_ps(imp0
, vals
);
104 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
105 for(i
= 1;i
< IrSize
-1;i
+= 2)
107 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
109 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
110 vals
= _mm_load_ps(&Values
[o2
][0]);
111 imp1
= _mm_mul_ps(lrlr
, coeffs
);
112 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
113 vals
= _mm_add_ps(imp0
, vals
);
114 _mm_store_ps(&Values
[o2
][0], vals
);
117 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
118 imp0
= _mm_movehl_ps(imp0
, imp0
);
119 vals
= _mm_add_ps(imp0
, vals
);
120 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
124 for(i
= 0;i
< IrSize
;i
+= 2)
126 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
128 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
129 vals
= _mm_load_ps(&Values
[o
][0]);
130 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
131 _mm_store_ps(&Values
[o
][0], vals
);
137 #include "mixer_inc.c"
141 void Mix_SSE(const ALfloat
*data
, ALuint OutChans
, ALfloat (*restrict OutBuffer
)[BUFFERSIZE
],
142 MixGains
*Gains
, ALuint Counter
, ALuint OutPos
, ALuint BufferSize
)
148 for(c
= 0;c
< OutChans
;c
++)
151 gain
= Gains
[c
].Current
;
152 step
= Gains
[c
].Step
;
153 if(step
!= 1.0f
&& Counter
> 0)
155 /* Mix with applying gain steps in aligned multiples of 4. */
156 if(BufferSize
-pos
> 3 && Counter
-pos
> 3)
162 gain
* step
* step
* step
164 step4
= _mm_set1_ps(step
* step
* step
* step
);
166 const __m128 val4
= _mm_load_ps(&data
[pos
]);
167 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
168 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
169 gain4
= _mm_mul_ps(gain4
, step4
);
170 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
172 } while(BufferSize
-pos
> 3 && Counter
-pos
> 3);
173 gain
= _mm_cvtss_f32(gain4
);
175 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
176 for(;pos
< BufferSize
&& pos
< Counter
;pos
++)
178 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
182 gain
= Gains
[c
].Target
;
183 Gains
[c
].Current
= gain
;
184 /* Mix until pos is aligned with 4 or the mix is done. */
185 for(;pos
< BufferSize
&& (pos
&3) != 0;pos
++)
186 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
189 if(!(gain
> GAIN_SILENCE_THRESHOLD
))
191 gain4
= _mm_set1_ps(gain
);
192 for(;BufferSize
-pos
> 3;pos
+= 4)
194 const __m128 val4
= _mm_load_ps(&data
[pos
]);
195 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
196 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
197 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
199 for(;pos
< BufferSize
;pos
++)
200 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;