3 #ifdef HAVE_XMMINTRIN_H
5 /* KDevelop's parser won't recognize these defines that get added by the -msse
6 * switch used to compile this source. Without them, xmmintrin.h fails to
11 #include <xmmintrin.h>
20 #include "alAuxEffectSlot.h"
21 #include "mixer_defs.h"
/* ApplyCoeffsStep: accumulates the left/right scaled filter coefficients
 * into the Values ring (indices wrapped with HRIR_MASK) and then advances
 * every processed coefficient by its CoeffStep delta, storing the stepped
 * value back into Coeffs.
 *
 *   Offset    - base index into Values before masking.
 *   Values    - rows of 2 floats that receive the accumulated result.
 *   Coeffs    - rows of 2 floats; read, stepped and written back.
 *   CoeffStep - per-coefficient increments added to Coeffs.
 *   left      - scale applied to element 0 of each coefficient row.
 *   right     - scale applied to element 1 of each coefficient row.
 *
 * _mm_load_ps and _mm_store_ps require 16-byte aligned addresses, so the
 * arrays are assumed to be aligned accordingly - TODO confirm at the
 * definitions of the buffers.
 *
 * NOTE(review): this listing appears to be missing lines: the braces, the
 * declarations of i and IrSize, and the condition that selects between the
 * two code paths below. Verify against the complete file before editing.
 */
24 static inline void ApplyCoeffsStep(ALuint Offset
, ALfloat (*restrict Values
)[2],
26 ALfloat (*restrict Coeffs
)[2],
27 const ALfloat (*restrict CoeffStep
)[2],
28 ALfloat left
, ALfloat right
)
/* One register holding left,right,left,right so that a single multiply
 * scales two consecutive 2-float coefficient rows. */
30 const __m128 lrlr
= { left
, right
, left
, right
};
31 __m128 coeffs
, deltas
, imp0
, imp1
;
32 __m128 vals
= _mm_setzero_ps();
/* First code path. Values[o0] and Values[o1] are touched with 64-bit
 * half-register loads and stores, while the rows in between use full
 * 128-bit accesses. Presumably this path is taken when Offset is odd so
 * that the full-width accesses land on even (16-byte aligned) rows - the
 * selecting condition is not visible here, confirm. */
37 const ALuint o0
= Offset
&HRIR_MASK
;
38 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* Load coefficient rows 0 and 1 with their deltas, but only add the low
 * half of the product (row 0) into Values[o0]. */
40 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
41 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
42 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
43 imp0
= _mm_mul_ps(lrlr
, coeffs
);
44 coeffs
= _mm_add_ps(coeffs
, deltas
);
45 vals
= _mm_add_ps(imp0
, vals
);
46 _mm_store_ps(&Coeffs
[0][0], coeffs
);
47 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
/* Each iteration scales coefficient rows i+1 and i+2, then combines the
 * high half of the previous product with the low half of the new one so
 * the sum lines up with Values rows o2 and o2+1. */
48 for(i
= 1;i
< IrSize
-1;i
+= 2)
50 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
52 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
53 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
54 vals
= _mm_load_ps(&Values
[o2
][0]);
55 imp1
= _mm_mul_ps(lrlr
, coeffs
);
56 coeffs
= _mm_add_ps(coeffs
, deltas
);
/* _MM_SHUFFLE(1, 0, 3, 2) yields imp0[2], imp0[3], imp1[0], imp1[1]. */
57 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
58 vals
= _mm_add_ps(imp0
, vals
);
59 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
60 _mm_store_ps(&Values
[o2
][0], vals
);
/* NOTE(review): no visible statement carries imp1 over into imp0 for the
 * next iteration; it was presumably dropped from this listing - verify. */
/* Tail of the first path: move the high half of imp0 into the low half and
 * add it to the single row Values[o1]. */
63 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
64 imp0
= _mm_movehl_ps(imp0
, imp0
);
65 vals
= _mm_add_ps(imp0
, vals
);
66 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Second code path: processes two rows per iteration with full-width
 * loads and stores on Coeffs, CoeffStep and Values. */
70 for(i
= 0;i
< IrSize
;i
+= 2)
72 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
74 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
75 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
76 vals
= _mm_load_ps(&Values
[o
][0]);
77 imp0
= _mm_mul_ps(lrlr
, coeffs
);
78 coeffs
= _mm_add_ps(coeffs
, deltas
);
79 vals
= _mm_add_ps(imp0
, vals
);
80 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
81 _mm_store_ps(&Values
[o
][0], vals
);
/* ApplyCoeffs: accumulates the left/right scaled filter coefficients into
 * the Values ring (indices wrapped with HRIR_MASK). Coeffs is only read
 * here; nothing is written back to it.
 *
 *   Offset - base index into Values before masking.
 *   Values - rows of 2 floats that receive the accumulated result.
 *   Coeffs - rows of 2 floats holding the coefficients.
 *   left   - scale applied to element 0 of each coefficient row.
 *   right  - scale applied to element 1 of each coefficient row.
 *
 * _mm_load_ps and _mm_store_ps require 16-byte aligned addresses, so the
 * arrays are assumed to be aligned accordingly - TODO confirm at the
 * definitions of the buffers.
 *
 * NOTE(review): this listing appears to be missing lines: the braces, the
 * declarations of coeffs, imp0, imp1, i and IrSize, and the condition that
 * selects between the two code paths below. Verify against the complete
 * file before editing.
 */
86 static inline void ApplyCoeffs(ALuint Offset
, ALfloat (*restrict Values
)[2],
88 ALfloat (*restrict Coeffs
)[2],
89 ALfloat left
, ALfloat right
)
/* One register holding left,right,left,right so that a single multiply
 * scales two consecutive 2-float coefficient rows. */
91 const __m128 lrlr
= { left
, right
, left
, right
};
92 __m128 vals
= _mm_setzero_ps();
/* First code path. Values[o0] and Values[o1] are touched with 64-bit
 * half-register loads and stores, while the rows in between use full
 * 128-bit accesses. Presumably this path is taken when Offset is odd so
 * that the full-width accesses land on even (16-byte aligned) rows - the
 * selecting condition is not visible here, confirm. */
98 const ALuint o0
= Offset
&HRIR_MASK
;
99 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* Scale coefficient rows 0 and 1, but only add the low half of the
 * product (row 0) into Values[o0]. */
102 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
103 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
104 imp0
= _mm_mul_ps(lrlr
, coeffs
);
105 vals
= _mm_add_ps(imp0
, vals
);
106 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
/* Each iteration scales coefficient rows i+1 and i+2, then combines the
 * high half of the previous product with the low half of the new one so
 * the sum lines up with Values rows o2 and o2+1. */
107 for(i
= 1;i
< IrSize
-1;i
+= 2)
109 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
111 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
112 vals
= _mm_load_ps(&Values
[o2
][0]);
113 imp1
= _mm_mul_ps(lrlr
, coeffs
);
/* _MM_SHUFFLE(1, 0, 3, 2) yields imp0[2], imp0[3], imp1[0], imp1[1]. */
114 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
115 vals
= _mm_add_ps(imp0
, vals
);
116 _mm_store_ps(&Values
[o2
][0], vals
);
/* NOTE(review): no visible statement carries imp1 over into imp0 for the
 * next iteration; it was presumably dropped from this listing - verify. */
/* Tail of the first path: move the high half of imp0 into the low half and
 * add it to the single row Values[o1]. */
119 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
120 imp0
= _mm_movehl_ps(imp0
, imp0
);
121 vals
= _mm_add_ps(imp0
, vals
);
122 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Second code path: processes two rows per iteration with full-width
 * loads and stores on Coeffs and Values. */
126 for(i
= 0;i
< IrSize
;i
+= 2)
128 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
130 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
131 vals
= _mm_load_ps(&Values
[o
][0]);
132 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
133 _mm_store_ps(&Values
[o
][0], vals
);
139 #include "mixer_inc.c"
/* MixDirect_SSE: for every output channel c below MaxChannels, adds the
 * input samples scaled by params->Mix.Gains[srcchan][c] into
 * OutBuffer[c], starting at OutPos. Samples are mixed four at a time with
 * SSE, then one at a time for whatever remains. ClickRemoval[c] and
 * PendingClicks[c] are also adjusted with the scaled edge samples.
 *
 *   params      - supplies OutBuffer, ClickRemoval, PendingClicks and the
 *                 Mix.Gains table.
 *   data        - input samples; read with aligned 128-bit loads.
 *   srcchan     - first index into params->Mix.Gains.
 *   OutPos      - starting sample offset inside each OutBuffer row.
 *   SamplesToDo - compared with OutPos+pos after mixing to decide whether
 *                 PendingClicks is updated.
 *   BufferSize  - number of samples to mix.
 *
 * _mm_load_ps and _mm_store_ps require 16-byte aligned addresses; this
 * assumes data and OutBuffer[c]+OutPos are aligned - TODO confirm with
 * the callers.
 *
 * NOTE(review): this listing appears to be missing lines: the braces, the
 * declarations of c, pos, DrySend and gain, the statement controlled by
 * the silence check, and possibly a condition guarding the ClickRemoval
 * update. Verify against the complete file before editing.
 */
143 void MixDirect_SSE(const DirectParams
*params
, const ALfloat
*restrict data
, ALuint srcchan
,
144 ALuint OutPos
, ALuint SamplesToDo
, ALuint BufferSize
)
146 ALfloat (*restrict OutBuffer
)[BUFFERSIZE
] = params
->OutBuffer
;
147 ALfloat
*restrict ClickRemoval
= params
->ClickRemoval
;
148 ALfloat
*restrict PendingClicks
= params
->PendingClicks
;
154 for(c
= 0;c
< MaxChannels
;c
++)
156 DrySend
= params
->Mix
.Gains
[srcchan
][c
];
/* Written as a negated greater-than, so a NaN gain also satisfies the
 * check. The controlled statement is not visible; presumably it skips
 * this channel - confirm. */
157 if(!(DrySend
> GAIN_SILENCE_THRESHOLD
))
161 ClickRemoval
[c
] -= data
[0]*DrySend
;
/* Broadcast the gain to all four lanes and mix four samples per
 * iteration while at least four remain. */
163 gain
= _mm_set1_ps(DrySend
);
164 for(pos
= 0;BufferSize
-pos
> 3;pos
+= 4)
166 const __m128 val4
= _mm_load_ps(&data
[pos
]);
167 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
168 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain
));
169 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
/* Scalar loop for the remaining samples (fewer than four). */
171 for(;pos
< BufferSize
;pos
++)
172 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*DrySend
;
/* pos has reached BufferSize here, so data[pos] is one element past the
 * mixed range; assumes the caller provides that extra sample - TODO
 * confirm. */
174 if(OutPos
+pos
== SamplesToDo
)
175 PendingClicks
[c
] += data
[pos
]*DrySend
;
/* MixSend_SSE: adds the input samples scaled by params->Gain into
 * OutBuffer[0], starting at OutPos. Samples are mixed four at a time with
 * SSE, then one at a time for whatever remains. ClickRemoval[0] and
 * PendingClicks[0] are also adjusted with the scaled edge samples.
 *
 *   params      - supplies OutBuffer, ClickRemoval, PendingClicks and Gain.
 *   data        - input samples; read with aligned 128-bit loads.
 *   OutPos      - starting sample offset inside OutBuffer[0].
 *   SamplesToDo - compared with OutPos+pos after mixing to decide whether
 *                 PendingClicks is updated.
 *   BufferSize  - number of samples to mix.
 *
 * _mm_load_ps and _mm_store_ps require 16-byte aligned addresses; this
 * assumes data and OutBuffer[0]+OutPos are aligned - TODO confirm with
 * the callers.
 *
 * NOTE(review): this listing appears to be missing lines: the braces, the
 * declarations of pos, WetGain and gain, the statement controlled by the
 * silence check, and possibly a condition guarding the ClickRemoval
 * update. Verify against the complete file before editing.
 */
180 void MixSend_SSE(const SendParams
*params
, const ALfloat
*restrict data
,
181 ALuint OutPos
, ALuint SamplesToDo
, ALuint BufferSize
)
183 ALfloat (*restrict OutBuffer
)[BUFFERSIZE
] = params
->OutBuffer
;
184 ALfloat
*restrict ClickRemoval
= params
->ClickRemoval
;
185 ALfloat
*restrict PendingClicks
= params
->PendingClicks
;
190 WetGain
= params
->Gain
;
/* Written as a negated greater-than, so a NaN gain also satisfies the
 * check. The controlled statement is not visible; presumably it returns
 * early - confirm. */
191 if(!(WetGain
> GAIN_SILENCE_THRESHOLD
))
195 ClickRemoval
[0] -= data
[0] * WetGain
;
/* Broadcast the gain to all four lanes and mix four samples per
 * iteration while at least four remain. */
197 gain
= _mm_set1_ps(WetGain
);
198 for(pos
= 0;BufferSize
-pos
> 3;pos
+= 4)
200 const __m128 val4
= _mm_load_ps(&data
[pos
]);
201 __m128 wet4
= _mm_load_ps(&OutBuffer
[0][OutPos
+pos
]);
202 wet4
= _mm_add_ps(wet4
, _mm_mul_ps(val4
, gain
));
203 _mm_store_ps(&OutBuffer
[0][OutPos
+pos
], wet4
);
/* Scalar loop for the remaining samples (fewer than four). */
205 for(;pos
< BufferSize
;pos
++)
206 OutBuffer
[0][OutPos
+pos
] += data
[pos
] * WetGain
;
/* pos has reached BufferSize here, so data[pos] is one element past the
 * mixed range; assumes the caller provides that extra sample - TODO
 * confirm. */
208 if(OutPos
+pos
== SamplesToDo
)
209 PendingClicks
[0] += data
[pos
] * WetGain
;