3 #ifdef HAVE_XMMINTRIN_H
13 #include "alAuxEffectSlot.h"
14 #include "mixer_defs.h"
/* ApplyCoeffsStep: SSE helper that multiplies the pair (left, right) by
 * per-tap coefficient pairs, accumulates the products into Values, and
 * advances each processed coefficient pair by its CoeffStep delta, storing
 * the stepped coefficients back into Coeffs.
 *
 *   Offset     - base index into Values; every access is wrapped with
 *                HRIR_MASK.
 *   Values     - accumulation buffer of [2]-float entries, updated in place.
 *   Coeffs     - coefficient pairs; read, incremented by CoeffStep, written
 *                back.
 *   CoeffStep  - per-pair deltas added to Coeffs.
 *   left/right - values multiplied into lanes 0/2 and 1/3 respectively.
 *
 * NOTE(review): IrSize and the loop index i are used below but their
 * declarations are not visible in this view, and neither is the condition
 * that selects between the two code paths -- confirm against the full file.
 */
17 static __inline
void ApplyCoeffsStep(ALuint Offset
, ALfloat (*RESTRICT Values
)[2],
19 ALfloat (*RESTRICT Coeffs
)[2],
20 ALfloat (*RESTRICT CoeffStep
)[2],
21 ALfloat left
, ALfloat right
)
/* lrlr = {left, right, left, right}: one register spans two consecutive
 * [2]-float entries. */
23 const __m128 lrlr
= { left
, right
, left
, right
};
24 __m128 coeffs
, deltas
, imp0
, imp1
;
25 __m128 vals
= _mm_setzero_ps();
/* First path: Values is accessed with 64-bit (two-float) loads/stores at o0
 * and o1, and with full 128-bit aligned accesses at o2 inside the loop.
 * NOTE(review): presumably this path is taken when Offset is odd, so that
 * Values[Offset+i] is 16-byte aligned for odd i -- the guarding condition is
 * not visible here. */
30 const ALuint o0
= Offset
&HRIR_MASK
;
31 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* Tap 0: the 128-bit load covers Coeffs[0] and Coeffs[1]. Both are stepped
 * and stored back, but only the low two lanes of the product are added to
 * Values[o0]; the high two lanes of imp0 are consumed by the loop below. */
33 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
34 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
35 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
36 imp0
= _mm_mul_ps(lrlr
, coeffs
);
37 coeffs
= _mm_add_ps(coeffs
, deltas
);
38 vals
= _mm_add_ps(imp0
, vals
);
39 _mm_store_ps(&Coeffs
[0][0], coeffs
);
40 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
/* Each iteration loads the coefficient pairs starting at i+1, steps them,
 * and adds a combination of the previous product (imp0) and the new product
 * (imp1) to the four floats starting at Values[o2]. */
41 for(i
= 1;i
< IrSize
-1;i
+= 2)
43 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
45 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
46 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
47 vals
= _mm_load_ps(&Values
[o2
][0]);
48 imp1
= _mm_mul_ps(lrlr
, coeffs
);
49 coeffs
= _mm_add_ps(coeffs
, deltas
);
/* Shuffle result lanes: {imp0[2], imp0[3], imp1[0], imp1[1]}. */
50 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
51 vals
= _mm_add_ps(imp0
, vals
);
52 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
53 _mm_store_ps(&Values
[o2
][0], vals
);
/* NOTE(review): no visible statement carries imp1 over into imp0 for the
 * next iteration, and the end of the loop body is not visible -- confirm
 * against the full file. */
/* Entry o1: _mm_movehl_ps moves the upper two lanes of imp0 into the lower
 * two, which are then added to Values[o1] with a 64-bit load/store. */
56 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
57 imp0
= _mm_movehl_ps(imp0
, imp0
);
58 vals
= _mm_add_ps(imp0
, vals
);
59 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Second path: full 128-bit aligned accesses throughout; each iteration
 * handles the two entries starting at index i (Coeffs/CoeffStep) and at the
 * wrapped index o (Values). */
63 for(i
= 0;i
< IrSize
;i
+= 2)
65 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
67 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
68 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
69 vals
= _mm_load_ps(&Values
[o
][0]);
70 imp0
= _mm_mul_ps(lrlr
, coeffs
);
71 coeffs
= _mm_add_ps(coeffs
, deltas
);
72 vals
= _mm_add_ps(imp0
, vals
);
73 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
74 _mm_store_ps(&Values
[o
][0], vals
);
/* ApplyCoeffs: SSE helper that multiplies the pair (left, right) by per-tap
 * coefficient pairs and accumulates the products into Values. Unlike
 * ApplyCoeffsStep, the visible code never writes to Coeffs.
 *
 *   Offset     - base index into Values; every access is wrapped with
 *                HRIR_MASK.
 *   Values     - accumulation buffer of [2]-float entries, updated in place.
 *   Coeffs     - coefficient pairs, only read here.
 *   left/right - values multiplied into lanes 0/2 and 1/3 respectively.
 *
 * NOTE(review): coeffs, imp0, imp1, IrSize and the loop index i are used
 * below but their declarations are not visible in this view, and neither is
 * the condition that selects between the two code paths -- confirm against
 * the full file.
 */
79 static __inline
void ApplyCoeffs(ALuint Offset
, ALfloat (*RESTRICT Values
)[2],
81 ALfloat (*RESTRICT Coeffs
)[2],
82 ALfloat left
, ALfloat right
)
/* lrlr = {left, right, left, right}: one register spans two consecutive
 * [2]-float entries. */
84 const __m128 lrlr
= { left
, right
, left
, right
};
85 __m128 vals
= _mm_setzero_ps();
/* First path: 64-bit (two-float) accesses to Values at o0 and o1, full
 * 128-bit aligned accesses at o2 inside the loop.
 * NOTE(review): presumably taken when Offset is odd -- the guarding
 * condition is not visible here. */
91 const ALuint o0
= Offset
&HRIR_MASK
;
92 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* Tap 0: the product covers Coeffs[0] and Coeffs[1], but only its low two
 * lanes are added to Values[o0]; the high lanes are consumed by the loop. */
95 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
96 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
97 imp0
= _mm_mul_ps(lrlr
, coeffs
);
98 vals
= _mm_add_ps(imp0
, vals
);
99 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
/* Each iteration adds a combination of the previous product (imp0) and the
 * new product (imp1) to the four floats starting at Values[o2]. */
100 for(i
= 1;i
< IrSize
-1;i
+= 2)
102 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
104 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
105 vals
= _mm_load_ps(&Values
[o2
][0]);
106 imp1
= _mm_mul_ps(lrlr
, coeffs
);
/* Shuffle result lanes: {imp0[2], imp0[3], imp1[0], imp1[1]}. */
107 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
108 vals
= _mm_add_ps(imp0
, vals
);
109 _mm_store_ps(&Values
[o2
][0], vals
);
/* NOTE(review): no visible statement carries imp1 over into imp0 for the
 * next iteration, and the end of the loop body is not visible -- confirm
 * against the full file. */
/* Entry o1: the upper two lanes of imp0 are moved down and added to
 * Values[o1] with a 64-bit load/store. */
112 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
113 imp0
= _mm_movehl_ps(imp0
, imp0
);
114 vals
= _mm_add_ps(imp0
, vals
);
115 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Second path: full 128-bit aligned accesses; each iteration accumulates
 * lrlr * Coeffs[i..i+1] into the two Values entries starting at o. */
119 for(i
= 0;i
< IrSize
;i
+= 2)
121 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
123 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
124 vals
= _mm_load_ps(&Values
[o
][0]);
125 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
126 _mm_store_ps(&Values
[o
][0], vals
);
132 #include "mixer_inc.c"
/* MixDirect_SSE: for each output channel c below MaxChannels, adds
 * data[pos] * params->Gains[srcchan][c] into Device->DryBuffer[c] starting
 * at OutPos, four samples at a time with SSE and then one at a time for the
 * remainder. It also adjusts the device's ClickRemoval and PendingClicks
 * accumulators for that channel.
 *
 *   Source      - not referenced in the visible lines.
 *   Device      - supplies DryBuffer, ClickRemoval and PendingClicks.
 *   params      - supplies the per-channel gains, Gains[srcchan][c].
 *   data        - input samples, read from index 0.
 *   srcchan     - first index into params->Gains.
 *   OutPos      - offset into each DryBuffer row where mixing starts.
 *   SamplesToDo - compared against OutPos+pos after mixing.
 *   BufferSize  - number of samples to mix.
 *
 * NOTE(review): the declarations of c, pos, DrySend and gain are not visible
 * in this view.
 */
136 void MixDirect_SSE(ALsource
*Source
, ALCdevice
*Device
, DirectParams
*params
,
137 const ALfloat
*RESTRICT data
, ALuint srcchan
,
138 ALuint OutPos
, ALuint SamplesToDo
, ALuint BufferSize
)
140 ALfloat (*RESTRICT DryBuffer
)[BUFFERSIZE
] = Device
->DryBuffer
;
141 ALfloat
*RESTRICT ClickRemoval
= Device
->ClickRemoval
;
142 ALfloat
*RESTRICT PendingClicks
= Device
->PendingClicks
;
148 for(c
= 0;c
< MaxChannels
;c
++)
152 DrySend
= params
->Gains
[srcchan
][c
];
/* NOTE(review): the statement guarded by this test is not visible;
 * presumably channels with negligible gain are skipped -- confirm. */
153 if(DrySend
< 0.00001f
)
/* NOTE(review): any condition guarding this update is not visible here. */
157 ClickRemoval
[c
] -= data
[0]*DrySend
;
/* Vector path: _mm_load_ps/_mm_store_ps require 16-byte aligned addresses,
 * so this assumes data and &DryBuffer[c][OutPos] are 16-byte aligned --
 * TODO confirm against callers.
 * NOTE(review): BufferSize is ALuint; if that type is unsigned, BufferSize-3
 * wraps around when BufferSize < 3 and this loop would run far past the
 * buffers. Verify callers never pass fewer than 3 samples, or rewrite the
 * bound so it cannot wrap (e.g. pos+3 < BufferSize). */
159 gain
= _mm_set1_ps(DrySend
);
160 for(pos
= 0;pos
< BufferSize
-3;pos
+= 4)
162 const __m128 val4
= _mm_load_ps(&data
[pos
]);
163 __m128 dry4
= _mm_load_ps(&DryBuffer
[c
][OutPos
+pos
]);
164 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain
));
165 _mm_store_ps(&DryBuffer
[c
][OutPos
+pos
], dry4
);
/* Scalar tail: handles the samples left over after the 4-wide loop. */
167 for(;pos
< BufferSize
;pos
++)
168 DryBuffer
[c
][OutPos
+pos
] += data
[pos
]*DrySend
;
/* After the loops pos has reached BufferSize, so data[pos] reads one sample
 * past the mixed range; assumes data holds at least BufferSize+1 samples --
 * TODO confirm against callers. */
170 if(OutPos
+pos
== SamplesToDo
)
171 PendingClicks
[c
] += data
[pos
]*DrySend
;
/* MixSend_SSE: adds data[pos] * params->Gain into row 0 of the effect slot's
 * WetBuffer starting at OutPos, four samples at a time with SSE and then one
 * at a time for the remainder. It also adjusts element 0 of the slot's
 * ClickRemoval and PendingClicks accumulators.
 *
 *   params      - supplies the target Slot and the send Gain.
 *   data        - input samples, read from index 0.
 *   OutPos      - offset into WetBuffer[0] where mixing starts.
 *   SamplesToDo - compared against OutPos+pos after mixing.
 *   BufferSize  - number of samples to mix.
 *
 * NOTE(review): the declarations of pos and gain are not visible in this
 * view.
 */
176 void MixSend_SSE(SendParams
*params
, const ALfloat
*RESTRICT data
,
177 ALuint OutPos
, ALuint SamplesToDo
, ALuint BufferSize
)
179 ALeffectslot
*Slot
= params
->Slot
;
180 ALfloat (*RESTRICT WetBuffer
)[BUFFERSIZE
] = Slot
->WetBuffer
;
181 ALfloat
*RESTRICT WetClickRemoval
= Slot
->ClickRemoval
;
182 ALfloat
*RESTRICT WetPendingClicks
= Slot
->PendingClicks
;
183 const ALfloat WetGain
= params
->Gain
;
/* NOTE(review): the statement guarded by this test is not visible;
 * presumably the function returns early for negligible gain -- confirm. */
187 if(WetGain
< 0.00001f
)
/* NOTE(review): any condition guarding this update is not visible here. */
191 WetClickRemoval
[0] -= data
[0] * WetGain
;
/* Vector path: _mm_load_ps/_mm_store_ps require 16-byte aligned addresses,
 * so this assumes data and &WetBuffer[0][OutPos] are 16-byte aligned --
 * TODO confirm against callers.
 * NOTE(review): BufferSize is ALuint; if that type is unsigned, BufferSize-3
 * wraps around when BufferSize < 3 and this loop would run far past the
 * buffers. Verify callers never pass fewer than 3 samples, or rewrite the
 * bound so it cannot wrap (e.g. pos+3 < BufferSize). */
193 gain
= _mm_set1_ps(WetGain
);
194 for(pos
= 0;pos
< BufferSize
-3;pos
+=4)
196 const __m128 val4
= _mm_load_ps(&data
[pos
]);
197 __m128 wet4
= _mm_load_ps(&WetBuffer
[0][OutPos
+pos
]);
198 wet4
= _mm_add_ps(wet4
, _mm_mul_ps(val4
, gain
));
199 _mm_store_ps(&WetBuffer
[0][OutPos
+pos
], wet4
);
/* Scalar tail: handles the samples left over after the 4-wide loop. */
201 for(;pos
< BufferSize
;pos
++)
202 WetBuffer
[0][OutPos
+pos
] += data
[pos
] * WetGain
;
/* After the loops pos has reached BufferSize, so data[pos] reads one sample
 * past the mixed range; assumes data holds at least BufferSize+1 samples --
 * TODO confirm against callers. */
204 if(OutPos
+pos
== SamplesToDo
)
205 WetPendingClicks
[0] += data
[pos
] * WetGain
;