11 #include "alAuxEffectSlot.h"
12 #include "mixer_defs.h"
15 static inline void SetupCoeffs(ALfloat (*restrict OutCoeffs
)[2],
16 const HrtfParams
*hrtfparams
,
17 ALuint IrSize
, ALuint Counter
)
19 const __m128 counter4
= _mm_set1_ps((float)Counter
);
23 for(i
= 0;i
< IrSize
;i
+= 2)
25 step4
= _mm_load_ps(&hrtfparams
->CoeffStep
[i
][0]);
26 coeffs
= _mm_load_ps(&hrtfparams
->Coeffs
[i
][0]);
27 coeffs
= _mm_sub_ps(coeffs
, _mm_mul_ps(step4
, counter4
));
28 _mm_store_ps(&OutCoeffs
[i
][0], coeffs
);
/* ApplyCoeffsStep: adds the input pair (left, right), scaled by the
 * coefficient rows in Coeffs, into Values, and advances Coeffs in place by
 * CoeffStep.
 *
 * Values is indexed through (Offset + i) & HRIR_MASK (presumably a mask that
 * wraps the index -- HRIR_MASK is defined elsewhere). Each _mm_load_ps /
 * _mm_store_ps moves four floats, i.e. two consecutive [2] rows, and needs a
 * 16-byte aligned address; _mm_loadl_pi / _mm_storel_pi move a single row.
 *
 * NOTE(review): this listing has gaps (the embedded numbering skips lines).
 * IrSize and i are used below but their declarations are not visible, and the
 * condition that selects between the two code paths is not visible either.
 * Presumably the first path is taken for an odd Offset -- confirm against the
 * full file.
 */
32 static inline void ApplyCoeffsStep(ALuint Offset
, ALfloat (*restrict Values
)[2],
34 ALfloat (*restrict Coeffs
)[2],
35 const ALfloat (*restrict CoeffStep
)[2],
36 ALfloat left
, ALfloat right
)
/* Input pair replicated as {left, right, left, right} so one multiply scales
 * two coefficient rows at once. */
38 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
39 __m128 coeffs
, deltas
, imp0
, imp1
;
/* Zeroed so the upper lanes are defined when only the low half is loaded. */
40 __m128 vals
= _mm_setzero_ps();
/* Path 1: the first (o0) and last (o1) rows of Values are accessed one row at
 * a time; the rows in between are accessed two at a time. */
45 const ALuint o0
= Offset
&HRIR_MASK
;
46 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* First row: Coeffs rows 0 and 1 are scaled and stepped together, but only the
 * low two lanes of the product are added to Values[o0]. The high two lanes
 * stay in imp0 for the next stage. */
48 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
49 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
50 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
51 imp0
= _mm_mul_ps(lrlr
, coeffs
);
52 coeffs
= _mm_add_ps(coeffs
, deltas
);
53 vals
= _mm_add_ps(imp0
, vals
);
54 _mm_store_ps(&Coeffs
[0][0], coeffs
);
55 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
/* Middle rows: Coeffs rows i+1 and i+2 are scaled into imp1 and stepped. The
 * shuffle builds {imp0[2], imp0[3], imp1[0], imp1[1]}, i.e. the high half of
 * the previous product followed by the low half of the new one, which is then
 * added to the two Values rows starting at o2. */
56 for(i
= 1;i
< IrSize
-1;i
+= 2)
58 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
60 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
61 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
62 vals
= _mm_load_ps(&Values
[o2
][0]);
63 imp1
= _mm_mul_ps(lrlr
, coeffs
);
64 coeffs
= _mm_add_ps(coeffs
, deltas
);
65 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
66 vals
= _mm_add_ps(imp0
, vals
);
67 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
68 _mm_store_ps(&Values
[o2
][0], vals
);
/* NOTE(review): no statement carrying imp1 over into imp0 for the next
 * iteration is visible here; the embedded numbering skips lines at this point,
 * so it is presumably missing from the listing only -- confirm. */
/* Last row: the high half of imp0 is moved into the low lanes and added to
 * the single row Values[o1]. */
71 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
72 imp0
= _mm_movehl_ps(imp0
, imp0
);
73 vals
= _mm_add_ps(imp0
, vals
);
74 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Path 2: every iteration loads rows i and i+1 of Coeffs, CoeffStep and the
 * two Values rows starting at o directly; nothing is carried between
 * iterations. */
78 for(i
= 0;i
< IrSize
;i
+= 2)
80 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
82 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
83 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
84 vals
= _mm_load_ps(&Values
[o
][0]);
85 imp0
= _mm_mul_ps(lrlr
, coeffs
);
86 coeffs
= _mm_add_ps(coeffs
, deltas
);
87 vals
= _mm_add_ps(imp0
, vals
);
88 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
89 _mm_store_ps(&Values
[o
][0], vals
);
/* ApplyCoeffs: adds the input pair (left, right), scaled by the coefficient
 * rows in Coeffs, into Values. Unlike ApplyCoeffsStep, nothing is written
 * back to Coeffs in the visible code.
 *
 * Values is indexed through (Offset + i) & HRIR_MASK (presumably a mask that
 * wraps the index -- HRIR_MASK is defined elsewhere). Each _mm_load_ps /
 * _mm_store_ps moves four floats, i.e. two consecutive [2] rows, and needs a
 * 16-byte aligned address; _mm_loadl_pi / _mm_storel_pi move a single row.
 *
 * NOTE(review): this listing has gaps (the embedded numbering skips lines).
 * IrSize, i, coeffs, imp0 and imp1 are used below but their declarations are
 * not visible, and neither is the condition that selects between the two code
 * paths. Presumably the first path is taken for an odd Offset -- confirm
 * against the full file.
 */
94 static inline void ApplyCoeffs(ALuint Offset
, ALfloat (*restrict Values
)[2],
96 ALfloat (*restrict Coeffs
)[2],
97 ALfloat left
, ALfloat right
)
/* Input pair replicated as {left, right, left, right} so one multiply scales
 * two coefficient rows at once. */
99 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
/* Zeroed so the upper lanes are defined when only the low half is loaded. */
100 __m128 vals
= _mm_setzero_ps();
/* Path 1: the first (o0) and last (o1) rows of Values are accessed one row at
 * a time; the rows in between are accessed two at a time. */
106 const ALuint o0
= Offset
&HRIR_MASK
;
107 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* First row: Coeffs rows 0 and 1 are scaled together, but only the low two
 * lanes of the product are added to Values[o0]. The high two lanes stay in
 * imp0 for the next stage. */
110 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
111 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
112 imp0
= _mm_mul_ps(lrlr
, coeffs
);
113 vals
= _mm_add_ps(imp0
, vals
);
114 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
/* Middle rows: Coeffs rows i+1 and i+2 are scaled into imp1. The shuffle
 * builds {imp0[2], imp0[3], imp1[0], imp1[1]}, i.e. the high half of the
 * previous product followed by the low half of the new one, which is then
 * added to the two Values rows starting at o2. */
115 for(i
= 1;i
< IrSize
-1;i
+= 2)
117 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
119 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
120 vals
= _mm_load_ps(&Values
[o2
][0]);
121 imp1
= _mm_mul_ps(lrlr
, coeffs
);
122 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
123 vals
= _mm_add_ps(imp0
, vals
);
124 _mm_store_ps(&Values
[o2
][0], vals
);
/* NOTE(review): no statement carrying imp1 over into imp0 for the next
 * iteration is visible here; the embedded numbering skips lines at this point,
 * so it is presumably missing from the listing only -- confirm. */
/* Last row: the high half of imp0 is moved into the low lanes and added to
 * the single row Values[o1]. */
127 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
128 imp0
= _mm_movehl_ps(imp0
, imp0
);
129 vals
= _mm_add_ps(imp0
, vals
);
130 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Path 2: every iteration scales rows i and i+1 of Coeffs and adds the result
 * to the two Values rows starting at o; nothing is carried between
 * iterations. */
134 for(i
= 0;i
< IrSize
;i
+= 2)
136 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
138 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
139 vals
= _mm_load_ps(&Values
[o
][0]);
140 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
141 _mm_store_ps(&Values
[o
][0], vals
);
147 #include "mixer_inc.c"
/* Mix_SSE: for every channel c < OutChans, adds the samples in data, scaled
 * by that channel's gain, into OutBuffer[c][OutPos + pos].
 *
 * Per channel the gain starts at Gains[c].Current. When Gains[c].Step is
 * non-zero and Counter > 0, a ramped section runs first (four samples at a
 * time while at least four remain in both BufferSize and Counter, then one at
 * a time) and Gains[c].Current is updated. The rest of the buffer is then
 * mixed with a constant gain, again four samples at a time with a scalar
 * tail.
 *
 * &data[pos] and &OutBuffer[c][OutPos+pos] are passed to _mm_load_ps /
 * _mm_store_ps, which need 16-byte aligned addresses.
 *
 * NOTE(review): this listing has gaps (the embedded numbering skips lines).
 * Not visible here: the declarations of c, pos, gain, step, gain4 and step4,
 * most of the gain4 initialiser, the `do` that pairs with the `while` below,
 * any increment of pos or gain inside the ramped loops, and the statement
 * controlled by the silence-threshold test. Check the full file before relying
 * on details of the ramped section.
 */
151 void Mix_SSE(const ALfloat
*data
, ALuint OutChans
, ALfloat (*restrict OutBuffer
)[BUFFERSIZE
],
152 MixGains
*Gains
, ALuint Counter
, ALuint OutPos
, ALuint BufferSize
)
158 for(c
= 0;c
< OutChans
;c
++)
161 gain
= Gains
[c
].Current
;
162 step
= Gains
[c
].Step
;
/* Ramped section: only entered while a gain step is pending. */
163 if(step
!= 0.0f
&& Counter
> 0)
165 /* Mix with applying gain steps in aligned multiples of 4. */
166 if(BufferSize
-pos
> 3 && Counter
-pos
> 3)
/* NOTE(review): only the last operand of the gain4 initialiser is visible.
 * Presumably the four lanes are gain, gain+step, gain+2*step and gain+3*step
 * -- confirm. */
172 gain
+ step
+ step
+ step
/* Per-iteration increment for all lanes: four steps, written as repeated
 * addition rather than a multiply. */
174 step4
= _mm_set1_ps(step
+ step
+ step
+ step
);
176 const __m128 val4
= _mm_load_ps(&data
[pos
]);
177 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
178 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
179 gain4
= _mm_add_ps(gain4
, step4
);
180 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
182 } while(BufferSize
-pos
> 3 && Counter
-pos
> 3);
/* Carry lane 0 of the ramped vector back into the scalar gain. */
183 gain
= _mm_cvtss_f32(gain4
);
185 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
186 for(;pos
< BufferSize
&& pos
< Counter
;pos
++)
188 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* NOTE(review): as listed, gain is set to Target unconditionally. The embedded
 * numbering skips lines just before this statement, so a guarding condition
 * may be missing from the listing -- confirm. */
192 gain
= Gains
[c
].Target
;
/* Store the gain reached so far back into the channel's state. */
193 Gains
[c
].Current
= gain
;
194 /* Mix until pos is aligned with 4 or the mix is done. */
195 for(;pos
< BufferSize
&& (pos
&3) != 0;pos
++)
196 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* Written as !(x > t), so the test is also true when gain is NaN.
 * NOTE(review): the controlled statement is not visible; presumably it skips
 * the rest of this channel -- confirm. */
199 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
/* Constant-gain section: four samples per iteration, then a scalar tail. */
201 gain4
= _mm_set1_ps(gain
);
202 for(;BufferSize
-pos
> 3;pos
+= 4)
204 const __m128 val4
= _mm_load_ps(&data
[pos
]);
205 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
206 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
207 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
209 for(;pos
< BufferSize
;pos
++)
210 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;