3 #ifdef HAVE_XMMINTRIN_H
13 #include "mixer_defs.h"
15 static __inline ALfloat
lerp32(const ALfloat
*vals
, ALint step
, ALuint frac
)
16 { return lerp(vals
[0], vals
[step
], frac
* (1.0f
/FRACTIONONE
)); }
18 void Resample_lerp32_SSE(const ALfloat
*data
, ALuint frac
,
19 ALuint increment
, ALuint NumChannels
, ALfloat
*RESTRICT OutBuffer
,
22 ALIGN(16) float value
[3][4];
26 for(i
= 0;i
< BufferSize
+1-3;i
+=4)
31 value
[0][j
] = data
[(pos
)*NumChannels
];
32 value
[1][j
] = data
[(pos
+1)*NumChannels
];
33 value
[2][j
] = frac
* (1.0f
/FRACTIONONE
);
36 pos
+= frac
>>FRACTIONBITS
;
40 x
= _mm_load_ps(value
[0]);
41 y
= _mm_load_ps(value
[1]);
44 a
= _mm_load_ps(value
[2]);
49 _mm_store_ps(&OutBuffer
[i
], x
);
51 for(;i
< BufferSize
+1;i
++)
53 OutBuffer
[i
] = lerp32(data
+ pos
*NumChannels
, NumChannels
, frac
);
56 pos
+= frac
>>FRACTIONBITS
;
63 static __inline
void ApplyCoeffsStep(ALuint Offset
, ALfloat (*RESTRICT Values
)[2],
65 ALfloat (*RESTRICT Coeffs
)[2],
66 ALfloat (*RESTRICT CoeffStep
)[2],
67 ALfloat left
, ALfloat right
)
69 const __m128 lrlr
= { left
, right
, left
, right
};
70 __m128 coeffs
, deltas
, imp0
, imp1
;
71 __m128 vals
= _mm_setzero_ps();
76 const ALuint o0
= Offset
&HRIR_MASK
;
77 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
79 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
80 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
81 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
82 imp0
= _mm_mul_ps(lrlr
, coeffs
);
83 coeffs
= _mm_add_ps(coeffs
, deltas
);
84 vals
= _mm_add_ps(imp0
, vals
);
85 _mm_store_ps(&Coeffs
[0][0], coeffs
);
86 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
87 for(i
= 1;i
< IrSize
-1;i
+= 2)
89 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
91 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
92 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
93 vals
= _mm_load_ps(&Values
[o2
][0]);
94 imp1
= _mm_mul_ps(lrlr
, coeffs
);
95 coeffs
= _mm_add_ps(coeffs
, deltas
);
96 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
97 vals
= _mm_add_ps(imp0
, vals
);
98 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
99 _mm_store_ps(&Values
[o2
][0], vals
);
102 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
103 imp0
= _mm_movehl_ps(imp0
, imp0
);
104 vals
= _mm_add_ps(imp0
, vals
);
105 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
109 for(i
= 0;i
< IrSize
;i
+= 2)
111 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
113 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
114 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
115 vals
= _mm_load_ps(&Values
[o
][0]);
116 imp0
= _mm_mul_ps(lrlr
, coeffs
);
117 coeffs
= _mm_add_ps(coeffs
, deltas
);
118 vals
= _mm_add_ps(imp0
, vals
);
119 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
120 _mm_store_ps(&Values
[o
][0], vals
);
125 static __inline
void ApplyCoeffs(ALuint Offset
, ALfloat (*RESTRICT Values
)[2],
127 ALfloat (*RESTRICT Coeffs
)[2],
128 ALfloat left
, ALfloat right
)
130 const __m128 lrlr
= { left
, right
, left
, right
};
131 __m128 vals
= _mm_setzero_ps();
137 const ALuint o0
= Offset
&HRIR_MASK
;
138 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
141 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
142 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
143 imp0
= _mm_mul_ps(lrlr
, coeffs
);
144 vals
= _mm_add_ps(imp0
, vals
);
145 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
146 for(i
= 1;i
< IrSize
-1;i
+= 2)
148 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
150 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
151 vals
= _mm_load_ps(&Values
[o2
][0]);
152 imp1
= _mm_mul_ps(lrlr
, coeffs
);
153 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
154 vals
= _mm_add_ps(imp0
, vals
);
155 _mm_store_ps(&Values
[o2
][0], vals
);
158 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
159 imp0
= _mm_movehl_ps(imp0
, imp0
);
160 vals
= _mm_add_ps(imp0
, vals
);
161 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
165 for(i
= 0;i
< IrSize
;i
+= 2)
167 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
169 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
170 vals
= _mm_load_ps(&Values
[o
][0]);
171 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
172 _mm_store_ps(&Values
[o
][0], vals
);
178 void MixDirect_SSE(ALsource
*Source
, ALCdevice
*Device
, DirectParams
*params
,
179 const ALfloat
*RESTRICT data
, ALuint srcchan
,
180 ALuint OutPos
, ALuint SamplesToDo
, ALuint BufferSize
)
182 ALfloat (*RESTRICT DryBuffer
)[BUFFERSIZE
] = Device
->DryBuffer
;
183 ALfloat
*RESTRICT ClickRemoval
= Device
->ClickRemoval
;
184 ALfloat
*RESTRICT PendingClicks
= Device
->PendingClicks
;
185 ALfloat DrySend
[MaxChannels
];
190 for(c
= 0;c
< MaxChannels
;c
++)
191 DrySend
[c
] = params
->Gains
[srcchan
][c
];
196 for(c
= 0;c
< MaxChannels
;c
++)
197 ClickRemoval
[c
] -= data
[pos
]*DrySend
[c
];
199 for(c
= 0;c
< MaxChannels
;c
++)
201 const __m128 gain
= _mm_set1_ps(DrySend
[c
]);
202 for(pos
= 0;pos
< BufferSize
-3;pos
+= 4)
204 const __m128 val4
= _mm_load_ps(&data
[pos
]);
205 __m128 dry4
= _mm_load_ps(&DryBuffer
[c
][OutPos
+pos
]);
206 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain
));
207 _mm_store_ps(&DryBuffer
[c
][OutPos
+pos
], dry4
);
213 for(c
= 0;c
< MaxChannels
;c
++)
216 for(;pos
< BufferSize
;pos
++)
217 DryBuffer
[c
][OutPos
+pos
] += data
[pos
]*DrySend
[c
];
220 if(OutPos
+pos
== SamplesToDo
)
222 for(c
= 0;c
< MaxChannels
;c
++)
223 PendingClicks
[c
] += data
[pos
]*DrySend
[c
];
230 #include "mixer_inc.c"