3 #ifdef HAVE_XMMINTRIN_H
13 #include "mixer_defs.h"
/* ApplyCoeffsStep: adds the left/right-scaled coefficient pairs to the Values
 * buffer and advances every coefficient it touches by the matching CoeffStep
 * entry. Work is done four floats (two {left, right} pairs) per SSE
 * operation.
 *
 *   Offset     - base index into Values; every index used is wrapped with
 *                HRIR_MASK.
 *   Values     - buffer of float pairs, accumulated into in place.
 *   Coeffs     - coefficient pairs; loaded, incremented, and stored back.
 *   CoeffStep  - per-coefficient increments; only loaded from here.
 *   left/right - gains, replicated as {left, right, left, right}.
 *
 * NOTE(review): this listing is lossy. Every source line carries a stray
 * line-number prefix, and the braces, the declarations of `i` and `IrSize`,
 * and the test that chooses between the two code paths below are not
 * visible. Confirm the control-flow remarks in these comments against the
 * upstream file.
 */
16 static __inline
void ApplyCoeffsStep(ALuint Offset
, ALfloat (*RESTRICT Values
)[2],
18 ALfloat (*RESTRICT Coeffs
)[2],
19 ALfloat (*RESTRICT CoeffStep
)[2],
20 ALfloat left
, ALfloat right
)
/* Gains laid out to line up with two consecutive {left, right} pairs. */
22 const __m128 lrlr
= { left
, right
, left
, right
};
23 __m128 coeffs
, deltas
, imp0
, imp1
;
/* Zeroed so the two lanes that _mm_loadl_pi leaves alone hold a known value. */
24 __m128 vals
= _mm_setzero_ps();
/* Path 1: a single pair is handled on its own at each end (indices o0 and
 * o1) and the loop in between updates two pairs at a time starting at
 * Offset+1.
 * NOTE(review): presumably this path is taken when Offset is odd, so that
 * the four-float accesses to Values begin on an even index (_mm_load_ps and
 * _mm_store_ps require 16-byte aligned addresses). The selecting condition
 * is not visible here -- confirm. */
29 const ALuint o0
= Offset
&HRIR_MASK
;
30 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* Leading pair: all four lanes starting at Coeffs[0] are scaled, stepped and
 * stored back, but only the low two lanes of the sum are loaded from and
 * written to Values[o0]. */
32 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
33 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
34 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
35 imp0
= _mm_mul_ps(lrlr
, coeffs
);
36 coeffs
= _mm_add_ps(coeffs
, deltas
);
37 vals
= _mm_add_ps(imp0
, vals
);
38 _mm_store_ps(&Coeffs
[0][0], coeffs
);
39 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
/* Middle section: coefficients are read from index i+1 while Values is
 * updated at index Offset+i, so each store to Values straddles two
 * coefficient loads. */
40 for(i
= 1;i
< IrSize
-1;i
+= 2)
42 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
44 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
45 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
46 vals
= _mm_load_ps(&Values
[o2
][0]);
47 imp1
= _mm_mul_ps(lrlr
, coeffs
);
48 coeffs
= _mm_add_ps(coeffs
, deltas
);
/* Result lanes are { imp0[2], imp0[3], imp1[0], imp1[1] }: the upper pair of
 * imp0 followed by the lower pair of imp1. */
49 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
50 vals
= _mm_add_ps(imp0
, vals
);
51 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
52 _mm_store_ps(&Values
[o2
][0], vals
);
/* NOTE(review): the listing jumps from line 52 to line 55, so no statement
 * that carries imp1 over into imp0 for the next iteration is visible, nor is
 * the end of the loop body -- confirm against upstream. */
/* Trailing pair: the upper two lanes of imp0 are copied into the lower two
 * and added to Values[o1]; again only two floats are loaded and stored. */
55 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
56 imp0
= _mm_movehl_ps(imp0
, imp0
);
57 vals
= _mm_add_ps(imp0
, vals
);
58 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Path 2: coefficient index and Values index advance together, two pairs per
 * iteration, using full four-float loads and stores throughout. */
62 for(i
= 0;i
< IrSize
;i
+= 2)
64 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
66 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
67 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
68 vals
= _mm_load_ps(&Values
[o
][0]);
69 imp0
= _mm_mul_ps(lrlr
, coeffs
);
70 coeffs
= _mm_add_ps(coeffs
, deltas
);
71 vals
= _mm_add_ps(imp0
, vals
);
72 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
73 _mm_store_ps(&Values
[o
][0], vals
);
/* ApplyCoeffs: adds the left/right-scaled coefficient pairs to the Values
 * buffer, four floats (two {left, right} pairs) per SSE operation. Unlike
 * the stepping variant, Coeffs is only loaded from in the visible code and
 * never stored to.
 *
 *   Offset     - base index into Values; every index used is wrapped with
 *                HRIR_MASK.
 *   Values     - buffer of float pairs, accumulated into in place.
 *   Coeffs     - coefficient pairs to scale and add.
 *   left/right - gains, replicated as {left, right, left, right}.
 *
 * NOTE(review): this listing is lossy. Every source line carries a stray
 * line-number prefix, and the braces, the declarations of `coeffs`, `imp0`,
 * `imp1`, `i` and `IrSize`, and the test that chooses between the two code
 * paths below are not visible. Confirm the control-flow remarks in these
 * comments against the upstream file.
 */
78 static __inline
void ApplyCoeffs(ALuint Offset
, ALfloat (*RESTRICT Values
)[2],
80 ALfloat (*RESTRICT Coeffs
)[2],
81 ALfloat left
, ALfloat right
)
/* Gains laid out to line up with two consecutive {left, right} pairs. */
83 const __m128 lrlr
= { left
, right
, left
, right
};
/* Zeroed so the two lanes that _mm_loadl_pi leaves alone hold a known value. */
84 __m128 vals
= _mm_setzero_ps();
/* Path 1: a single pair is handled on its own at each end (indices o0 and
 * o1) and the loop in between updates two pairs at a time starting at
 * Offset+1.
 * NOTE(review): presumably this path is taken when Offset is odd, so that
 * the four-float accesses to Values begin on an even index (_mm_load_ps and
 * _mm_store_ps require 16-byte aligned addresses). The selecting condition
 * is not visible here -- confirm. */
90 const ALuint o0
= Offset
&HRIR_MASK
;
91 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* Leading pair: four coefficient lanes are scaled, but only the low two
 * lanes of the sum are loaded from and written to Values[o0]. */
94 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
95 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
96 imp0
= _mm_mul_ps(lrlr
, coeffs
);
97 vals
= _mm_add_ps(imp0
, vals
);
98 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
/* Middle section: coefficients are read from index i+1 while Values is
 * updated at index Offset+i. */
99 for(i
= 1;i
< IrSize
-1;i
+= 2)
101 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
103 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
104 vals
= _mm_load_ps(&Values
[o2
][0]);
105 imp1
= _mm_mul_ps(lrlr
, coeffs
);
/* Result lanes are { imp0[2], imp0[3], imp1[0], imp1[1] }: the upper pair of
 * imp0 followed by the lower pair of imp1. */
106 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
107 vals
= _mm_add_ps(imp0
, vals
);
108 _mm_store_ps(&Values
[o2
][0], vals
);
/* NOTE(review): the listing jumps from line 108 to line 111, so no statement
 * that carries imp1 over into imp0 for the next iteration is visible, nor is
 * the end of the loop body -- confirm against upstream. */
/* Trailing pair: the upper two lanes of imp0 are copied into the lower two
 * and added to Values[o1]; only two floats are loaded and stored. */
111 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
112 imp0
= _mm_movehl_ps(imp0
, imp0
);
113 vals
= _mm_add_ps(imp0
, vals
);
114 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Path 2: coefficient index and Values index advance together, two pairs per
 * iteration, using full four-float loads and stores on Values. */
118 for(i
= 0;i
< IrSize
;i
+= 2)
120 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
122 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
123 vals
= _mm_load_ps(&Values
[o
][0]);
124 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
125 _mm_store_ps(&Values
[o
][0], vals
);
/* MixDirect_SSE: passes input samples from `data` through the filter state
 * taken from params->iirFilter and accumulates each filtered sample, scaled
 * by the gains in params->Gains[srcchan], into the rows of
 * Device->DryBuffer beginning at OutPos. It also adjusts
 * Device->ClickRemoval and Device->PendingClicks with one extra filtered
 * sample each.
 *
 *   Source      - not referenced by any visible line of this listing.
 *   Device      - supplies DryBuffer, ClickRemoval and PendingClicks.
 *   params      - supplies the filter state and the per-channel gain table.
 *   data        - input samples.
 *   srcchan     - selects the gain row and is forwarded to the filter calls.
 *   OutPos      - first DryBuffer row to write.
 *   SamplesToDo - compared against OutPos before the PendingClicks update.
 *   BufferSize  - number of input samples to mix.
 *
 * NOTE(review): this listing is lossy. Every source line carries a stray
 * line-number prefix, and the braces, the declarations of `DryFilter`,
 * `pos`, `c`, `val4` and `dry4`, any statements that advance OutPos, and the
 * guard (if any) around the ClickRemoval update are not visible. Confirm
 * against the upstream file.
 */
131 void MixDirect_SSE(ALsource
*Source
, ALCdevice
*Device
, DirectParams
*params
,
132 const ALfloat
*RESTRICT data
, ALuint srcchan
,
133 ALuint OutPos
, ALuint SamplesToDo
, ALuint BufferSize
)
135 ALfloat (*RESTRICT DryBuffer
)[MaxChannels
];
136 ALfloat
*RESTRICT ClickRemoval
, *RESTRICT PendingClicks
;
/* Both scratch arrays are declared with ALIGN(16); `value` is the operand of
 * _mm_load_ps/_mm_store_ps below, which require 16-byte alignment. */
137 ALIGN(16) ALfloat DrySend
[MaxChannels
];
138 ALIGN(16) ALfloat value
[4];
144 DryBuffer
= Device
->DryBuffer
;
145 ClickRemoval
= Device
->ClickRemoval
;
146 PendingClicks
= Device
->PendingClicks
;
/* NOTE(review): the pilcrow in the initializer below looks like a
 * mis-decoded "&params" (the text "&para" read as an HTML entity), i.e. the
 * address of params->iirFilter -- confirm and repair against upstream. */
147 DryFilter
= ¶ms
->iirFilter
;
/* Copy this source channel's gains into the local aligned array. */
149 for(c
= 0;c
< MaxChannels
;c
++)
150 DrySend
[c
] = params
->Gains
[srcchan
][c
];
/* Subtract one lpFilter2PC result, scaled per channel, from ClickRemoval.
 * NOTE(review): the value of `pos` at this point and any condition guarding
 * this update are not visible in the listing -- confirm. */
155 value
[0] = lpFilter2PC(DryFilter
, srcchan
, data
[pos
]);
156 for(c
= 0;c
< MaxChannels
;c
++)
157 ClickRemoval
[c
] -= value
[0]*DrySend
[c
];
/* Main loop: four input samples per iteration.
 * NOTE(review): BufferSize is an ALuint; if that type is unsigned (as in the
 * OpenAL headers), BufferSize-3 wraps around when BufferSize is below 3 and
 * this loop would then run far past the end of `data`. Confirm callers
 * guarantee BufferSize >= 3, or compare BufferSize-pos against 3 instead. */
159 for(pos
= 0;pos
< BufferSize
-3;pos
+= 4)
/* Filter four consecutive samples one at a time into `value`, then load them
 * as a single vector. */
163 value
[0] = lpFilter2P(DryFilter
, srcchan
, data
[pos
]);
164 value
[1] = lpFilter2P(DryFilter
, srcchan
, data
[pos
+1]);
165 value
[2] = lpFilter2P(DryFilter
, srcchan
, data
[pos
+2]);
166 value
[3] = lpFilter2P(DryFilter
, srcchan
, data
[pos
+3]);
167 val4
= _mm_load_ps(value
);
169 for(c
= 0;c
< MaxChannels
;c
++)
171 const __m128 gain
= _mm_set1_ps(DrySend
[c
]);
/* The four output samples for channel c sit in four different DryBuffer
 * rows, so they are gathered through `value` before the vector load. */
174 value
[0] = DryBuffer
[OutPos
][c
];
175 value
[1] = DryBuffer
[OutPos
+1][c
];
176 value
[2] = DryBuffer
[OutPos
+2][c
];
177 value
[3] = DryBuffer
[OutPos
+3][c
];
178 dry4
= _mm_load_ps(value
);
180 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain
));
/* Scatter the four sums back to their rows. */
182 _mm_store_ps(value
, dry4
);
183 DryBuffer
[OutPos
][c
] = value
[0];
184 DryBuffer
[OutPos
+1][c
] = value
[1];
185 DryBuffer
[OutPos
+2][c
] = value
[2];
186 DryBuffer
[OutPos
+3][c
] = value
[3];
/* Remainder loop: continues from the current `pos`, one sample at a time. */
191 for(;pos
< BufferSize
;pos
++)
193 value
[0] = lpFilter2P(DryFilter
, srcchan
, data
[pos
]);
194 for(c
= 0;c
< MaxChannels
;c
++)
195 DryBuffer
[OutPos
][c
] += value
[0]*DrySend
[c
];
/* When OutPos equals SamplesToDo, add one more lpFilter2PC result, scaled
 * per channel, to PendingClicks.
 * NOTE(review): by the preceding loop condition `pos` is no longer below
 * BufferSize here, so data[pos] reads at or beyond index BufferSize --
 * presumably `data` carries at least one extra sample; confirm. */
198 if(OutPos
== SamplesToDo
)
200 value
[0] = lpFilter2PC(DryFilter
, srcchan
, data
[pos
]);
201 for(c
= 0;c
< MaxChannels
;c
++)
202 PendingClicks
[c
] += value
[0]*DrySend
[c
];
209 #include "mixer_inc.c"