3 #ifdef HAVE_XMMINTRIN_H
13 #include "mixer_defs.h"
/* ApplyCoeffsStep: adds left/right-scaled coefficient pairs into Values and
 * then advances every coefficient pair by its matching CoeffStep entry.
 *
 *   Offset     - base index into Values; every index derived from it is
 *                masked with HRIR_MASK before use.
 *   Values     - array of float pairs that the scaled coefficients are
 *                accumulated into (read-modify-write).
 *   Coeffs     - array of float pairs; each is read, used, and written back
 *                as Coeffs + CoeffStep.
 *   CoeffStep  - array of float pairs holding the per-call increments.
 *   left/right - scale applied to element [0] / element [1] of each pair.
 *
 * NOTE(review): the line numbers embedded in this listing skip (16 -> 18,
 * 24 -> 29, 52 -> 55, ...), so some original lines are not visible here:
 * braces, the declarations of IrSize and i, and whatever condition selects
 * between the two code paths below. The comments describe only the
 * statements that are visible.
 */
16 static __inline
void ApplyCoeffsStep(ALuint Offset
, ALfloat (*RESTRICT Values
)[2],
18 ALfloat (*RESTRICT Coeffs
)[2],
19 ALfloat (*RESTRICT CoeffStep
)[2],
20 ALfloat left
, ALfloat right
)
/* Four-lane gain vector matching two consecutive [2]-pairs in memory:
 * lanes 0,2 scale element [0], lanes 1,3 scale element [1]. */
22 const __m128 lrlr
= { left
, right
, left
, right
};
23 __m128 coeffs
, deltas
, imp0
, imp1
;
24 __m128 vals
= _mm_setzero_ps();
/* ---- Path 1 ---------------------------------------------------------
 * The first and last Values pairs are touched with 64-bit half
 * loads/stores; everything in between uses full 128-bit accesses that
 * start one pair later (Offset+1, Offset+3, ...).
 * NOTE(review): presumably this path is taken when Offset is odd so that
 * the 128-bit Values accesses land on 16-byte boundaries -- the selecting
 * condition is not visible in this view; confirm against the full file. */
/* o0: index of the first Values pair; o1: index of the last one. */
29 const ALuint o0
= Offset
&HRIR_MASK
;
30 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* Load coefficient pairs 0 and 1 and their steps (4 floats each). */
32 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
33 deltas
= _mm_load_ps(&CoeffStep
[0][0]);
/* Only the low two lanes of vals are replaced with Values[o0]; the high
 * lanes keep their previous contents (zero from the initializer). */
34 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
35 imp0
= _mm_mul_ps(lrlr
, coeffs
);
/* Step the coefficients after they have been used for this call. */
36 coeffs
= _mm_add_ps(coeffs
, deltas
);
37 vals
= _mm_add_ps(imp0
, vals
);
38 _mm_store_ps(&Coeffs
[0][0], coeffs
);
/* Write back only the first pair; the high half of imp0 (scaled pair 1)
 * is carried into the loop below. */
39 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
/* Middle section: i walks the odd pair indices 1, 3, 5, ... */
40 for(i
= 1;i
< IrSize
-1;i
+= 2)
42 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
/* Coefficient pairs i+1 and i+2, their steps, and Values pairs o2, o2+1. */
44 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
45 deltas
= _mm_load_ps(&CoeffStep
[i
+1][0]);
46 vals
= _mm_load_ps(&Values
[o2
][0]);
47 imp1
= _mm_mul_ps(lrlr
, coeffs
);
48 coeffs
= _mm_add_ps(coeffs
, deltas
);
/* _MM_SHUFFLE(1, 0, 3, 2) yields { imp0[2], imp0[3], imp1[0], imp1[1] }:
 * the high half of imp0 followed by the low half of imp1. */
49 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
50 vals
= _mm_add_ps(imp0
, vals
);
51 _mm_store_ps(&Coeffs
[i
+1][0], coeffs
);
52 _mm_store_ps(&Values
[o2
][0], vals
);
/* NOTE(review): the embedded numbers jump from 52 to 55 here; any
 * statement that closes the loop body is not visible in this view. */
/* Tail: add the high half of imp0 to the last Values pair (o1) using
 * 64-bit half accesses. */
55 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
/* Move the high two lanes of imp0 into its low two lanes. */
56 imp0
= _mm_movehl_ps(imp0
, imp0
);
57 vals
= _mm_add_ps(imp0
, vals
);
58 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* ---- Path 2 ---------------------------------------------------------
 * Two pairs per iteration with full 128-bit loads/stores taken directly
 * at Offset+i; no carry between iterations is needed. */
62 for(i
= 0;i
< IrSize
;i
+= 2)
64 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
66 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
67 deltas
= _mm_load_ps(&CoeffStep
[i
][0]);
68 vals
= _mm_load_ps(&Values
[o
][0]);
69 imp0
= _mm_mul_ps(lrlr
, coeffs
);
/* Step the coefficients, accumulate the scaled ones, write both back. */
70 coeffs
= _mm_add_ps(coeffs
, deltas
);
71 vals
= _mm_add_ps(imp0
, vals
);
72 _mm_store_ps(&Coeffs
[i
][0], coeffs
);
73 _mm_store_ps(&Values
[o
][0], vals
);
/* ApplyCoeffs: adds left/right-scaled coefficient pairs into Values.
 * Unlike the stepping variant, Coeffs is only read here (no visible store
 * to it).
 *
 *   Offset     - base index into Values; every index derived from it is
 *                masked with HRIR_MASK before use.
 *   Values     - array of float pairs that the scaled coefficients are
 *                accumulated into (read-modify-write).
 *   Coeffs     - array of float pairs to scale and accumulate.
 *   left/right - scale applied to element [0] / element [1] of each pair.
 *
 * NOTE(review): the line numbers embedded in this listing skip (78 -> 80,
 * 84 -> 90, 108 -> 111, ...), so the declarations of IrSize, i, coeffs,
 * imp0 and imp1, the braces, and the condition that selects between the
 * two code paths are not visible here. The comments describe only the
 * statements that are visible.
 */
78 static __inline
void ApplyCoeffs(ALuint Offset
, ALfloat (*RESTRICT Values
)[2],
80 ALfloat (*RESTRICT Coeffs
)[2],
81 ALfloat left
, ALfloat right
)
/* Four-lane gain vector matching two consecutive [2]-pairs in memory. */
83 const __m128 lrlr
= { left
, right
, left
, right
};
84 __m128 vals
= _mm_setzero_ps();
/* ---- Path 1 ---------------------------------------------------------
 * First and last Values pairs use 64-bit half accesses; the middle uses
 * full 128-bit accesses starting one pair later.
 * NOTE(review): presumably chosen when Offset is odd so the 128-bit
 * Values accesses are 16-byte aligned -- condition not visible; confirm. */
90 const ALuint o0
= Offset
&HRIR_MASK
;
91 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* Scale coefficient pairs 0 and 1; only pair 0 is added to Values[o0]
 * now, the high half of imp0 is carried into the loop. */
94 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
95 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
96 imp0
= _mm_mul_ps(lrlr
, coeffs
);
97 vals
= _mm_add_ps(imp0
, vals
);
98 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
/* Middle section: i walks the odd pair indices 1, 3, 5, ... */
99 for(i
= 1;i
< IrSize
-1;i
+= 2)
101 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
103 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
104 vals
= _mm_load_ps(&Values
[o2
][0]);
105 imp1
= _mm_mul_ps(lrlr
, coeffs
);
/* _MM_SHUFFLE(1, 0, 3, 2) yields { imp0[2], imp0[3], imp1[0], imp1[1] }:
 * the high half of imp0 followed by the low half of imp1. */
106 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
107 vals
= _mm_add_ps(imp0
, vals
);
108 _mm_store_ps(&Values
[o2
][0], vals
);
/* NOTE(review): the embedded numbers jump from 108 to 111 here; any
 * statement that closes the loop body is not visible in this view. */
/* Tail: add the high half of imp0 to the last Values pair (o1). */
111 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
/* Move the high two lanes of imp0 into its low two lanes. */
112 imp0
= _mm_movehl_ps(imp0
, imp0
);
113 vals
= _mm_add_ps(imp0
, vals
);
114 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* ---- Path 2 ---------------------------------------------------------
 * Two pairs per iteration with full 128-bit loads/stores at Offset+i. */
118 for(i
= 0;i
< IrSize
;i
+= 2)
120 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
122 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
123 vals
= _mm_load_ps(&Values
[o
][0]);
/* Values[o..o+1] += lrlr * Coeffs[i..i+1] */
124 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
125 _mm_store_ps(&Values
[o
][0], vals
);
/* MixDirect_SSE: mixes the samples in data into each of the device's dry
 * buffer channels, scaled by that channel's gain for source channel
 * srcchan, and updates the device's ClickRemoval / PendingClicks arrays.
 *
 *   Source      - not referenced by any statement visible in this view.
 *   Device      - supplies DryBuffer, ClickRemoval and PendingClicks.
 *   params      - supplies Gains[srcchan][c] for every output channel c.
 *   data        - input samples, indexed by pos.
 *   srcchan     - row of params->Gains to use.
 *   OutPos      - offset added to pos when indexing DryBuffer.
 *   SamplesToDo - compared against OutPos+pos to decide whether
 *                 PendingClicks is updated.
 *   BufferSize  - number of samples from data to mix.
 *
 * NOTE(review): the line numbers embedded in this listing skip (138 -> 143,
 * 144 -> 149, 160 -> 166, 166 -> 169, ...), so the declarations of pos and
 * c, the braces, and several guards/assignments are not visible here. The
 * comments describe only the statements that are visible.
 */
131 void MixDirect_SSE(ALsource
*Source
, ALCdevice
*Device
, DirectParams
*params
,
132 const ALfloat
*RESTRICT data
, ALuint srcchan
,
133 ALuint OutPos
, ALuint SamplesToDo
, ALuint BufferSize
)
/* Local aliases for the device buffers this function writes to. */
135 ALfloat (*RESTRICT DryBuffer
)[BUFFERSIZE
] = Device
->DryBuffer
;
136 ALfloat
*RESTRICT ClickRemoval
= Device
->ClickRemoval
;
137 ALfloat
*RESTRICT PendingClicks
= Device
->PendingClicks
;
138 ALfloat DrySend
[MaxChannels
];
/* Take a local copy of the per-output-channel gains for this source
 * channel. */
143 for(c
= 0;c
< MaxChannels
;c
++)
144 DrySend
[c
] = params
->Gains
[srcchan
][c
];
/* Subtract the gain-scaled sample data[pos] from every ClickRemoval
 * entry.
 * NOTE(review): the guard around this loop and the value of pos at this
 * point are not visible (embedded lines 145-148 are missing); confirm
 * against the full file. */
149 for(c
= 0;c
< MaxChannels
;c
++)
150 ClickRemoval
[c
] -= data
[pos
]*DrySend
[c
];
/* Vector pass: for each output channel, mix four samples per iteration. */
152 for(c
= 0;c
< MaxChannels
;c
++)
/* Broadcast this channel's gain to all four lanes. */
154 const __m128 gain
= _mm_set1_ps(DrySend
[c
]);
/* NOTE(review): if BufferSize is an unsigned type (as the name ALuint
 * suggests) and is below 3, BufferSize-3 would wrap to a very large value
 * -- confirm callers never pass such a size. */
155 for(pos
= 0;pos
< BufferSize
-3;pos
+= 4)
/* _mm_load_ps/_mm_store_ps are the aligned forms, so &data[pos] and
 * &DryBuffer[c][OutPos+pos] presumably must be 16-byte aligned here --
 * TODO confirm what the callers guarantee for data and OutPos. */
157 const __m128 val4
= _mm_load_ps(&data
[pos
]);
158 __m128 dry4
= _mm_load_ps(&DryBuffer
[c
][OutPos
+pos
]);
/* DryBuffer[c][OutPos+pos .. +3] += data[pos .. +3] * gain */
159 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain
));
160 _mm_store_ps(&DryBuffer
[c
][OutPos
+pos
], dry4
);
/* Scalar pass: mix whatever samples remain after the vector pass, one at
 * a time, for each channel.
 * NOTE(review): pos is not visibly reset between channels in this view
 * (embedded lines 161-165 and 167-168 are missing); verify against the
 * full file that every channel starts from the same pos. */
166 for(c
= 0;c
< MaxChannels
;c
++)
169 for(;pos
< BufferSize
;pos
++)
170 DryBuffer
[c
][OutPos
+pos
] += data
[pos
]*DrySend
[c
];
/* When the position reached by the loops above lands exactly on
 * SamplesToDo, add the gain-scaled sample data[pos] to every
 * PendingClicks entry. */
173 if(OutPos
+pos
== SamplesToDo
)
175 for(c
= 0;c
< MaxChannels
;c
++)
176 PendingClicks
[c
] += data
[pos
]*DrySend
[c
];
183 #include "mixer_inc.c"