3 #ifdef HAVE_XMMINTRIN_H
13 #include "mixer_defs.h"
16 static __inline
void ApplyCoeffsStep(ALuint Offset
, ALfloat (*RESTRICT Values
)[2],
17 ALfloat (*RESTRICT Coeffs
)[2],
18 ALfloat (*RESTRICT CoeffStep
)[2],
19 ALfloat left
, ALfloat right
)
21 const __m128 lrlr
= { left
, right
, left
, right
};
22 __m128 vals
= _mm_setzero_ps();
23 __m128 coeffs
, coeffstep
;
25 for(c
= 0;c
< HRIR_LENGTH
;c
+= 2)
27 const ALuint o0
= (Offset
++)&HRIR_MASK
;
28 const ALuint o1
= (Offset
++)&HRIR_MASK
;
30 coeffs
= _mm_load_ps(&Coeffs
[c
][0]);
31 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
32 vals
= _mm_loadh_pi(vals
, (__m64
*)&Values
[o1
][0]);
34 vals
= _mm_add_ps(vals
, _mm_mul_ps(coeffs
, lrlr
));
35 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
36 _mm_storeh_pi((__m64
*)&Values
[o1
][0], vals
);
38 coeffstep
= _mm_load_ps(&CoeffStep
[c
][0]);
39 coeffs
= _mm_add_ps(coeffs
, coeffstep
);
40 _mm_store_ps(&Coeffs
[c
][0], coeffs
);
44 static __inline
void ApplyCoeffs(ALuint Offset
, ALfloat (*RESTRICT Values
)[2],
45 ALfloat (*RESTRICT Coeffs
)[2],
46 ALfloat left
, ALfloat right
)
48 const __m128 lrlr
= { left
, right
, left
, right
};
49 __m128 vals
= _mm_setzero_ps();
52 for(c
= 0;c
< HRIR_LENGTH
;c
+= 2)
54 const ALuint o0
= (Offset
++)&HRIR_MASK
;
55 const ALuint o1
= (Offset
++)&HRIR_MASK
;
57 coeffs
= _mm_load_ps(&Coeffs
[c
][0]);
58 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
59 vals
= _mm_loadh_pi(vals
, (__m64
*)&Values
[o1
][0]);
61 vals
= _mm_add_ps(vals
, _mm_mul_ps(coeffs
, lrlr
));
62 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
63 _mm_storeh_pi((__m64
*)&Values
[o1
][0], vals
);
/* MixDirect_SSE: runs the samples in data through the filter in
 * params->iirFilter and adds the result, scaled by params->Gains[srcchan][c],
 * into column c of Device->DryBuffer for every c below MaxChannels. It also
 * adjusts Device->ClickRemoval and Device->PendingClicks from a filtered
 * sample.
 *
 * Source       Not referenced by any statement visible in this listing.
 * Device       Supplies DryBuffer, ClickRemoval and PendingClicks.
 * params       Supplies the per-channel gains and the filter passed to
 *              lpFilter2P / lpFilter2PC.
 * data         Input samples, read at data[pos].
 * srcchan      Selects the row of params->Gains and is forwarded to the
 *              filter calls.
 * OutPos       First DryBuffer row written.
 * SamplesToDo  Compared against OutPos to decide whether PendingClicks is
 *              updated.
 * BufferSize   Number of input samples consumed by the two mixing loops.
 *
 * NOTE(review): this listing is incomplete. Brace-only lines and several
 * statements are absent: the declarations of DryFilter, pos, c, val4 and dry4,
 * any condition guarding the ClickRemoval update, and whatever advances OutPos
 * inside the loops. Confirm against the pristine file before editing the code.
 */
68 void MixDirect_SSE(ALsource
*Source
, ALCdevice
*Device
, DirectParams
*params
,
69 const ALfloat
*RESTRICT data
, ALuint srcchan
,
70 ALuint OutPos
, ALuint SamplesToDo
, ALuint BufferSize
)
72 ALfloat (*RESTRICT DryBuffer
)[MaxChannels
];
73 ALfloat
*RESTRICT ClickRemoval
, *RESTRICT PendingClicks
;
/* Both scratch arrays are declared with ALIGN(16); value is the operand of
 * _mm_load_ps/_mm_store_ps below, which need 16-byte aligned addresses. */
74 ALIGN(16) ALfloat DrySend
[MaxChannels
];
75 ALIGN(16) ALfloat value
[4];
/* Cache the device buffers and the filter in locals.
 * NOTE(review): "¶ms" below looks like a mis-encoded "&params". */
81 DryBuffer
= Device
->DryBuffer
;
82 ClickRemoval
= Device
->ClickRemoval
;
83 PendingClicks
= Device
->PendingClicks
;
84 DryFilter
= ¶ms
->iirFilter
;
/* Copy this source channel's gains into the local DrySend array. */
86 for(c
= 0;c
< MaxChannels
;c
++)
87 DrySend
[c
] = params
->Gains
[srcchan
][c
];
/* Filter data[pos] with lpFilter2PC and subtract the gain-scaled result from
 * ClickRemoval. NOTE(review): no assignment to pos and no enclosing condition
 * are visible before this point; both are presumably among the missing lines. */
92 value
[0] = lpFilter2PC(DryFilter
, srcchan
, data
[pos
]);
93 for(c
= 0;c
< MaxChannels
;c
++)
94 ClickRemoval
[c
] -= value
[0]*DrySend
[c
];
/* Main loop: four input samples per iteration.
 * NOTE(review): if ALuint is an unsigned type (as the name suggests --
 * confirm), BufferSize-3 wraps to a very large value whenever BufferSize < 3,
 * so the loop would run far past the end of data and DryBuffer. */
96 for(pos
= 0;pos
< BufferSize
-3;pos
+= 4)
/* Filter four consecutive inputs through lpFilter2P and load them as one
 * vector. */
100 value
[0] = lpFilter2P(DryFilter
, srcchan
, data
[pos
]);
101 value
[1] = lpFilter2P(DryFilter
, srcchan
, data
[pos
+1]);
102 value
[2] = lpFilter2P(DryFilter
, srcchan
, data
[pos
+2]);
103 value
[3] = lpFilter2P(DryFilter
, srcchan
, data
[pos
+3]);
104 val4
= _mm_load_ps(value
);
/* For each output channel: broadcast its gain, gather column c of four
 * consecutive DryBuffer rows through value[], multiply-add, and scatter the
 * result back. value[] is reused as the staging buffer, which is safe because
 * the filtered samples already live in val4. */
106 for(c
= 0;c
< MaxChannels
;c
++)
108 const __m128 gain
= _mm_set1_ps(DrySend
[c
]);
111 value
[0] = DryBuffer
[OutPos
][c
];
112 value
[1] = DryBuffer
[OutPos
+1][c
];
113 value
[2] = DryBuffer
[OutPos
+2][c
];
114 value
[3] = DryBuffer
[OutPos
+3][c
];
115 dry4
= _mm_load_ps(value
);
117 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain
));
119 _mm_store_ps(value
, dry4
);
120 DryBuffer
[OutPos
][c
] = value
[0];
121 DryBuffer
[OutPos
+1][c
] = value
[1];
122 DryBuffer
[OutPos
+2][c
] = value
[2];
123 DryBuffer
[OutPos
+3][c
] = value
[3];
/* Scalar loop for the samples the four-wide loop did not consume. */
128 for(;pos
< BufferSize
;pos
++)
130 value
[0] = lpFilter2P(DryFilter
, srcchan
, data
[pos
]);
131 for(c
= 0;c
< MaxChannels
;c
++)
132 DryBuffer
[OutPos
][c
] += value
[0]*DrySend
[c
];
/* When OutPos equals SamplesToDo, add the gain-scaled lpFilter2PC output to
 * PendingClicks. NOTE(review): once the loop above has finished, pos is no
 * longer below BufferSize, so data[pos] reads past the samples consumed by
 * the loops; presumably the caller provides at least one extra sample --
 * confirm. */
135 if(OutPos
== SamplesToDo
)
137 value
[0] = lpFilter2PC(DryFilter
, srcchan
, data
[pos
]);
138 for(c
= 0;c
< MaxChannels
;c
++)
139 PendingClicks
[c
] += value
[0]*DrySend
[c
];
146 #include "mixer_inc.c"