4 /* KDevelop's parser won't recognize these defines that get added by the -msse
5 * switch used to compile this source. Without them, xmmintrin.h fails to declare anything. */
10 #include <xmmintrin.h>
18 #include "alAuxEffectSlot.h"
19 #include "mixer_defs.h"
/* ApplyCoeffs: accumulates Coeffs, scaled by the (left, right) pair, into
 * Values using SSE intrinsics, four floats (two [2]-pairs) at a time.
 *
 * NOTE(review): this listing is fragmentary. The embedded source line numbers
 * skip (22, 24, 25, 27, 28, 34, ...), so the braces, the declarations of
 * coeffs, imp0, imp1, i and IrSize, and whatever selects between the two code
 * paths below are not visible here. Confirm every note against the complete
 * file before relying on it.
 *
 * Offset      - starting index into Values; every index derived from it is
 *               masked with HRIR_MASK before use.
 * Values      - array of float pairs that is read, added to, and written back.
 * Coeffs      - array of float pairs; only loaded from in the visible code.
 * left, right - packed as (left, right, left, right) and multiplied with each
 *               group of four coefficient floats.
 *
 * _mm_load_ps/_mm_store_ps require 16-byte-aligned addresses, so the
 * full-width accesses below assume the addressed pairs are so aligned.
 */
22 static inline void ApplyCoeffs(ALuint Offset
, ALfloat (*restrict Values
)[2],
24 ALfloat (*restrict Coeffs
)[2],
25 ALfloat left
, ALfloat right
)
27 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
28 __m128 vals
= _mm_setzero_ps();
/* First visible path: Values is updated with a half-width step, a run of
 * full-width steps, then a final half-width step. o0 is the first wrapped
 * index and o1 the wrapped index IrSize-1 entries after it.
 * NOTE(review): presumably taken when Offset is odd, so that the full-width
 * accesses land on even (aligned) pairs -- confirm against the full source. */
34 const ALuint o0
= Offset
&HRIR_MASK
;
35 const ALuint o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* Low two lanes only: Values[o0] += (left, right) * Coeffs[0]. imp0 keeps the
 * whole four-lane product, so its upper half (the Coeffs[1] pair) is still
 * available to the next step. */
38 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
39 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
40 imp0
= _mm_mul_ps(lrlr
, coeffs
);
41 vals
= _mm_add_ps(imp0
, vals
);
42 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
/* Each pass covers two pairs of Values starting at wrapped index o2, using
 * full-width loads and stores. */
43 for(i
= 1;i
< IrSize
-1;i
+= 2)
45 const ALuint o2
= (Offset
+i
)&HRIR_MASK
;
47 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
48 vals
= _mm_load_ps(&Values
[o2
][0]);
49 imp1
= _mm_mul_ps(lrlr
, coeffs
);
/* Lanes become { imp0[2], imp0[3], imp1[0], imp1[1] }: the upper half of the
 * previous product followed by the lower half of the new one. */
50 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
51 vals
= _mm_add_ps(imp0
, vals
);
52 _mm_store_ps(&Values
[o2
][0], vals
);
/* Low two lanes only: the upper half of imp0 is moved down by _mm_movehl_ps
 * and added to Values[o1]. */
55 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
56 imp0
= _mm_movehl_ps(imp0
, imp0
);
57 vals
= _mm_add_ps(imp0
, vals
);
58 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Second visible path: plain full-width accumulate, two pairs per pass, with
 * Coeffs[i] and Values[o] both accessed through _mm_load_ps/_mm_store_ps. */
62 for(i
= 0;i
< IrSize
;i
+= 2)
64 const ALuint o
= (Offset
+ i
)&HRIR_MASK
;
66 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
67 vals
= _mm_load_ps(&Values
[o
][0]);
68 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
69 _mm_store_ps(&Values
[o
][0], vals
);
75 #include "mixer_inc.c"
/* Mix_SSE: adds data[], scaled by a per-channel gain, into each of the
 * OutChans rows of OutBuffer, starting at column OutPos.
 *
 * NOTE(review): this listing is fragmentary. The embedded source line numbers
 * skip, so the braces, the declarations of gain, step, gain4, step4, c and
 * pos, and several statements inside the loops are not visible here. Confirm
 * every note against the complete file before relying on it.
 *
 * data       - input samples, indexed by pos.
 * OutChans   - number of OutBuffer rows (and Gains entries) processed.
 * OutBuffer  - rows of BUFFERSIZE floats that are accumulated into.
 * Gains      - per-channel gain state; .Current is read and written, .Target
 *              is read.
 * Counter    - bounds the stepped-gain portion (those loops stop at Counter).
 * OutPos     - column offset added to pos when indexing an OutBuffer row.
 * BufferSize - upper bound for pos in every loop.
 *
 * _mm_load_ps/_mm_store_ps require 16-byte-aligned addresses, so the
 * four-wide loops assume &data[pos] and &OutBuffer[c][OutPos+pos] are aligned
 * whenever pos is a multiple of 4.
 */
79 void Mix_SSE(const ALfloat
*data
, ALuint OutChans
, ALfloat (*restrict OutBuffer
)[BUFFERSIZE
],
80 MixGains
*Gains
, ALuint Counter
, ALuint OutPos
, ALuint BufferSize
)
86 for(c
= 0;c
< OutChans
;c
++)
89 gain
= Gains
[c
].Current
;
/* Stepped-gain section: entered only when step is not exactly 1 and Counter
 * is non-zero. */
91 if(step
!= 1.0f
&& Counter
> 0)
93 /* Mix with applying gain steps in aligned multiples of 4. */
94 if(BufferSize
-pos
> 3 && Counter
-pos
> 3)
/* NOTE(review): the lines around this expression are missing from the
 * listing; presumably it is the last lane of a gain4 initialiser holding
 * gain times successive powers of step -- confirm. */
100 gain
* step
* step
* step
/* Every lane of step4 is step^4, so multiplying gain4 by it moves all four
 * lanes forward by four steps. */
102 step4
= _mm_set1_ps(step
* step
* step
* step
);
104 const __m128 val4
= _mm_load_ps(&data
[pos
]);
105 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
106 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
107 gain4
= _mm_mul_ps(gain4
, step4
);
108 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
110 } while(BufferSize
-pos
> 3 && Counter
-pos
> 3);
/* Lane 0 of gain4 becomes the scalar gain used by the loops that follow. */
111 gain
= _mm_cvtss_f32(gain4
);
113 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
114 for(;pos
< BufferSize
&& pos
< Counter
;pos
++)
116 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* NOTE(review): any condition guarding the switch to Target is not visible
 * in this listing. The gain in effect is then stored back as Current. */
120 gain
= Gains
[c
].Target
;
121 Gains
[c
].Current
= gain
;
122 /* Mix until pos is aligned with 4 or the mix is done. */
123 for(;pos
< BufferSize
&& (pos
&3) != 0;pos
++)
124 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* Written as a negated '>' so the test is also true when gain is NaN.
 * NOTE(review): the statement this guards is not visible; presumably it
 * skips the rest of the channel -- confirm. */
127 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
/* Constant-gain mix, four samples per pass. */
129 gain4
= _mm_set1_ps(gain
);
130 for(;BufferSize
-pos
> 3;pos
+= 4)
132 const __m128 val4
= _mm_load_ps(&data
[pos
]);
133 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
134 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
135 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
/* Scalar tail for the samples left once fewer than four remain. */
137 for(;pos
< BufferSize
;pos
++)
138 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;