Use SSE to do 4 samples at once (non-HRTF direct mix), instead of applying a matrix row
[openal-soft.git] / Alc / mixer_sse.c
blobcc3b52cccbbdd0c04697865985fa369c930f0123
1 #include "config.h"
3 #ifdef HAVE_XMMINTRIN_H
4 #include <xmmintrin.h>
5 #endif
7 #include "AL/al.h"
8 #include "AL/alc.h"
9 #include "alMain.h"
10 #include "alu.h"
12 #include "alSource.h"
13 #include "mixer_defs.h"
16 static __inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*RESTRICT Values)[2],
17 ALfloat (*RESTRICT Coeffs)[2],
18 ALfloat (*RESTRICT CoeffStep)[2],
19 ALfloat left, ALfloat right)
21 const __m128 lrlr = { left, right, left, right };
22 __m128 vals = { 0.0f, 0.0f, 0.0f, 0.0f };
23 __m128 coeffs, coeffstep;
24 ALuint c;
25 for(c = 0;c < HRIR_LENGTH;c += 2)
27 const ALuint o0 = (Offset++)&HRIR_MASK;
28 const ALuint o1 = (Offset++)&HRIR_MASK;
30 coeffs = _mm_load_ps(&Coeffs[c][0]);
31 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
32 vals = _mm_loadh_pi(vals, (__m64*)&Values[o1][0]);
34 vals = _mm_add_ps(vals, _mm_mul_ps(coeffs, lrlr));
35 _mm_storel_pi((__m64*)&Values[o0][0], vals);
36 _mm_storeh_pi((__m64*)&Values[o1][0], vals);
38 coeffstep = _mm_load_ps(&CoeffStep[c][0]);
39 coeffs = _mm_add_ps(coeffs, coeffstep);
40 _mm_store_ps(&Coeffs[c][0], coeffs);
44 static __inline void ApplyCoeffs(ALuint Offset, ALfloat (*RESTRICT Values)[2],
45 ALfloat (*RESTRICT Coeffs)[2],
46 ALfloat left, ALfloat right)
48 const __m128 lrlr = { left, right, left, right };
49 __m128 vals = { 0.0f, 0.0f, 0.0f, 0.0f };
50 __m128 coeffs;
51 ALuint c;
52 for(c = 0;c < HRIR_LENGTH;c += 2)
54 const ALuint o0 = (Offset++)&HRIR_MASK;
55 const ALuint o1 = (Offset++)&HRIR_MASK;
57 coeffs = _mm_load_ps(&Coeffs[c][0]);
58 vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
59 vals = _mm_loadh_pi(vals, (__m64*)&Values[o1][0]);
61 vals = _mm_add_ps(vals, _mm_mul_ps(coeffs, lrlr));
62 _mm_storel_pi((__m64*)&Values[o0][0], vals);
63 _mm_storeh_pi((__m64*)&Values[o1][0], vals);
68 void MixDirect_SSE(ALsource *Source, ALCdevice *Device, DirectParams *params,
69 const ALfloat *RESTRICT data, ALuint srcchan,
70 ALuint OutPos, ALuint SamplesToDo, ALuint BufferSize)
72 ALfloat (*RESTRICT DryBuffer)[MaxChannels];
73 ALfloat *RESTRICT ClickRemoval, *RESTRICT PendingClicks;
74 ALIGN(16) ALfloat DrySend[MaxChannels];
75 ALIGN(16) ALfloat value[4];
76 FILTER *DryFilter;
77 ALuint pos;
78 ALuint c;
79 (void)Source;
81 DryBuffer = Device->DryBuffer;
82 ClickRemoval = Device->ClickRemoval;
83 PendingClicks = Device->PendingClicks;
84 DryFilter = &params->iirFilter;
86 for(c = 0;c < MaxChannels;c++)
87 DrySend[c] = params->Gains[srcchan][c];
89 pos = 0;
90 if(OutPos == 0)
92 value[0] = lpFilter2PC(DryFilter, srcchan, data[pos]);
93 for(c = 0;c < MaxChannels;c++)
94 ClickRemoval[c] -= value[0]*DrySend[c];
96 for(pos = 0;pos < BufferSize-3;pos += 4)
98 __m128 val4;
100 value[0] = lpFilter2P(DryFilter, srcchan, data[pos ]);
101 value[1] = lpFilter2P(DryFilter, srcchan, data[pos+1]);
102 value[2] = lpFilter2P(DryFilter, srcchan, data[pos+2]);
103 value[3] = lpFilter2P(DryFilter, srcchan, data[pos+3]);
104 val4 = _mm_load_ps(value);
106 for(c = 0;c < MaxChannels;c++)
108 const __m128 gain = _mm_set1_ps(DrySend[c]);
109 __m128 dry4;
111 value[0] = DryBuffer[OutPos ][c];
112 value[1] = DryBuffer[OutPos+1][c];
113 value[2] = DryBuffer[OutPos+2][c];
114 value[3] = DryBuffer[OutPos+3][c];
115 dry4 = _mm_load_ps(value);
117 dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain));
119 _mm_store_ps(value, dry4);
120 DryBuffer[OutPos ][c] = value[0];
121 DryBuffer[OutPos+1][c] = value[1];
122 DryBuffer[OutPos+2][c] = value[2];
123 DryBuffer[OutPos+3][c] = value[3];
126 OutPos += 4;
128 for(;pos < BufferSize;pos++)
130 value[0] = lpFilter2P(DryFilter, srcchan, data[pos]);
131 for(c = 0;c < MaxChannels;c++)
132 DryBuffer[OutPos][c] += value[0]*DrySend[c];
133 OutPos++;
135 if(OutPos == SamplesToDo)
137 value[0] = lpFilter2PC(DryFilter, srcchan, data[pos]);
138 for(c = 0;c < MaxChannels;c++)
139 PendingClicks[c] += value[0]*DrySend[c];
/* NOTE(review): NO_MIXDIRECT is presumably checked by mixer_inc.c to skip its
 * generic MixDirect template, since MixDirect_SSE is hand-written in this
 * file -- confirm against mixer_inc.c. */
142 #define NO_MIXDIRECT
/* SUFFIX is defined only for the duration of the mixer_inc.c include below.
 * NOTE(review): presumably it is the name suffix applied to the functions
 * that mixer_inc.c generates -- confirm against mixer_inc.c. */
145 #define SUFFIX SSE
146 #include "mixer_inc.c"
147 #undef SUFFIX