11 #include "alAuxEffectSlot.h"
12 #include "mixer_defs.h"
/* Resample_bsinc_SSE: SSE resampling loop driven by the filter table held in
 * state->bsinc.
 *
 * For every output index i below dstlen the visible code:
 *   - splits frac into a phase index (pi, the high bits) and a phase factor
 *     (pf, the low bits scaled into the range [0, 1)),
 *   - picks four consecutive rows of m floats from the table
 *     (fil, scd, phd, spd),
 *   - accumulates f * src four taps at a time, where
 *     f = (fil + sf*scd) + pf*(phd + sf*spd),
 *   - sums the four accumulator lanes and stores the result in dst[i].
 *
 * NOTE(review): this excerpt is incomplete. The end of the parameter list,
 * the declarations of pf, r4 and dstlen, the initial value of offset, any
 * per-sample update of frac, the braces and the return statement are not
 * visible here - confirm all of them against the complete file.
 */
15 const ALfloat
*Resample_bsinc_SSE(const InterpState
*state
, const ALfloat
*restrict src
,
16 ALsizei frac
, ALint increment
, ALfloat
*restrict dst
,
/* Cache the table pointer, the sf value broadcast to all four lanes, and the
 * row length m read from state->bsinc. */
19 const ALfloat
*const filter
= state
->bsinc
.filter
;
20 const __m128 sf4
= _mm_set1_ps(state
->bsinc
.sf
);
21 const ALsizei m
= state
->bsinc
.m
;
22 const __m128
*fil
, *scd
, *phd
, *spd
;
23 ALsizei pi
, i
, j
, offset
;
/* Advance src by state->bsinc.l before the per-sample loop starts. */
27 src
+= state
->bsinc
.l
;
28 for(i
= 0;i
< dstlen
;i
++)
30 // Calculate the phase index and factor.
31 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
32 pi
= frac
>> FRAC_PHASE_BITDIFF
;
33 pf
= (frac
& ((1<<FRAC_PHASE_BITDIFF
)-1)) * (1.0f
/(1<<FRAC_PHASE_BITDIFF
));
34 #undef FRAC_PHASE_BITDIFF
/* Select four consecutive rows of m floats each: fil, scd, phd and spd.
 * ASSUME_ALIGNED is defined elsewhere; presumably a 16-byte alignment hint.
 * NOTE(review): the assignment that gives offset its starting value is not
 * visible in this excerpt. */
37 fil
= ASSUME_ALIGNED(filter
+ offset
, 16); offset
+= m
;
38 scd
= ASSUME_ALIGNED(filter
+ offset
, 16); offset
+= m
;
39 phd
= ASSUME_ALIGNED(filter
+ offset
, 16); offset
+= m
;
40 spd
= ASSUME_ALIGNED(filter
+ offset
, 16);
42 // Apply the scale and phase interpolated filter.
43 r4
= _mm_setzero_ps();
45 const __m128 pf4
= _mm_set1_ps(pf
);
46 #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
/* Four taps per iteration; all four row pointers advance by one __m128. */
47 for(j
= 0;j
< m
;j
+=4,fil
++,scd
++,phd
++,spd
++)
49 /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
50 const __m128 f4
= MLA4(
51 MLA4(*fil
, sf4
, *scd
),
52 pf4
, MLA4(*phd
, sf4
, *spd
)
/* Accumulate f4 * src[j..j+3]; src is read with an unaligned load. */
55 r4
= MLA4(r4
, f4
, _mm_loadu_ps(&src
[j
]));
/* Horizontal sum: add the lane-reversed vector to itself, then fold the high
 * half onto the low half, so that lane 0 holds the sum of all four lanes. */
59 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
60 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
61 dst
[i
] = _mm_cvtss_f32(r4
);
/* Step src forward by the whole-sample part of frac.
 * NOTE(review): any update of frac itself (for example by increment) is not
 * visible in this excerpt. */
64 src
+= frac
>>FRACTIONBITS
;
/* ApplyCoeffs: adds left/right scaled coefficient pairs from Coeffs into the
 * Values buffer, starting at slot Offset and wrapping every index with
 * HRIR_MASK.
 *
 * Each Values / Coeffs element is a pair of floats. lrlr holds
 * (left, right, left, right), so one __m128 multiply scales two pairs.
 *
 * Two code paths are visible:
 *   - a path that updates the first and last slots with 64-bit half
 *     loads/stores and handles the slots in between two at a time, using a
 *     shuffle to join the high half of the previous product with the low
 *     half of the next one;
 *   - a path that processes two pairs per iteration with plain aligned
 *     loads/stores.
 *
 * NOTE(review): the condition that selects between the two paths, the
 * declarations of IrSize, coeffs, imp0, imp1 and i, and the braces are not
 * visible in this excerpt. Presumably the first path serves a start slot that
 * is not on a 16-byte boundary - confirm against the complete file.
 */
71 static inline void ApplyCoeffs(ALsizei Offset
, ALfloat (*restrict Values
)[2],
73 const ALfloat (*restrict Coeffs
)[2],
74 ALfloat left
, ALfloat right
)
76 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
77 __m128 vals
= _mm_setzero_ps();
/* ASSUME_ALIGNED is defined elsewhere; presumably a 16-byte alignment hint
 * for both arrays. */
81 Values
= ASSUME_ALIGNED(Values
, 16);
82 Coeffs
= ASSUME_ALIGNED(Coeffs
, 16);
/* o0 is the first wrapped slot, o1 the last one (Offset + IrSize - 1). */
85 const ALsizei o0
= Offset
&HRIR_MASK
;
86 const ALsizei o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* First slot: load Coeffs[0] and Coeffs[1] together, but add only the low
 * pair of the product into Values[o0] through a 64-bit load/store. */
89 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
90 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
91 imp0
= _mm_mul_ps(lrlr
, coeffs
);
92 vals
= _mm_add_ps(imp0
, vals
);
93 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
/* Middle slots, two pairs per iteration starting at i = 1. */
94 for(i
= 1;i
< IrSize
-1;i
+= 2)
96 const ALsizei o2
= (Offset
+i
)&HRIR_MASK
;
98 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
99 vals
= _mm_load_ps(&Values
[o2
][0]);
100 imp1
= _mm_mul_ps(lrlr
, coeffs
);
/* Result lanes: (imp0[2], imp0[3], imp1[0], imp1[1]). */
101 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
102 vals
= _mm_add_ps(imp0
, vals
);
103 _mm_store_ps(&Values
[o2
][0], vals
);
/* Last slot: move the high pair of imp0 into the low half and add it into
 * Values[o1] through a 64-bit load/store. */
106 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
107 imp0
= _mm_movehl_ps(imp0
, imp0
);
108 vals
= _mm_add_ps(imp0
, vals
);
109 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Second visible path: two pairs per iteration with aligned loads/stores,
 * Values[o..o+1] += lrlr * Coeffs[i..i+1]. */
113 for(i
= 0;i
< IrSize
;i
+= 2)
115 const ALsizei o
= (Offset
+ i
)&HRIR_MASK
;
117 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
118 vals
= _mm_load_ps(&Values
[o
][0]);
119 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
120 _mm_store_ps(&Values
[o
][0], vals
);
/* Rename the three HRTF mixer entry points to their _SSE suffixed names
 * before including mixer_inc.c, so that whatever that file defines under the
 * generic names is emitted with the SSE names. The contents of mixer_inc.c
 * are not visible here; presumably it uses ApplyCoeffs as defined above. */
125 #define MixHrtf MixHrtf_SSE
126 #define MixHrtfBlend MixHrtfBlend_SSE
127 #define MixDirectHrtf MixDirectHrtf_SSE
128 #include "mixer_inc.c"
/* Mix_SSE: for each of the OutChans output channels, adds data[] scaled by a
 * per-channel gain into OutBuffer[c] starting at column OutPos.
 *
 * Per channel the visible code:
 *   - derives a per-sample step from CurrentGains[c] towards TargetGains[c]
 *     spread over Counter samples,
 *   - if the step magnitude exceeds FLT_EPSILON, mixes a ramped section
 *     (four samples per SSE iteration, then a scalar loop for the rest),
 *     writes the resulting gain back to CurrentGains[c], and then mixes
 *     scalar samples until pos is a multiple of four or BufferSize is
 *     reached,
 *   - mixes the remaining samples with a constant gain, four at a time with
 *     aligned loads/stores and then a scalar tail.
 *
 * _mm_load_ps / _mm_store_ps need 16-byte aligned addresses, so &data[pos]
 * and &OutBuffer[c][OutPos+pos] must be aligned wherever the vector loops
 * run.
 *
 * NOTE(review): this excerpt is incomplete. The BufferSize parameter, the
 * declarations of c, pos, gain4 and step4, the construction of the initial
 * gain4, the opening of the do-while loop, the update of pos inside it, any
 * per-sample gain increment in the scalar ramp loop, and the braces are not
 * visible here - confirm against the complete file.
 */
132 void Mix_SSE(const ALfloat
*data
, ALsizei OutChans
, ALfloat (*restrict OutBuffer
)[BUFFERSIZE
],
133 ALfloat
*CurrentGains
, const ALfloat
*TargetGains
, ALsizei Counter
, ALsizei OutPos
,
136 ALfloat gain
, delta
, step
;
/* Reciprocal of the ramp length; zero when Counter is not positive, which
 * makes every step computed below zero. */
140 delta
= (Counter
> 0) ? 1.0f
/(ALfloat
)Counter
: 0.0f
;
142 for(c
= 0;c
< OutChans
;c
++)
/* Per-sample gain increment that reaches TargetGains[c] after Counter
 * samples. */
145 gain
= CurrentGains
[c
];
146 step
= (TargetGains
[c
] - gain
) * delta
;
147 if(fabsf(step
) > FLT_EPSILON
)
/* The ramped section covers at most Counter samples, clamped to
 * BufferSize. */
149 ALsizei minsize
= mini(BufferSize
, Counter
);
150 /* Mix with applying gain steps in aligned multiples of 4. */
158 gain
+ step
+ step
+ step
/* Each vector iteration advances all four lane gains by four steps. */
160 step4
= _mm_set1_ps(step
+ step
+ step
+ step
);
162 const __m128 val4
= _mm_load_ps(&data
[pos
]);
163 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
164 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
165 gain4
= _mm_add_ps(gain4
, step4
);
166 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
168 } while(minsize
-pos
> 3);
169 /* NOTE: gain4 now represents the next four gains after the
170 * last four mixed samples, so the lowest element represents
171 * the next gain to apply.
173 gain
= _mm_cvtss_f32(gain4
);
175 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
176 for(;pos
< minsize
;pos
++)
178 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* NOTE(review): any condition guarding this snap to the target gain is not
 * visible in this excerpt. */
182 gain
= TargetGains
[c
];
183 CurrentGains
[c
] = gain
;
185 /* Mix until pos is aligned with 4 or the mix is done. */
186 minsize
= mini(BufferSize
, (pos
+3)&~3);
187 for(;pos
< minsize
;pos
++)
188 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* Silence test written as !(x > t), so it is also true when gain is NaN.
 * NOTE(review): the statement this test guards is not visible in this
 * excerpt. */
191 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
/* Constant-gain mix: four samples per iteration, then a scalar tail. */
193 gain4
= _mm_set1_ps(gain
);
194 for(;BufferSize
-pos
> 3;pos
+= 4)
196 const __m128 val4
= _mm_load_ps(&data
[pos
]);
197 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
198 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
199 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
201 for(;pos
< BufferSize
;pos
++)
202 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* MixRow_SSE: accumulates InChans input rows into a single output row.
 *
 * For each input channel c the visible code adds
 * data[c][InPos + pos] * Gains[c] into OutBuffer[pos] for pos up to
 * BufferSize: four samples per iteration with SSE, then a scalar loop for
 * the remaining samples.
 *
 * _mm_load_ps / _mm_store_ps need 16-byte aligned addresses, so
 * &data[c][InPos+pos] and &OutBuffer[pos] must be aligned wherever the
 * vector loop runs.
 *
 * NOTE(review): the declarations of c, pos and gain4, the statement guarded
 * by the silence test, and the braces are not visible in this excerpt -
 * confirm against the complete file.
 */
206 void MixRow_SSE(ALfloat
*OutBuffer
, const ALfloat
*Gains
, const ALfloat (*restrict data
)[BUFFERSIZE
], ALsizei InChans
, ALsizei InPos
, ALsizei BufferSize
)
211 for(c
= 0;c
< InChans
;c
++)
214 ALfloat gain
= Gains
[c
];
/* Silence test written as !(x > t), so it is also true when gain is NaN. */
215 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
/* Broadcast the channel gain and mix four samples per iteration. */
218 gain4
= _mm_set1_ps(gain
);
219 for(;BufferSize
-pos
> 3;pos
+= 4)
221 const __m128 val4
= _mm_load_ps(&data
[c
][InPos
+pos
]);
222 __m128 dry4
= _mm_load_ps(&OutBuffer
[pos
]);
223 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
224 _mm_store_ps(&OutBuffer
[pos
], dry4
);
/* Scalar tail for the samples left over after the vector loop. */
226 for(;pos
< BufferSize
;pos
++)
227 OutBuffer
[pos
] += data
[c
][InPos
+pos
]*gain
;