11 #include "alAuxEffectSlot.h"
15 const ALfloat
*Resample_bsinc_SSE(const InterpState
*state
, const ALfloat
*restrict src
,
16 ALsizei frac
, ALint increment
, ALfloat
*restrict dst
,
19 const ALfloat
*const filter
= state
->bsinc
.filter
;
20 const __m128 sf4
= _mm_set1_ps(state
->bsinc
.sf
);
21 const ALsizei m
= state
->bsinc
.m
;
22 const __m128
*fil
, *scd
, *phd
, *spd
;
23 ALsizei pi
, i
, j
, offset
;
30 src
+= state
->bsinc
.l
;
31 for(i
= 0;i
< dstlen
;i
++)
33 // Calculate the phase index and factor.
34 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
35 pi
= frac
>> FRAC_PHASE_BITDIFF
;
36 pf
= (frac
& ((1<<FRAC_PHASE_BITDIFF
)-1)) * (1.0f
/(1<<FRAC_PHASE_BITDIFF
));
37 #undef FRAC_PHASE_BITDIFF
40 fil
= (const __m128
*)ASSUME_ALIGNED(filter
+ offset
, 16); offset
+= m
;
41 scd
= (const __m128
*)ASSUME_ALIGNED(filter
+ offset
, 16); offset
+= m
;
42 phd
= (const __m128
*)ASSUME_ALIGNED(filter
+ offset
, 16); offset
+= m
;
43 spd
= (const __m128
*)ASSUME_ALIGNED(filter
+ offset
, 16);
45 // Apply the scale and phase interpolated filter.
46 r4
= _mm_setzero_ps();
48 const __m128 pf4
= _mm_set1_ps(pf
);
49 #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
50 for(j
= 0;j
< m
;j
+=4,fil
++,scd
++,phd
++,spd
++)
52 /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
53 const __m128 f4
= MLA4(
54 MLA4(*fil
, sf4
, *scd
),
55 pf4
, MLA4(*phd
, sf4
, *spd
)
58 r4
= MLA4(r4
, f4
, _mm_loadu_ps(&src
[j
]));
62 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
63 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
64 dst
[i
] = _mm_cvtss_f32(r4
);
67 src
+= frac
>>FRACTIONBITS
;
74 static inline void ApplyCoeffs(ALsizei Offset
, ALfloat (*restrict Values
)[2],
76 const ALfloat (*restrict Coeffs
)[2],
77 ALfloat left
, ALfloat right
)
79 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
80 __m128 vals
= _mm_setzero_ps();
84 Values
= ASSUME_ALIGNED(Values
, 16);
85 Coeffs
= ASSUME_ALIGNED(Coeffs
, 16);
88 const ALsizei o0
= Offset
&HRIR_MASK
;
89 const ALsizei o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
92 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
93 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
94 imp0
= _mm_mul_ps(lrlr
, coeffs
);
95 vals
= _mm_add_ps(imp0
, vals
);
96 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
97 for(i
= 1;i
< IrSize
-1;i
+= 2)
99 const ALsizei o2
= (Offset
+i
)&HRIR_MASK
;
101 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
102 vals
= _mm_load_ps(&Values
[o2
][0]);
103 imp1
= _mm_mul_ps(lrlr
, coeffs
);
104 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
105 vals
= _mm_add_ps(imp0
, vals
);
106 _mm_store_ps(&Values
[o2
][0], vals
);
109 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
110 imp0
= _mm_movehl_ps(imp0
, imp0
);
111 vals
= _mm_add_ps(imp0
, vals
);
112 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
116 for(i
= 0;i
< IrSize
;i
+= 2)
118 const ALsizei o
= (Offset
+ i
)&HRIR_MASK
;
120 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
121 vals
= _mm_load_ps(&Values
[o
][0]);
122 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
123 _mm_store_ps(&Values
[o
][0], vals
);
128 #define MixHrtf MixHrtf_SSE
129 #define MixHrtfBlend MixHrtfBlend_SSE
130 #define MixDirectHrtf MixDirectHrtf_SSE
131 #include "hrtf_inc.c"
134 void Mix_SSE(const ALfloat
*data
, ALsizei OutChans
, ALfloat (*restrict OutBuffer
)[BUFFERSIZE
],
135 ALfloat
*CurrentGains
, const ALfloat
*TargetGains
, ALsizei Counter
, ALsizei OutPos
,
138 const ALfloat delta
= (Counter
> 0) ? 1.0f
/(ALfloat
)Counter
: 0.0f
;
141 ASSUME(OutChans
> 0);
142 ASSUME(BufferSize
> 0);
144 for(c
= 0;c
< OutChans
;c
++)
147 ALfloat gain
= CurrentGains
[c
];
148 const ALfloat step
= (TargetGains
[c
] - gain
) * delta
;
150 if(fabsf(step
) > FLT_EPSILON
)
152 ALsizei minsize
= mini(BufferSize
, Counter
);
153 ALfloat step_count
= 0.0f
;
154 /* Mix with applying gain steps in aligned multiples of 4. */
155 if(LIKELY(minsize
> 3))
157 const __m128 four4
= _mm_set1_ps(4.0f
);
158 const __m128 step4
= _mm_set1_ps(step
);
159 const __m128 gain4
= _mm_set1_ps(gain
);
160 __m128 step_count4
= _mm_setr_ps(0.0f
, 1.0f
, 2.0f
, 3.0f
);
161 ALsizei todo
= minsize
>> 2;
163 const __m128 val4
= _mm_load_ps(&data
[pos
]);
164 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
165 #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
166 /* dry += val * (gain + step*step_count) */
167 dry4
= MLA4(dry4
, val4
, MLA4(gain4
, step4
, step_count4
));
169 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
170 step_count4
= _mm_add_ps(step_count4
, four4
);
173 /* NOTE: step_count4 now represents the next four counts after
174 * the last four mixed samples, so the lowest element
175 * represents the next step count to apply.
177 step_count
= _mm_cvtss_f32(step_count4
);
179 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
180 for(;pos
< minsize
;pos
++)
182 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*(gain
+ step
*step_count
);
186 gain
= TargetGains
[c
];
188 gain
+= step
*step_count
;
189 CurrentGains
[c
] = gain
;
191 /* Mix until pos is aligned with 4 or the mix is done. */
192 minsize
= mini(BufferSize
, (pos
+3)&~3);
193 for(;pos
< minsize
;pos
++)
194 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
197 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
199 if(LIKELY(BufferSize
-pos
> 3))
201 ALsizei todo
= (BufferSize
-pos
) >> 2;
202 const __m128 gain4
= _mm_set1_ps(gain
);
204 const __m128 val4
= _mm_load_ps(&data
[pos
]);
205 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
206 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
207 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
211 for(;pos
< BufferSize
;pos
++)
212 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
216 void MixRow_SSE(ALfloat
*OutBuffer
, const ALfloat
*Gains
, const ALfloat (*restrict data
)[BUFFERSIZE
], ALsizei InChans
, ALsizei InPos
, ALsizei BufferSize
)
221 ASSUME(BufferSize
> 0);
223 for(c
= 0;c
< InChans
;c
++)
226 ALfloat gain
= Gains
[c
];
227 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
230 if(LIKELY(BufferSize
> 3))
232 ALsizei todo
= BufferSize
>> 2;
233 const __m128 gain4
= _mm_set1_ps(gain
);
235 const __m128 val4
= _mm_load_ps(&data
[c
][InPos
+pos
]);
236 __m128 dry4
= _mm_load_ps(&OutBuffer
[pos
]);
237 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
238 _mm_store_ps(&OutBuffer
[pos
], dry4
);
242 for(;pos
< BufferSize
;pos
++)
243 OutBuffer
[pos
] += data
[c
][InPos
+pos
]*gain
;