11 #include "alAuxEffectSlot.h"
12 #include "mixer_defs.h"
/* Resamples src into dst using the bsinc filter tables held in state->bsinc,
 * with the per-sample filtering done through SSE intrinsics.
 * NOTE(review): parts of this definition (the tail of the parameter list,
 * some local declarations, the inner loop header and the braces) are not
 * visible in this view; the comments describe only the statements shown.
 */
15 const ALfloat
*Resample_bsinc32_SSE(const InterpState
*state
, const ALfloat
*restrict src
,
16 ALsizei frac
, ALint increment
, ALfloat
*restrict dst
,
/* state->bsinc.sf replicated into all four lanes for the vector math below. */
19 const __m128 sf4
= _mm_set1_ps(state
->bsinc
.sf
);
/* NOTE(review): m is read here but its use is not visible in this view;
 * presumably it bounds the inner j loop - confirm.
 */
20 const ALsizei m
= state
->bsinc
.m
;
21 const ALfloat
*fil
, *scd
, *phd
, *spd
;
/* Offset the source pointer by state->bsinc.l before producing any output. */
26 src
+= state
->bsinc
.l
;
/* One output sample (dst[i]) is produced per iteration. */
27 for(i
= 0;i
< dstlen
;i
++)
29 // Calculate the phase index and factor.
/* pi takes the bits of frac above FRAC_PHASE_BITDIFF; pf is the remaining
 * low bits scaled into the range [0, 1).
 */
30 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
31 pi
= frac
>> FRAC_PHASE_BITDIFF
;
32 pf
= (frac
& ((1<<FRAC_PHASE_BITDIFF
)-1)) * (1.0f
/(1<<FRAC_PHASE_BITDIFF
));
33 #undef FRAC_PHASE_BITDIFF
/* Select the four coefficient tables for phase pi. They are read below with
 * aligned loads (LD4), which require 16-byte aligned addresses.
 */
35 fil
= ASSUME_ALIGNED(state
->bsinc
.coeffs
[pi
].filter
, 16);
36 scd
= ASSUME_ALIGNED(state
->bsinc
.coeffs
[pi
].scDelta
, 16);
37 phd
= ASSUME_ALIGNED(state
->bsinc
.coeffs
[pi
].phDelta
, 16);
38 spd
= ASSUME_ALIGNED(state
->bsinc
.coeffs
[pi
].spDelta
, 16);
40 // Apply the scale and phase interpolated filter.
/* r4 is the four-lane accumulator for this output sample. */
41 r4
= _mm_setzero_ps();
43 const __m128 pf4
= _mm_set1_ps(pf
);
/* LD4 is an aligned 4-float load, ULD4 an unaligned one, and MLA4(x, y, z)
 * evaluates x + y*z per lane.
 */
44 #define LD4(x) _mm_load_ps(x)
45 #define ULD4(x) _mm_loadu_ps(x)
46 #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
49 /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
50 const __m128 f4
= MLA4(MLA4(LD4(&fil
[j
]), sf4
, LD4(&scd
[j
])),
51 pf4
, MLA4(LD4(&phd
[j
]), sf4
, LD4(&spd
[j
]))
/* Accumulate filter*source; src is read with an unaligned load because the
 * pointer is advanced by arbitrary sample counts.
 */
54 r4
= MLA4(r4
, f4
, ULD4(&src
[j
]));
/* Horizontal sum: add r4 to its lane-reversed copy, then fold the upper two
 * lanes onto the lower two, leaving the total of all four lanes in lane 0.
 */
60 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
61 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
/* Lane 0 is the finished output sample. */
62 dst
[i
] = _mm_cvtss_f32(r4
);
/* Advance the source by the whole-sample part of frac. */
65 src
+= frac
>>FRACTIONBITS
;
/* Adds the coefficient pairs in Coeffs, scaled by (left, right), into the
 * Values accumulation buffer beginning at Offset. Indices into Values are
 * masked with HRIR_MASK.
 * NOTE(review): the IrSize parameter, the local declarations and the branch
 * that selects between the two code paths below are not visible in this
 * view - confirm against the full definition.
 */
72 static inline void ApplyCoeffs(ALsizei Offset
, ALfloat (*restrict Values
)[2],
74 const ALfloat (*restrict Coeffs
)[2],
75 ALfloat left
, ALfloat right
)
/* (left, right) repeated twice, so one vector scales two [2]-element
 * coefficient pairs at once.
 */
77 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
78 __m128 vals
= _mm_setzero_ps();
/* The aligned loads/stores below require 16-byte alignment at the addresses
 * they touch.
 */
82 Values
= ASSUME_ALIGNED(Values
, 16);
83 Coeffs
= ASSUME_ALIGNED(Coeffs
, 16);
/* Path 1: the first (o0) and last (o1) Values entries are updated with
 * 64-bit, two-float loads/stores and the entries in between with full
 * 128-bit ones.
 * NOTE(review): presumably taken when Offset is odd, so that &Values[o2] is
 * 16-byte aligned for odd i - the selecting condition is not visible.
 */
86 const ALsizei o0
= Offset
&HRIR_MASK
;
87 const ALsizei o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* Scale the first two coefficient pairs; only the low pair is added to
 * Values[o0] here.
 */
90 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
91 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
92 imp0
= _mm_mul_ps(lrlr
, coeffs
);
93 vals
= _mm_add_ps(imp0
, vals
);
94 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
/* Middle entries, two Values entries per iteration starting at i = 1. */
95 for(i
= 1;i
< IrSize
-1;i
+= 2)
97 const ALsizei o2
= (Offset
+i
)&HRIR_MASK
;
99 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
100 vals
= _mm_load_ps(&Values
[o2
][0]);
101 imp1
= _mm_mul_ps(lrlr
, coeffs
);
/* imp0 becomes { high pair of imp0, low pair of imp1 }, i.e. the scaled
 * coefficients that line up with Values[o2] and the entry after it.
 */
102 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
103 vals
= _mm_add_ps(imp0
, vals
);
104 _mm_store_ps(&Values
[o2
][0], vals
);
/* Last entry: move the high pair of imp0 into the low half and add it to
 * Values[o1] with a 64-bit load/store.
 */
107 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
108 imp0
= _mm_movehl_ps(imp0
, imp0
);
109 vals
= _mm_add_ps(imp0
, vals
);
110 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Path 2: every iteration updates two Values entries with aligned 128-bit
 * loads/stores, consuming two coefficient pairs at a time.
 */
114 for(i
= 0;i
< IrSize
;i
+= 2)
116 const ALsizei o
= (Offset
+ i
)&HRIR_MASK
;
118 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
119 vals
= _mm_load_ps(&Values
[o
][0]);
120 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
121 _mm_store_ps(&Values
[o
][0], vals
);
/* Map the generic HRTF mixer names onto their SSE-suffixed names before
 * including mixer_inc.c, so those names in the included text expand to the
 * SSE variants. (The contents of mixer_inc.c are not visible here;
 * presumably it defines these functions in terms of ApplyCoeffs - confirm.)
 */
126 #define MixHrtf MixHrtf_SSE
127 #define MixHrtfBlend MixHrtfBlend_SSE
128 #define MixDirectHrtf MixDirectHrtf_SSE
129 #include "mixer_inc.c"
/* Mixes the samples in data into each of the OutChans rows of OutBuffer,
 * starting at column OutPos and scaling by a per-channel gain. The gain
 * starts at CurrentGains[c] and is stepped toward TargetGains[c].
 * NOTE(review): the end of the parameter list (BufferSize is used below),
 * some local declarations, part of the gain4 setup and the braces are not
 * visible in this view; comments describe only the statements shown.
 */
133 void Mix_SSE(const ALfloat
*data
, ALsizei OutChans
, ALfloat (*restrict OutBuffer
)[BUFFERSIZE
],
134 ALfloat
*CurrentGains
, const ALfloat
*TargetGains
, ALsizei Counter
, ALsizei OutPos
,
137 ALfloat gain
, delta
, step
;
/* delta is the per-sample fraction of the total gain change: 1/Counter, or
 * 0 when Counter is not positive.
 */
141 delta
= (Counter
> 0) ? 1.0f
/(ALfloat
)Counter
: 0.0f
;
143 for(c
= 0;c
< OutChans
;c
++)
/* Start from the channel's current gain; step is the per-sample increment
 * toward TargetGains[c].
 */
146 gain
= CurrentGains
[c
];
147 step
= (TargetGains
[c
] - gain
) * delta
;
/* Only ramp when the per-sample step is larger than FLT_EPSILON. */
148 if(fabsf(step
) > FLT_EPSILON
)
/* The ramp covers at most Counter samples, clamped to BufferSize. */
150 ALsizei minsize
= mini(BufferSize
, Counter
);
151 /* Mix with applying gain steps in aligned multiples of 4. */
/* NOTE(review): the next line is the tail of a multi-line expression whose
 * start is not visible; presumably the initializer holding four consecutive
 * gains for gain4 - confirm.
 */
159 gain
+ step
+ step
+ step
/* Each vector iteration advances every lane by four steps. */
161 step4
= _mm_set1_ps(step
+ step
+ step
+ step
);
/* Aligned loads/stores: &data[pos] and &OutBuffer[c][OutPos+pos] must be
 * 16-byte aligned here.
 */
163 const __m128 val4
= _mm_load_ps(&data
[pos
]);
164 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
165 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
166 gain4
= _mm_add_ps(gain4
, step4
);
167 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
169 } while(minsize
-pos
> 3);
170 /* NOTE: gain4 now represents the next four gains after the
171 * last four mixed samples, so the lowest element represents
172 * the next gain to apply.
174 gain
= _mm_cvtss_f32(gain4
);
176 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
177 for(;pos
< minsize
;pos
++)
179 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* NOTE(review): the lines guarding this assignment are not visible; as
 * shown, gain takes the target value and is written back to CurrentGains[c].
 */
183 gain
= TargetGains
[c
];
184 CurrentGains
[c
] = gain
;
186 /* Mix until pos is aligned with 4 or the mix is done. */
/* (pos+3)&~3 rounds pos up to the next multiple of 4. */
187 minsize
= mini(BufferSize
, (pos
+3)&~3);
188 for(;pos
< minsize
;pos
++)
189 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* The negated comparison is also true when gain is NaN.
 * NOTE(review): the statement controlled by this test is not visible;
 * presumably the rest of the channel is skipped - confirm.
 */
192 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
/* Constant-gain vector mix, four samples per iteration, using aligned
 * loads/stores.
 */
194 gain4
= _mm_set1_ps(gain
);
195 for(;BufferSize
-pos
> 3;pos
+= 4)
197 const __m128 val4
= _mm_load_ps(&data
[pos
]);
198 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
199 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
200 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
/* Scalar tail for the samples left over after the 4-wide loop. */
202 for(;pos
< BufferSize
;pos
++)
203 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* Accumulates InChans input rows into the single OutBuffer row: for each
 * channel c, data[c][InPos+pos]*Gains[c] is added to OutBuffer[pos] for pos
 * below BufferSize.
 * NOTE(review): the local declarations, the initialisation of pos and the
 * braces are not visible in this view; comments describe only the
 * statements shown.
 */
207 void MixRow_SSE(ALfloat
*OutBuffer
, const ALfloat
*Gains
, const ALfloat (*restrict data
)[BUFFERSIZE
], ALsizei InChans
, ALsizei InPos
, ALsizei BufferSize
)
212 for(c
= 0;c
< InChans
;c
++)
215 ALfloat gain
= Gains
[c
];
/* The negated comparison is also true when gain is NaN.
 * NOTE(review): the statement controlled by this test is not visible;
 * presumably the channel is skipped - confirm.
 */
216 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
/* Vector mix, four samples per iteration. The aligned loads/stores require
 * &data[c][InPos+pos] and &OutBuffer[pos] to be 16-byte aligned.
 */
219 gain4
= _mm_set1_ps(gain
);
220 for(;BufferSize
-pos
> 3;pos
+= 4)
222 const __m128 val4
= _mm_load_ps(&data
[c
][InPos
+pos
]);
223 __m128 dry4
= _mm_load_ps(&OutBuffer
[pos
]);
224 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
225 _mm_store_ps(&OutBuffer
[pos
], dry4
);
/* Scalar tail for the samples left over after the 4-wide loop. */
227 for(;pos
< BufferSize
;pos
++)
228 OutBuffer
[pos
] += data
[c
][InPos
+pos
]*gain
;