11 #include "alAuxEffectSlot.h"
12 #include "mixer_defs.h"
/* Resample_bsinc32_SSE: SSE resampler. For every output sample it builds a
 * filter from the tables in state->bsinc, multiplies it with the source
 * samples and stores the summed result in dst.
 *
 * NOTE(review): this listing skips lines (the embedded line numbers jump), so
 * the last parameter(s), the opening brace, the declarations of i, j, pi, pf
 * and r4, the header of the inner loop over j, any update of frac, and the
 * return statement are not visible. The comments below cover only what is
 * shown.
 *
 *   state     - supplies bsinc.filter, bsinc.sf, bsinc.m and bsinc.l.
 *   src       - input samples; read with unaligned 4-float loads.
 *   frac      - fixed-point read position: bits at and above FRACTIONBITS are
 *               used as a whole-sample advance of src, the bits below select
 *               and interpolate the filter phase.
 *   increment - its use is not visible in this listing.
 *   dst       - output; dst[i] is written once per outer-loop iteration.
 */
15 const ALfloat
*Resample_bsinc32_SSE(const InterpState
*state
, const ALfloat
*restrict src
,
16 ALsizei frac
, ALint increment
, ALfloat
*restrict dst
,
/* Base pointer of the filter table held in the bsinc state. */
19 const ALfloat
*filter
= state
->bsinc
.filter
;
/* state->bsinc.sf broadcast to all four lanes; it weights the scd and spd
 * tables in the filter formula further down. */
20 const __m128 sf4
= _mm_set1_ps(state
->bsinc
.sf
);
/* m is the stride, in floats, between the four per-phase tables below. */
21 const ALsizei m
= state
->bsinc
.m
;
/* Per-sample pointers into the filter table; assigned inside the loop. */
22 const ALfloat
*fil
, *scd
, *phd
, *spd
;
/* Offset the source pointer by state->bsinc.l before filtering. */
27 src
+= state
->bsinc
.l
;
/* One output sample per iteration; dstlen is declared on a line not shown. */
28 for(i
= 0;i
< dstlen
;i
++)
30 // Calculate the phase index and factor.
31 #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
/* pi: the bits of frac above FRAC_PHASE_BITDIFF select the phase. */
32 pi
= frac
>> FRAC_PHASE_BITDIFF
;
/* pf: the remaining low bits, scaled by 1/(1<<FRAC_PHASE_BITDIFF) so that a
 * non-negative frac yields a factor in [0, 1). */
33 pf
= (frac
& ((1<<FRAC_PHASE_BITDIFF
)-1)) * (1.0f
/(1<<FRAC_PHASE_BITDIFF
));
34 #undef FRAC_PHASE_BITDIFF
/* Each phase occupies 4*m floats: four consecutive m-float tables fil, scd,
 * phd and spd. ASSUME_ALIGNED is defined elsewhere; presumably it is a
 * 16-byte alignment hint that yields the same pointer -- TODO confirm. The
 * aligned loads (LD4) on these pointers do require 16-byte alignment. */
36 fil
= ASSUME_ALIGNED(filter
+ m
*pi
*4, 16);
37 scd
= ASSUME_ALIGNED(fil
+ m
, 16);
38 phd
= ASSUME_ALIGNED(scd
+ m
, 16);
39 spd
= ASSUME_ALIGNED(phd
+ m
, 16);
41 // Apply the scale and phase interpolated filter.
/* r4 holds four partial sums for the current output sample. */
42 r4
= _mm_setzero_ps();
/* Phase factor broadcast to all four lanes. */
44 const __m128 pf4
= _mm_set1_ps(pf
);
/* LD4: aligned 4-float load; ULD4: unaligned 4-float load;
 * MLA4(x, y, z): x + y*z as a separate multiply and add. */
45 #define LD4(x) _mm_load_ps(x)
46 #define ULD4(x) _mm_loadu_ps(x)
47 #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
50 /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
/* j indexes four filter coefficients at a time; the loop header that drives
 * it is on a line not included in this listing. */
51 const __m128 f4
= MLA4(MLA4(LD4(&fil
[j
]), sf4
, LD4(&scd
[j
])),
52 pf4
, MLA4(LD4(&phd
[j
]), sf4
, LD4(&spd
[j
]))
/* Accumulate four coefficient*sample products; src needs no alignment here. */
55 r4
= MLA4(r4
, f4
, ULD4(&src
[j
]));
/* Horizontal sum, step 1: add r4 to its lane-reversed copy, giving
 * (r0+r3, r1+r2, r2+r1, r3+r0). */
61 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
/* Step 2: add the upper two lanes onto the lower two, so lane 0 holds the sum
 * of all four original lanes. */
62 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
/* Store lane 0 as the output sample. */
63 dst
[i
] = _mm_cvtss_f32(r4
);
/* Advance the source pointer by the whole-sample part of frac. */
66 src
+= frac
>>FRACTIONBITS
;
/* ApplyCoeffs: scales coefficient pairs from Coeffs by (left, right) and
 * accumulates them into Values at indices derived from Offset and masked with
 * HRIR_MASK.
 *
 * NOTE(review): this listing skips lines, so the declaration of IrSize (used
 * below), the declarations of coeffs/imp0/imp1/i, the braces, and the
 * condition that selects between the two code paths are not visible.
 *
 *   Offset - base index into Values; every index is ANDed with HRIR_MASK
 *            (presumably a power-of-two-minus-one wrap mask -- TODO confirm).
 *   Values - array of 2-float pairs that is read, accumulated into and
 *            written back.
 *   Coeffs - array of 2-float coefficient pairs; read with aligned loads.
 *   left   - multiplier for element [0] of each coefficient pair.
 *   right  - multiplier for element [1] of each coefficient pair.
 */
73 static inline void ApplyCoeffs(ALsizei Offset
, ALfloat (*restrict Values
)[2],
75 const ALfloat (*restrict Coeffs
)[2],
76 ALfloat left
, ALfloat right
)
/* Lane layout (left, right, left, right): one multiply scales two consecutive
 * coefficient pairs. */
78 const __m128 lrlr
= _mm_setr_ps(left
, right
, left
, right
);
/* Zero vals so the upper lanes are defined before the half-width loads. */
79 __m128 vals
= _mm_setzero_ps();
/* ASSUME_ALIGNED is defined elsewhere; presumably a 16-byte alignment hint.
 * The aligned loads and stores below do require 16-byte alignment. */
83 Values
= ASSUME_ALIGNED(Values
, 16);
84 Coeffs
= ASSUME_ALIGNED(Coeffs
, 16);
/* First path. It touches Values[o0] and Values[o1] with 64-bit half accesses
 * and everything between with aligned 128-bit accesses starting at Offset+1.
 * NOTE(review): that pattern suggests this path serves an odd Offset; the
 * guarding condition is not shown -- confirm against the full source. */
/* o0: first Values index; o1: last Values index for IrSize coefficients. */
87 const ALsizei o0
= Offset
&HRIR_MASK
;
88 const ALsizei o1
= (Offset
+IrSize
-1)&HRIR_MASK
;
/* Load coefficient pairs 0 and 1 (four floats). */
91 coeffs
= _mm_load_ps(&Coeffs
[0][0]);
/* Load only Values[o0] (two floats) into the low half of vals. */
92 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o0
][0]);
/* imp0 = scaled pairs 0 and 1; only the low half (pair 0) is stored now. */
93 imp0
= _mm_mul_ps(lrlr
, coeffs
);
94 vals
= _mm_add_ps(imp0
, vals
);
95 _mm_storel_pi((__m64
*)&Values
[o0
][0], vals
);
/* Middle section: two Values entries per iteration, starting at Offset+1. */
96 for(i
= 1;i
< IrSize
-1;i
+= 2)
98 const ALsizei o2
= (Offset
+i
)&HRIR_MASK
;
/* Load coefficient pairs i+1 and i+2. */
100 coeffs
= _mm_load_ps(&Coeffs
[i
+1][0]);
/* Load Values[o2] and the entry after it (four floats, aligned). */
101 vals
= _mm_load_ps(&Values
[o2
][0]);
102 imp1
= _mm_mul_ps(lrlr
, coeffs
);
/* Combine the upper half of imp0 with the lower half of imp1, so the vector
 * lines up with the two Values entries loaded above. NOTE(review): any
 * hand-over of imp1 into imp0 for the next iteration is not visible here. */
103 imp0
= _mm_shuffle_ps(imp0
, imp1
, _MM_SHUFFLE(1, 0, 3, 2));
104 vals
= _mm_add_ps(imp0
, vals
);
105 _mm_store_ps(&Values
[o2
][0], vals
);
/* Tail: load Values[o1] into the low half, move the upper half of imp0 down
 * to the low half, add, and store only Values[o1]. */
108 vals
= _mm_loadl_pi(vals
, (__m64
*)&Values
[o1
][0]);
109 imp0
= _mm_movehl_ps(imp0
, imp0
);
110 vals
= _mm_add_ps(imp0
, vals
);
111 _mm_storel_pi((__m64
*)&Values
[o1
][0], vals
);
/* Second path: every access is a full aligned 4-float load or store, handling
 * coefficient pairs i and i+1 together. */
115 for(i
= 0;i
< IrSize
;i
+= 2)
117 const ALsizei o
= (Offset
+ i
)&HRIR_MASK
;
/* Values[o..o+1] += (left, right, left, right) * Coeffs[i..i+1]. */
119 coeffs
= _mm_load_ps(&Coeffs
[i
][0]);
120 vals
= _mm_load_ps(&Values
[o
][0]);
121 vals
= _mm_add_ps(vals
, _mm_mul_ps(lrlr
, coeffs
));
122 _mm_store_ps(&Values
[o
][0], vals
);
/* Map the generic names MixHrtf, MixHrtfBlend and MixDirectHrtf to their
 * _SSE-suffixed names before including mixer_inc.c.
 * NOTE(review): presumably mixer_inc.c defines functions under the generic
 * names and builds them on the ApplyCoeffs defined in this file; the included
 * file is not visible here -- confirm. */
127 #define MixHrtf MixHrtf_SSE
128 #define MixHrtfBlend MixHrtfBlend_SSE
129 #define MixDirectHrtf MixDirectHrtf_SSE
130 #include "mixer_inc.c"
/* Mix_SSE: adds the samples in data, scaled by a per-channel gain, into each
 * of OutChans rows of OutBuffer starting at column OutPos. While a channel's
 * gain differs from its target, the gain is stepped toward TargetGains[c];
 * the reached gain is written back to CurrentGains[c].
 *
 * NOTE(review): this listing skips lines, so the remaining parameter(s)
 * (BufferSize is used but its declaration is not shown), the declarations of
 * c, pos, gain4 and step4, the braces, the opener of the do/while loop and
 * several guarded statements are not visible.
 *
 *   data         - input samples; read with aligned 4-float loads in the
 *                  vector loops, so &data[pos] must be 16-byte aligned there.
 *   OutChans     - number of output channels (rows of OutBuffer) to mix into.
 *   OutBuffer    - output rows of BUFFERSIZE floats; accumulated in place.
 *   CurrentGains - per-channel starting gain; updated on exit of each channel.
 *   TargetGains  - per-channel gain to step toward.
 *   Counter      - number of samples over which the gain difference is spread.
 *   OutPos       - starting column within each output row.
 */
134 void Mix_SSE(const ALfloat
*data
, ALsizei OutChans
, ALfloat (*restrict OutBuffer
)[BUFFERSIZE
],
135 ALfloat
*CurrentGains
, const ALfloat
*TargetGains
, ALsizei Counter
, ALsizei OutPos
,
138 ALfloat gain
, delta
, step
;
/* delta is 1/Counter, or 0 when Counter is not positive; a zero delta makes
 * step zero for finite gains, so the stepping branch below is not taken. */
142 delta
= (Counter
> 0) ? 1.0f
/(ALfloat
)Counter
: 0.0f
;
/* Process each output channel independently. */
144 for(c
= 0;c
< OutChans
;c
++)
/* Start from the stored gain and compute the per-sample gain increment. */
147 gain
= CurrentGains
[c
];
148 step
= (TargetGains
[c
] - gain
) * delta
;
/* Take the stepping path only when the increment exceeds FLT_EPSILON. */
149 if(fabsf(step
) > FLT_EPSILON
)
/* Stepping covers at most Counter samples and never more than BufferSize. */
151 ALsizei minsize
= mini(BufferSize
, Counter
);
152 /* Mix with applying gain steps in aligned multiples of 4. */
/* Fragment of a larger statement whose start is not in this listing;
 * presumably one lane of the initial gain4 vector (gain plus three steps,
 * written as repeated additions) -- TODO confirm. */
160 gain
+ step
+ step
+ step
/* Each vector iteration advances every lane's gain by four steps. */
162 step4
= _mm_set1_ps(step
+ step
+ step
+ step
);
/* Aligned loads: four input samples and the four matching output samples. */
164 const __m128 val4
= _mm_load_ps(&data
[pos
]);
165 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
/* out += in * gain for four samples, then advance the four gains. */
166 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
167 gain4
= _mm_add_ps(gain4
, step4
);
168 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
/* Repeat while at least four stepped samples remain. The advance of pos is on
 * a line not shown. */
170 } while(minsize
-pos
> 3);
/* NOTE(review): the terminator of the NOTE comment that follows is on a line
 * not included in this listing; the text is kept exactly as found. */
171 /* NOTE: gain4 now represents the next four gains after the
172 * last four mixed samples, so the lowest element represents
173 * the next gain to apply.
175 gain
= _mm_cvtss_f32(gain4
);
177 /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
/* Scalar loop for the remaining stepped samples. Any per-sample update of
 * gain inside this loop is on a line not shown. */
178 for(;pos
< minsize
;pos
++)
180 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* Set gain to the target value. NOTE(review): a guarding condition, if any,
 * is on a line not shown. */
184 gain
= TargetGains
[c
];
/* Write the gain reached for this channel back to the caller's array. */
185 CurrentGains
[c
] = gain
;
187 /* Mix until pos is aligned with 4 or the mix is done. */
/* (pos+3)&~3 rounds pos up to the next multiple of four; mini caps it at
 * BufferSize. */
188 minsize
= mini(BufferSize
, (pos
+3)&~3);
189 for(;pos
< minsize
;pos
++)
190 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* True when |gain| is at or below GAIN_SILENCE_THRESHOLD, and also when gain
 * is NaN (the inner comparison is false then). The statement this guards is
 * on a line not shown. */
193 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
/* Constant-gain vector loop: the same gain in all four lanes. */
195 gain4
= _mm_set1_ps(gain
);
196 for(;BufferSize
-pos
> 3;pos
+= 4)
198 const __m128 val4
= _mm_load_ps(&data
[pos
]);
199 __m128 dry4
= _mm_load_ps(&OutBuffer
[c
][OutPos
+pos
]);
200 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
201 _mm_store_ps(&OutBuffer
[c
][OutPos
+pos
], dry4
);
/* Scalar tail for the final samples that do not fill a full vector. */
203 for(;pos
< BufferSize
;pos
++)
204 OutBuffer
[c
][OutPos
+pos
] += data
[pos
]*gain
;
/* MixRow_SSE: accumulates InChans input rows into one output row. For each
 * input channel c, data[c][InPos+pos] * Gains[c] is added to OutBuffer[pos]
 * for pos up to BufferSize.
 *
 * NOTE(review): this listing skips lines, so the declarations of c, pos and
 * gain4, the braces, any reset of pos per channel, and the statement guarded
 * by the silence test are not visible.
 *
 *   OutBuffer  - output row, accumulated in place; accessed with aligned
 *                4-float loads/stores in the vector loop.
 *   Gains      - one gain per input channel.
 *   data       - input rows of BUFFERSIZE floats; read with aligned loads at
 *                column InPos+pos in the vector loop.
 *   InChans    - number of input channels (rows of data).
 *   InPos      - starting column within each input row.
 *   BufferSize - number of samples to mix.
 */
208 void MixRow_SSE(ALfloat
*OutBuffer
, const ALfloat
*Gains
, const ALfloat (*restrict data
)[BUFFERSIZE
], ALsizei InChans
, ALsizei InPos
, ALsizei BufferSize
)
/* Process each input channel in turn. */
213 for(c
= 0;c
< InChans
;c
++)
216 ALfloat gain
= Gains
[c
];
/* True when |gain| is at or below GAIN_SILENCE_THRESHOLD, and also when gain
 * is NaN (the inner comparison is false then). The statement this guards is
 * on a line not shown. */
217 if(!(fabsf(gain
) > GAIN_SILENCE_THRESHOLD
))
/* Vector loop: the channel gain in all four lanes, four samples at a time. */
220 gain4
= _mm_set1_ps(gain
);
221 for(;BufferSize
-pos
> 3;pos
+= 4)
/* Aligned loads of four input samples and the four matching output samples. */
223 const __m128 val4
= _mm_load_ps(&data
[c
][InPos
+pos
]);
224 __m128 dry4
= _mm_load_ps(&OutBuffer
[pos
]);
/* out += in * gain for four samples. */
225 dry4
= _mm_add_ps(dry4
, _mm_mul_ps(val4
, gain4
));
226 _mm_store_ps(&OutBuffer
[pos
], dry4
);
/* Scalar tail for the final samples that do not fill a full vector. */
228 for(;pos
< BufferSize
;pos
++)
229 OutBuffer
[pos
] += data
[c
][InPos
+pos
]*gain
;