From 1b7c5540687d31d75b54ae85b3b7b4c2b5bc99ef Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Mon, 15 Dec 2014 17:13:31 -0800 Subject: [PATCH] Multiply samples with the cubic coeffs before transposing This avoids having to transpose the cubic coefficients. --- Alc/mixer_sse2.c | 13 +++++++------ Alc/mixer_sse41.c | 13 +++++++------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/Alc/mixer_sse2.c b/Alc/mixer_sse2.c index 1c71b458..fbc2b629 100644 --- a/Alc/mixer_sse2.c +++ b/Alc/mixer_sse2.c @@ -106,14 +106,15 @@ const ALfloat *Resample_cubic32_SSE2(const ALfloat *src, ALuint frac, ALuint inc __m128 k3 = _mm_load_ps(CubicLUT[frac_.i[3]]); __m128 out; + val0 = _mm_mul_ps(val0, k0); + val1 = _mm_mul_ps(val1, k1); + val2 = _mm_mul_ps(val2, k2); + val3 = _mm_mul_ps(val3, k3); _MM_TRANSPOSE4_PS(val0, val1, val2, val3); - _MM_TRANSPOSE4_PS(k0, k1, k2, k3); + out = _mm_add_ps(val0, val1); + out = _mm_add_ps(out, val2); + out = _mm_add_ps(out, val3); - /* k0*val0 + k1*val1 + k2*val2 + k3*val3 */ - out = _mm_mul_ps(k0, val0); - out = _mm_add_ps(out, _mm_mul_ps(k1, val1)); - out = _mm_add_ps(out, _mm_mul_ps(k2, val2)); - out = _mm_add_ps(out, _mm_mul_ps(k3, val3)); _mm_store_ps(&dst[i], out); frac4 = _mm_add_epi32(frac4, increment4); diff --git a/Alc/mixer_sse41.c b/Alc/mixer_sse41.c index 9ea4379a..36f06255 100644 --- a/Alc/mixer_sse41.c +++ b/Alc/mixer_sse41.c @@ -110,14 +110,15 @@ const ALfloat *Resample_cubic32_SSE41(const ALfloat *src, ALuint frac, ALuint in __m128 k3 = _mm_load_ps(CubicLUT[frac_.i[3]]); __m128 out; + val0 = _mm_mul_ps(val0, k0); + val1 = _mm_mul_ps(val1, k1); + val2 = _mm_mul_ps(val2, k2); + val3 = _mm_mul_ps(val3, k3); _MM_TRANSPOSE4_PS(val0, val1, val2, val3); - _MM_TRANSPOSE4_PS(k0, k1, k2, k3); + out = _mm_add_ps(val0, val1); + out = _mm_add_ps(out, val2); + out = _mm_add_ps(out, val3); - /* k0*val0 + k1*val1 + k2*val2 + k3*val3 */ - out = _mm_mul_ps(k0, val0); - out = _mm_add_ps(out, _mm_mul_ps(k1, val1)); - out = _mm_add_ps(out, _mm_mul_ps(k2, val2)); - out = _mm_add_ps(out, _mm_mul_ps(k3, val3)); _mm_store_ps(&dst[i], out); frac4 = _mm_add_epi32(frac4, increment4); -- 2.11.4.GIT