From d1c93525ed6f3e2ff42d6e7dcaf0c75521fa1556 Mon Sep 17 00:00:00 2001
From: Chris Robinson <chris.kcat@gmail.com>
Date: Tue, 18 Sep 2012 18:08:57 -0700
Subject: [PATCH] Fix matrix multiply used by the SSE cubic resampler

Also remove the 4-sample loop. It's not terribly effective.
---
 Alc/mixer_sse.c | 85 +++++++++++++++------------------------------------------
 1 file changed, 22 insertions(+), 63 deletions(-)

diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c
index d06455dd..22a7db40 100644
--- a/Alc/mixer_sse.c
+++ b/Alc/mixer_sse.c
@@ -60,84 +60,43 @@ void Resample_lerp32_SSE(const ALfloat *data, ALuint frac,
 }
 
 void Resample_cubic32_SSE(const ALfloat *data, ALuint frac,
-  ALuint increment, ALuint NumChannels, ALfloat *RESTRICT OutBuffer,
+  ALuint increment, ALuint channels, ALfloat *RESTRICT OutBuffer,
   ALuint BufferSize)
 {
     /* Cubic interpolation mainly consists of a matrix4 * vector4 operation,
      * followed by scalars being applied to the resulting elements before all
      * four are added together for the final sample. */
     static const __m128 matrix[4] = {
-        { -0.5,  1.0f, -0.5f,  0.0f },
-        {  1.5, -2.5f,  0.0f,  1.0f },
-        { -1.5,  2.0f,  0.5f,  0.0f },
-        {  0.5, -0.5f,  0.0f,  0.0f },
+        { -0.5f,  1.0f, -0.5f,  0.0f },
+        {  1.5f, -2.5f,  0.0f,  1.0f },
+        { -1.5f,  2.0f,  0.5f,  0.0f },
+        {  0.5f, -0.5f,  0.0f,  0.0f },
     };
     ALIGN(16) float value[4];
     ALuint pos = 0;
-    ALuint i, j;
-
-    for(i = 0;i < BufferSize+1-3;i+=4)
-    {
-        __m128 result, final[4];
-
-        for(j = 0;j < 4;j++)
-        {
-            __m128 val4, s;
-            ALfloat mu;
-
-            val4 = _mm_set_ps(data[(pos-1)*NumChannels],
-                              data[(pos  )*NumChannels],
-                              data[(pos+1)*NumChannels],
-                              data[(pos+2)*NumChannels]);
-            mu = frac * (1.0f/FRACTIONONE);
-            s = _mm_set_ps(1.0f, mu, mu*mu, mu*mu*mu);
-
-            /* result = matrix * val4 */
-            result =                    _mm_mul_ps(val4, matrix[0]) ;
-            result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[1]));
-            result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[2]));
-            result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[3]));
-
-            /* final[j] = result * { mu^0, mu^1, mu^2, mu^3 } */
-            final[j] = _mm_mul_ps(result, s);
+    ALuint i;
 
-            frac += increment;
-            pos  += frac>>FRACTIONBITS;
-            frac &= FRACTIONMASK;
-        }
-        /* Transpose the final "matrix" so adding the rows will give the four
-         * samples. TODO: Is this faster than doing..
-         * _mm_store_ps(value, result);
-         * OutBuffer[i] = value[0] + value[1] + value[2] + value[3];
-         * ..for each sample?
-         */
-        _MM_TRANSPOSE4_PS(final[0], final[1], final[2], final[3]);
-        result = _mm_add_ps(_mm_add_ps(final[0], final[1]),
-                            _mm_add_ps(final[2], final[3]));
-
-        _mm_store_ps(&OutBuffer[i], result);
-    }
-    for(;i < BufferSize+1;i++)
+    for(i = 0;i < BufferSize+1;i++)
     {
-        __m128 val4, s, result;
+        __m128 res1, res2;
         ALfloat mu;
 
-        val4 = _mm_set_ps(data[(pos-1)*NumChannels],
-                          data[(pos  )*NumChannels],
-                          data[(pos+1)*NumChannels],
-                          data[(pos+2)*NumChannels]);
-        mu = frac * (1.0f/FRACTIONONE);
-        s = _mm_set_ps(1.0f, mu, mu*mu, mu*mu*mu);
+        /* matrix * { samples } */
+        res1 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(data[(pos-1)*channels]), matrix[0]),
+                          _mm_mul_ps(_mm_set1_ps(data[(pos  )*channels]), matrix[1]));
+        res2 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(data[(pos+1)*channels]), matrix[2]),
+                          _mm_mul_ps(_mm_set1_ps(data[(pos+2)*channels]), matrix[3]));
+        res1 = _mm_add_ps(res1, res2);
 
-        /* result = matrix * val4 */
-        result =                    _mm_mul_ps(val4, matrix[0]) ;
-        result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[1]));
-        result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[2]));
-        result = _mm_add_ps(result, _mm_mul_ps(val4, matrix[3]));
-
-        /* value = result * { mu^0, mu^1, mu^2, mu^3 } */
-        _mm_store_ps(value, _mm_mul_ps(result, s));
+        /* res1 * { mu^3, mu^2, mu^1, mu^0 } */
+        mu = frac * (1.0f/FRACTIONONE);
+        value[0] = mu*mu*mu;
+        value[1] = mu*mu;
+        value[2] = mu;
+        value[3] = 1.0f;
+        res1 = _mm_mul_ps(res1, _mm_load_ps(value));
 
+        _mm_store_ps(value, res1);
         OutBuffer[i] = value[0] + value[1] + value[2] + value[3];
 
         frac += increment;
-- 
2.11.4.GIT