From 45d6bb58a4293c5b1ab229cea86e0ef24a2a084b Mon Sep 17 00:00:00 2001
From: Chris Robinson <chris.kcat@gmail.com>
Date: Sun, 23 Nov 2014 10:49:54 -0800
Subject: [PATCH] Partially revert "Use a different method for HRTF mixing"

The sound localization with virtual channel mixing was just too poor, so while
it's more costly to do per-source HRTF mixing, it's unavoidable if you want
good localization.

This is only partially reverted because having the virtual channel is still
beneficial, particularly with B-Format rendering and effect mixing which
otherwise skip HRTF processing. As before, the number of virtual channels can
potentially be customized, specifying more or fewer channels depending on the
system's needs.
---
 Alc/ALu.c                 | 106 ++++++++++++++++++++++-
 Alc/hrtf.c                | 214 +++++++++++++++++++++++++++++++++++++++++++---
 Alc/hrtf.h                |   4 +-
 Alc/mixer.c               |  25 +++++-
 Alc/mixer_c.c             |  17 ++++
 Alc/mixer_defs.h          |  15 ++--
 Alc/mixer_inc.c           |  48 +++++++++--
 Alc/mixer_neon.c          |  32 +++++++
 Alc/mixer_sse.c           |  62 ++++++++++++++
 Alc/panning.c             |   6 +-
 OpenAL32/Include/alMain.h |   2 +
 OpenAL32/Include/alu.h    |  10 ++-
 OpenAL32/alSource.c       |  11 +++
 13 files changed, 520 insertions(+), 32 deletions(-)

diff --git a/Alc/ALu.c b/Alc/ALu.c
index f7408da0..c4292833 100644
--- a/Alc/ALu.c
+++ b/Alc/ALu.c
@@ -527,6 +527,39 @@ ALvoid CalcNonAttnSourceParams(ALvoice *voice, const ALsource *ALSource, const A
 
         voice->IsHrtf = AL_FALSE;
     }
+    else if(Device->Hrtf)
+    {
+        voice->Direct.OutBuffer = &voice->Direct.OutBuffer[voice->Direct.OutChannels];
+        voice->Direct.OutChannels = 2;
+        for(c = 0;c < num_channels;c++)
+        {
+            if(chans[c].channel == LFE)
+            {
+                /* Skip LFE */
+                voice->Direct.Hrtf.Params[c].Delay[0] = 0;
+                voice->Direct.Hrtf.Params[c].Delay[1] = 0;
+                for(i = 0;i < HRIR_LENGTH;i++)
+                {
+                    voice->Direct.Hrtf.Params[c].Coeffs[i][0] = 0.0f;
+                    voice->Direct.Hrtf.Params[c].Coeffs[i][1] = 0.0f;
+                }
+            }
+            else
+            {
+                /* Get the static HRIR coefficients and delays for this
+                 * channel. */
+                GetLerpedHrtfCoeffs(Device->Hrtf,
+                                    chans[c].elevation, chans[c].angle, 1.0f, DryGain,
+                                    voice->Direct.Hrtf.Params[c].Coeffs,
+                                    voice->Direct.Hrtf.Params[c].Delay);
+            }
+        }
+        voice->Direct.Counter = 0;
+        voice->Direct.Moving  = AL_TRUE;
+        voice->Direct.Hrtf.IrSize = GetHrtfIrSize(Device->Hrtf);
+
+        voice->IsHrtf = AL_TRUE;
+    }
     else
     {
         for(c = 0;c < num_channels;c++)
@@ -934,6 +967,73 @@ ALvoid CalcSourceParams(ALvoice *voice, const ALsource *ALSource, const ALCconte
         BufferListItem = BufferListItem->next;
     }
 
+    if(Device->Hrtf)
+    {
+        /* Use a binaural HRTF algorithm for stereo headphone playback */
+        ALfloat delta, ev = 0.0f, az = 0.0f;
+        ALfloat radius = ALSource->Radius;
+        ALfloat dirfact = 1.0f;
+
+        voice->Direct.OutBuffer = &voice->Direct.OutBuffer[voice->Direct.OutChannels];
+        voice->Direct.OutChannels = 2;
+
+        if(Distance > FLT_EPSILON)
+        {
+            ALfloat invlen = 1.0f/Distance;
+            Position[0] *= invlen;
+            Position[1] *= invlen;
+            Position[2] *= invlen;
+
+            /* Calculate elevation and azimuth only when the source is not at
+             * the listener. This prevents +0 and -0 Z from producing
+             * inconsistent panning. Also, clamp Y in case FP precision errors
+             * cause it to land outside of -1..+1. */
+            ev = asinf(clampf(Position[1], -1.0f, 1.0f));
+            az = atan2f(Position[0], -Position[2]*ZScale);
+        }
+        if(radius > Distance)
+            dirfact *= Distance / radius;
+
+        /* Check to see if the HRIR is already moving. */
+        if(voice->Direct.Moving)
+        {
+            /* Calculate the normalized HRTF transition factor (delta). */
+            delta = CalcHrtfDelta(voice->Direct.Hrtf.Gain, DryGain,
+                                  voice->Direct.Hrtf.Dir, Position);
+            /* If the delta is large enough, get the moving HRIR target
+             * coefficients, target delays, stepping values, and counter. */
+            if(delta > 0.001f)
+            {
+                ALuint counter = GetMovingHrtfCoeffs(Device->Hrtf,
+                    ev, az, dirfact, DryGain, delta, voice->Direct.Counter,
+                    voice->Direct.Hrtf.Params[0].Coeffs, voice->Direct.Hrtf.Params[0].Delay,
+                    voice->Direct.Hrtf.Params[0].CoeffStep, voice->Direct.Hrtf.Params[0].DelayStep
+                );
+                voice->Direct.Counter = counter;
+                voice->Direct.Hrtf.Gain = DryGain;
+                voice->Direct.Hrtf.Dir[0] = Position[0];
+                voice->Direct.Hrtf.Dir[1] = Position[1];
+                voice->Direct.Hrtf.Dir[2] = Position[2];
+            }
+        }
+        else
+        {
+            /* Get the initial (static) HRIR coefficients and delays. */
+            GetLerpedHrtfCoeffs(Device->Hrtf, ev, az, dirfact, DryGain,
+                                voice->Direct.Hrtf.Params[0].Coeffs,
+                                voice->Direct.Hrtf.Params[0].Delay);
+            voice->Direct.Counter = 0;
+            voice->Direct.Moving  = AL_TRUE;
+            voice->Direct.Hrtf.Gain = DryGain;
+            voice->Direct.Hrtf.Dir[0] = Position[0];
+            voice->Direct.Hrtf.Dir[1] = Position[1];
+            voice->Direct.Hrtf.Dir[2] = Position[2];
+        }
+        voice->Direct.Hrtf.IrSize = GetHrtfIrSize(Device->Hrtf);
+
+        voice->IsHrtf = AL_TRUE;
+    }
+    else
     {
         MixGains *gains = voice->Direct.Gains[0];
         ALfloat radius = ALSource->Radius;
@@ -1168,8 +1268,10 @@ ALvoid aluMixData(ALCdevice *device, ALvoid *buffer, ALsizei size)
             HrtfMixerFunc HrtfMix = SelectHrtfMixer();
             ALuint irsize = GetHrtfIrSize(device->Hrtf);
             for(c = 0;c < device->NumChannels;c++)
-                HrtfMix(&device->DryBuffer[outchanoffset], device->DryBuffer[c], device->Hrtf_Offset, irsize,
-                        &device->Hrtf_Params[c], &device->Hrtf_State[c], SamplesToDo);
+                HrtfMix(&device->DryBuffer[outchanoffset], device->DryBuffer[c], 0.0f,
+                    device->Hrtf_Offset, 0.0f, irsize, &device->Hrtf_Params[c],
+                    &device->Hrtf_State[c], SamplesToDo
+                );
             device->Hrtf_Offset += SamplesToDo;
         }
         else if(device->Bs2b)
diff --git a/Alc/hrtf.c b/Alc/hrtf.c
index 2e4156a0..1e371fa4 100644
--- a/Alc/hrtf.c
+++ b/Alc/hrtf.c
@@ -58,6 +58,10 @@ struct Hrtf {
 static const ALchar magicMarker00[8] = "MinPHR00";
 static const ALchar magicMarker01[8] = "MinPHR01";
 
+/* First value for pass-through coefficients (remaining are 0), used for omni-
+ * directional sounds. */
+static const ALfloat PassthruCoeff = 32767.0f * 0.707106781187f/*sqrt(0.5)*/;
+
 static struct Hrtf *LoadedHrtfs = NULL;
 
 /* Calculate the elevation indices given the polar elevation in radians.
@@ -84,12 +88,45 @@ static void CalcAzIndices(ALuint azcount, ALfloat az, ALuint *azidx, ALfloat *az
     *azmu = az - floorf(az);
 }
 
+/* Calculates the normalized HRTF transition factor (delta) from the changes
+ * in gain and listener to source angle between updates.  The result is a
+ * normalized delta factor that can be used to calculate moving HRIR stepping
+ * values.
+ */
+ALfloat CalcHrtfDelta(ALfloat oldGain, ALfloat newGain, const ALfloat olddir[3], const ALfloat newdir[3])
+{
+    ALfloat gainChange, angleChange, change;
+
+    // Calculate the normalized dB gain change.
+    newGain = maxf(newGain, 0.0001f);
+    oldGain = maxf(oldGain, 0.0001f);
+    gainChange = fabsf(log10f(newGain / oldGain) / log10f(0.0001f));
+
+    // Calculate the angle change only when there is enough gain to notice it.
+    angleChange = 0.0f;
+    if(gainChange > 0.0001f || newGain > 0.0001f)
+    {
+        // No angle change when the directions are equal or degenerate (when
+        // both have zero length).
+        if(newdir[0] != olddir[0] || newdir[1] != olddir[1] || newdir[2] != olddir[2])
+        {
+            ALfloat dotp = olddir[0]*newdir[0] + olddir[1]*newdir[1] + olddir[2]*newdir[2];
+            angleChange = acosf(clampf(dotp, -1.0f, 1.0f)) / F_PI;
+        }
+    }
+
+    // Use the larger of the two changes for the delta factor, and apply a
+    // significance shaping function to it.
+    change = maxf(angleChange * 25.0f, gainChange) * 2.0f;
+    return minf(change, 1.0f);
+}
+
 /* Calculates static HRIR coefficients and delays for the given polar
  * elevation and azimuth in radians.  Linear interpolation is used to
  * increase the apparent resolution of the HRIR data set.  The coefficients
  * are also normalized and attenuated by the specified gain.
  */
-void GetLerpedHrtfCoeffs(const struct Hrtf *Hrtf, ALfloat elevation, ALfloat azimuth, ALfloat (*coeffs)[2], ALuint *delays)
+void GetLerpedHrtfCoeffs(const struct Hrtf *Hrtf, ALfloat elevation, ALfloat azimuth, ALfloat dirfact, ALfloat gain, ALfloat (*coeffs)[2], ALuint *delays)
 {
     ALuint evidx[2], lidx[4], ridx[4];
     ALfloat mu[3], blend[4];
@@ -121,12 +158,12 @@ void GetLerpedHrtfCoeffs(const struct Hrtf *Hrtf, ALfloat elevation, ALfloat azi
     blend[3] = (     mu[1]) * (     mu[2]);
 
     /* Calculate the HRIR delays using linear interpolation. */
-    delays[0] = fastf2u(Hrtf->delays[lidx[0]]*blend[0] + Hrtf->delays[lidx[1]]*blend[1] +
-                        Hrtf->delays[lidx[2]]*blend[2] + Hrtf->delays[lidx[3]]*blend[3] +
-                        0.5f);
-    delays[1] = fastf2u(Hrtf->delays[ridx[0]]*blend[0] + Hrtf->delays[ridx[1]]*blend[1] +
-                        Hrtf->delays[ridx[2]]*blend[2] + Hrtf->delays[ridx[3]]*blend[3] +
-                        0.5f);
+    delays[0] = fastf2u((Hrtf->delays[lidx[0]]*blend[0] + Hrtf->delays[lidx[1]]*blend[1] +
+                         Hrtf->delays[lidx[2]]*blend[2] + Hrtf->delays[lidx[3]]*blend[3]) *
+                        dirfact + 0.5f) << HRTFDELAY_BITS;
+    delays[1] = fastf2u((Hrtf->delays[ridx[0]]*blend[0] + Hrtf->delays[ridx[1]]*blend[1] +
+                         Hrtf->delays[ridx[2]]*blend[2] + Hrtf->delays[ridx[3]]*blend[3]) *
+                        dirfact + 0.5f) << HRTFDELAY_BITS;
 
     /* Calculate the sample offsets for the HRIR indices. */
     lidx[0] *= Hrtf->irSize;
@@ -138,16 +175,173 @@ void GetLerpedHrtfCoeffs(const struct Hrtf *Hrtf, ALfloat elevation, ALfloat azi
     ridx[2] *= Hrtf->irSize;
     ridx[3] *= Hrtf->irSize;
 
-    for(i = 0;i < Hrtf->irSize;i++)
+    /* Calculate the normalized and attenuated HRIR coefficients using linear
+     * interpolation when there is enough gain to warrant it.  Zero the
+     * coefficients if gain is too low.
+     */
+    if(gain > 0.0001f)
     {
         ALfloat c;
+
+        i = 0;
         c = (Hrtf->coeffs[lidx[0]+i]*blend[0] + Hrtf->coeffs[lidx[1]+i]*blend[1] +
              Hrtf->coeffs[lidx[2]+i]*blend[2] + Hrtf->coeffs[lidx[3]+i]*blend[3]);
-        coeffs[i][0] = c * (1.0f/32767.0f);
+        coeffs[i][0] = lerp(PassthruCoeff, c, dirfact) * gain * (1.0f/32767.0f);
         c = (Hrtf->coeffs[ridx[0]+i]*blend[0] + Hrtf->coeffs[ridx[1]+i]*blend[1] +
              Hrtf->coeffs[ridx[2]+i]*blend[2] + Hrtf->coeffs[ridx[3]+i]*blend[3]);
-        coeffs[i][1] = c * (1.0f/32767.0f);
+        coeffs[i][1] = lerp(PassthruCoeff, c, dirfact) * gain * (1.0f/32767.0f);
+
+        for(i = 1;i < Hrtf->irSize;i++)
+        {
+            c = (Hrtf->coeffs[lidx[0]+i]*blend[0] + Hrtf->coeffs[lidx[1]+i]*blend[1] +
+                 Hrtf->coeffs[lidx[2]+i]*blend[2] + Hrtf->coeffs[lidx[3]+i]*blend[3]);
+            coeffs[i][0] = lerp(0.0f, c, dirfact) * gain * (1.0f/32767.0f);
+            c = (Hrtf->coeffs[ridx[0]+i]*blend[0] + Hrtf->coeffs[ridx[1]+i]*blend[1] +
+                 Hrtf->coeffs[ridx[2]+i]*blend[2] + Hrtf->coeffs[ridx[3]+i]*blend[3]);
+            coeffs[i][1] = lerp(0.0f, c, dirfact) * gain * (1.0f/32767.0f);
+        }
     }
+    else
+    {
+        for(i = 0;i < Hrtf->irSize;i++)
+        {
+            coeffs[i][0] = 0.0f;
+            coeffs[i][1] = 0.0f;
+        }
+    }
+}
+
+/* Calculates the moving HRIR target coefficients, target delays, and
+ * stepping values for the given polar elevation and azimuth in radians.
+ * Linear interpolation is used to increase the apparent resolution of the
+ * HRIR data set.  The coefficients are also normalized and attenuated by the
+ * specified gain.  Stepping resolution and count are determined using the
+ * given delta factor between 0.0 and 1.0.
+ */
+ALuint GetMovingHrtfCoeffs(const struct Hrtf *Hrtf, ALfloat elevation, ALfloat azimuth, ALfloat dirfact, ALfloat gain, ALfloat delta, ALint counter, ALfloat (*coeffs)[2], ALuint *delays, ALfloat (*coeffStep)[2], ALint *delayStep)
+{
+    ALuint evidx[2], lidx[4], ridx[4];
+    ALfloat mu[3], blend[4];
+    ALfloat left, right;
+    ALfloat step;
+    ALuint i;
+
+    /* Calculate elevation indices and interpolation factor. */
+    CalcEvIndices(Hrtf->evCount, elevation, evidx, &mu[2]);
+
+    for(i = 0;i < 2;i++)
+    {
+        ALuint azcount = Hrtf->azCount[evidx[i]];
+        ALuint evoffset = Hrtf->evOffset[evidx[i]];
+        ALuint azidx[2];
+
+        /* Calculate azimuth indices and interpolation factor for this elevation. */
+        CalcAzIndices(azcount, azimuth, azidx, &mu[i]);
+
+        /* Calculate a set of linear HRIR indices for left and right channels. */
+        lidx[i*2 + 0] = evoffset + azidx[0];
+        lidx[i*2 + 1] = evoffset + azidx[1];
+        ridx[i*2 + 0] = evoffset + ((azcount-azidx[0]) % azcount);
+        ridx[i*2 + 1] = evoffset + ((azcount-azidx[1]) % azcount);
+    }
+
+    // Calculate the stepping parameters.
+    delta = maxf(floorf(delta*(Hrtf->sampleRate*0.015f) + 0.5f), 1.0f);
+    step = 1.0f / delta;
+
+    /* Calculate 4 blending weights for 2D bilinear interpolation. */
+    blend[0] = (1.0f-mu[0]) * (1.0f-mu[2]);
+    blend[1] = (     mu[0]) * (1.0f-mu[2]);
+    blend[2] = (1.0f-mu[1]) * (     mu[2]);
+    blend[3] = (     mu[1]) * (     mu[2]);
+
+    /* Calculate the HRIR delays using linear interpolation.  Then calculate
+     * the delay stepping values using the target and previous running
+     * delays.
+     */
+    left = (ALfloat)(delays[0] - (delayStep[0] * counter));
+    right = (ALfloat)(delays[1] - (delayStep[1] * counter));
+
+    delays[0] = fastf2u((Hrtf->delays[lidx[0]]*blend[0] + Hrtf->delays[lidx[1]]*blend[1] +
+                         Hrtf->delays[lidx[2]]*blend[2] + Hrtf->delays[lidx[3]]*blend[3]) *
+                        dirfact + 0.5f) << HRTFDELAY_BITS;
+    delays[1] = fastf2u((Hrtf->delays[ridx[0]]*blend[0] + Hrtf->delays[ridx[1]]*blend[1] +
+                         Hrtf->delays[ridx[2]]*blend[2] + Hrtf->delays[ridx[3]]*blend[3]) *
+                        dirfact + 0.5f) << HRTFDELAY_BITS;
+
+    delayStep[0] = fastf2i(step * (delays[0] - left));
+    delayStep[1] = fastf2i(step * (delays[1] - right));
+
+    /* Calculate the sample offsets for the HRIR indices. */
+    lidx[0] *= Hrtf->irSize;
+    lidx[1] *= Hrtf->irSize;
+    lidx[2] *= Hrtf->irSize;
+    lidx[3] *= Hrtf->irSize;
+    ridx[0] *= Hrtf->irSize;
+    ridx[1] *= Hrtf->irSize;
+    ridx[2] *= Hrtf->irSize;
+    ridx[3] *= Hrtf->irSize;
+
+    /* Calculate the normalized and attenuated target HRIR coefficients using
+     * linear interpolation when there is enough gain to warrant it.  Zero
+     * the target coefficients if gain is too low.  Then calculate the
+     * coefficient stepping values using the target and previous running
+     * coefficients.
+     */
+    if(gain > 0.0001f)
+    {
+        ALfloat c;
+
+        i = 0;
+        left = coeffs[i][0] - (coeffStep[i][0] * counter);
+        right = coeffs[i][1] - (coeffStep[i][1] * counter);
+
+        c = (Hrtf->coeffs[lidx[0]+i]*blend[0] + Hrtf->coeffs[lidx[1]+i]*blend[1] +
+             Hrtf->coeffs[lidx[2]+i]*blend[2] + Hrtf->coeffs[lidx[3]+i]*blend[3]);
+        coeffs[i][0] = lerp(PassthruCoeff, c, dirfact) * gain * (1.0f/32767.0f);;
+        c = (Hrtf->coeffs[ridx[0]+i]*blend[0] + Hrtf->coeffs[ridx[1]+i]*blend[1] +
+             Hrtf->coeffs[ridx[2]+i]*blend[2] + Hrtf->coeffs[ridx[3]+i]*blend[3]);
+        coeffs[i][1] = lerp(PassthruCoeff, c, dirfact) * gain * (1.0f/32767.0f);;
+
+        coeffStep[i][0] = step * (coeffs[i][0] - left);
+        coeffStep[i][1] = step * (coeffs[i][1] - right);
+
+        for(i = 1;i < Hrtf->irSize;i++)
+        {
+            left = coeffs[i][0] - (coeffStep[i][0] * counter);
+            right = coeffs[i][1] - (coeffStep[i][1] * counter);
+
+            c = (Hrtf->coeffs[lidx[0]+i]*blend[0] + Hrtf->coeffs[lidx[1]+i]*blend[1] +
+                 Hrtf->coeffs[lidx[2]+i]*blend[2] + Hrtf->coeffs[lidx[3]+i]*blend[3]);
+            coeffs[i][0] = lerp(0.0f, c, dirfact) * gain * (1.0f/32767.0f);;
+            c = (Hrtf->coeffs[ridx[0]+i]*blend[0] + Hrtf->coeffs[ridx[1]+i]*blend[1] +
+                 Hrtf->coeffs[ridx[2]+i]*blend[2] + Hrtf->coeffs[ridx[3]+i]*blend[3]);
+            coeffs[i][1] = lerp(0.0f, c, dirfact) * gain * (1.0f/32767.0f);;
+
+            coeffStep[i][0] = step * (coeffs[i][0] - left);
+            coeffStep[i][1] = step * (coeffs[i][1] - right);
+        }
+    }
+    else
+    {
+        for(i = 0;i < Hrtf->irSize;i++)
+        {
+            left = coeffs[i][0] - (coeffStep[i][0] * counter);
+            right = coeffs[i][1] - (coeffStep[i][1] * counter);
+
+            coeffs[i][0] = 0.0f;
+            coeffs[i][1] = 0.0f;
+
+            coeffStep[i][0] = step * -left;
+            coeffStep[i][1] = step * -right;
+        }
+    }
+
+    /* The stepping count is the number of samples necessary for the HRIR to
+     * complete its transition.  The mixer will only apply stepping for this
+     * many samples.
+     */
+    return fastf2u(delta);
 }
 
 
diff --git a/Alc/hrtf.h b/Alc/hrtf.h
index 48636344..938bf552 100644
--- a/Alc/hrtf.h
+++ b/Alc/hrtf.h
@@ -21,6 +21,8 @@ ALCboolean FindHrtfFormat(enum DevFmtChannels *chans, ALCuint *srate);
 void FreeHrtfs(void);
 
 ALuint GetHrtfIrSize(const struct Hrtf *Hrtf);
-void GetLerpedHrtfCoeffs(const struct Hrtf *Hrtf, ALfloat elevation, ALfloat azimuth, ALfloat (*coeffs)[2], ALuint *delays);
+ALfloat CalcHrtfDelta(ALfloat oldGain, ALfloat newGain, const ALfloat olddir[3], const ALfloat newdir[3]);
+void GetLerpedHrtfCoeffs(const struct Hrtf *Hrtf, ALfloat elevation, ALfloat azimuth, ALfloat dirfact, ALfloat gain, ALfloat (*coeffs)[2], ALuint *delays);
+ALuint GetMovingHrtfCoeffs(const struct Hrtf *Hrtf, ALfloat elevation, ALfloat azimuth, ALfloat dirfact, ALfloat gain, ALfloat delta, ALint counter, ALfloat (*coeffs)[2], ALuint *delays, ALfloat (*coeffStep)[2], ALint *delayStep);
 
 #endif /* ALC_HRTF_H */
diff --git a/Alc/mixer.c b/Alc/mixer.c
index 4a98ee8f..3f80434e 100644
--- a/Alc/mixer.c
+++ b/Alc/mixer.c
@@ -41,6 +41,20 @@
 extern inline void InitiatePositionArrays(ALuint frac, ALuint increment, ALuint *frac_arr, ALuint *pos_arr, ALuint size);
 
 
+static inline HrtfMixerFunc SelectHrtfMixer(void)
+{
+#ifdef HAVE_SSE
+    if((CPUCapFlags&CPU_CAP_SSE))
+        return MixHrtf_SSE;
+#endif
+#ifdef HAVE_NEON
+    if((CPUCapFlags&CPU_CAP_NEON))
+        return MixHrtf_Neon;
+#endif
+
+    return MixHrtf_C;
+}
+
 static inline MixerFunc SelectMixer(void)
 {
 #ifdef HAVE_SSE
@@ -165,6 +179,7 @@ static const ALfloat *DoFilters(ALfilterState *lpfilter, ALfilterState *hpfilter
 ALvoid MixSource(ALvoice *voice, ALsource *Source, ALCdevice *Device, ALuint SamplesToDo)
 {
     MixerFunc Mix;
+    HrtfMixerFunc HrtfMix;
     ResamplerFunc Resample;
     ALbufferlistitem *BufferListItem;
     ALuint DataPosInt, DataPosFrac;
@@ -203,6 +218,7 @@ ALvoid MixSource(ALvoice *voice, ALsource *Source, ALCdevice *Device, ALuint Sam
     }
 
     Mix = SelectMixer();
+    HrtfMix = SelectHrtfMixer();
     Resample = ((increment == FRACTIONONE && DataPosFrac == 0) ?
                 Resample_copy32_C : SelectResampler(Resampler));
 
@@ -415,8 +431,13 @@ ALvoid MixSource(ALvoice *voice, ALsource *Source, ALCdevice *Device, ALuint Sam
                     Device->FilteredData, ResampledData, DstBufferSize,
                     parms->Filters[chan].ActiveType
                 );
-                Mix(samples, parms->OutChannels, parms->OutBuffer, parms->Gains[chan],
-                    parms->Counter, OutPos, DstBufferSize);
+                if(!voice->IsHrtf)
+                    Mix(samples, parms->OutChannels, parms->OutBuffer, parms->Gains[chan],
+                        parms->Counter, OutPos, DstBufferSize);
+                else
+                    HrtfMix(parms->OutBuffer, samples, parms->Counter, voice->Offset,
+                            OutPos, parms->Hrtf.IrSize, &parms->Hrtf.Params[chan],
+                            &parms->Hrtf.State[chan], DstBufferSize);
             }
 
             /* Only the first channel for B-Format buffers (W channel) goes to
diff --git a/Alc/mixer_c.c b/Alc/mixer_c.c
index caedd339..0fdcc087 100644
--- a/Alc/mixer_c.c
+++ b/Alc/mixer_c.c
@@ -59,6 +59,23 @@ void ALfilterState_processC(ALfilterState *filter, ALfloat *restrict dst, const
 }
 
 
+static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
+                                   const ALuint IrSize,
+                                   ALfloat (*restrict Coeffs)[2],
+                                   const ALfloat (*restrict CoeffStep)[2],
+                                   ALfloat left, ALfloat right)
+{
+    ALuint c;
+    for(c = 0;c < IrSize;c++)
+    {
+        const ALuint off = (Offset+c)&HRIR_MASK;
+        Values[off][0] += Coeffs[c][0] * left;
+        Values[off][1] += Coeffs[c][1] * right;
+        Coeffs[c][0] += CoeffStep[c][0];
+        Coeffs[c][1] += CoeffStep[c][1];
+    }
+}
+
 static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
                                const ALuint IrSize,
                                ALfloat (*restrict Coeffs)[2],
diff --git a/Alc/mixer_defs.h b/Alc/mixer_defs.h
index 62dad9dc..c1500ed2 100644
--- a/Alc/mixer_defs.h
+++ b/Alc/mixer_defs.h
@@ -20,15 +20,17 @@ const ALfloat *Resample_cubic32_C(const ALfloat *src, ALuint frac, ALuint increm
 
 /* C mixers */
 void MixHrtf_C(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
-               ALuint Offset, const ALuint IrSize, const struct HrtfParams *hrtfparams,
-               struct HrtfState *hrtfstate, ALuint BufferSize);
+               ALuint Counter, ALuint Offset, ALuint OutPos, const ALuint IrSize,
+               const struct HrtfParams *hrtfparams, struct HrtfState *hrtfstate,
+               ALuint BufferSize);
 void Mix_C(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
                  struct MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize);
 
 /* SSE mixers */
 void MixHrtf_SSE(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
-                 ALuint Offset, const ALuint IrSize, const struct HrtfParams *hrtfparams,
-                 struct HrtfState *hrtfstate, ALuint BufferSize);
+                 ALuint Counter, ALuint Offset, ALuint OutPos, const ALuint IrSize,
+                 const struct HrtfParams *hrtfparams, struct HrtfState *hrtfstate,
+                 ALuint BufferSize);
 void Mix_SSE(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
              struct MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize);
 
@@ -54,8 +56,9 @@ const ALfloat *Resample_lerp32_SSE41(const ALfloat *src, ALuint frac, ALuint inc
 
 /* Neon mixers */
 void MixHrtf_Neon(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
-                  ALuint Offset, const ALuint IrSize, const struct HrtfParams *hrtfparams,
-                  struct HrtfState *hrtfstate, ALuint BufferSize);
+                  ALuint Counter, ALuint Offset, ALuint OutPos, const ALuint IrSize,
+                  const struct HrtfParams *hrtfparams, struct HrtfState *hrtfstate,
+                  ALuint BufferSize);
 void Mix_Neon(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE],
               struct MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize);
 
diff --git a/Alc/mixer_inc.c b/Alc/mixer_inc.c
index 46ccec7d..b4635b43 100644
--- a/Alc/mixer_inc.c
+++ b/Alc/mixer_inc.c
@@ -14,6 +14,11 @@
 #define MixHrtf MERGE(MixHrtf_,SUFFIX)
 
 
+static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
+                                   const ALuint irSize,
+                                   ALfloat (*restrict Coeffs)[2],
+                                   const ALfloat (*restrict CoeffStep)[2],
+                                   ALfloat left, ALfloat right);
 static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
                                const ALuint irSize,
                                ALfloat (*restrict Coeffs)[2],
@@ -21,7 +26,7 @@ static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
 
 
 void MixHrtf(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
-             ALuint Offset, const ALuint IrSize,
+             ALuint Counter, ALuint Offset, ALuint OutPos, const ALuint IrSize,
              const HrtfParams *hrtfparams, HrtfState *hrtfstate, ALuint BufferSize)
 {
     alignas(16) ALfloat Coeffs[HRIR_LENGTH][2];
@@ -32,13 +37,39 @@ void MixHrtf(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
 
     for(c = 0;c < IrSize;c++)
     {
-        Coeffs[c][0] = hrtfparams->Coeffs[c][0];
-        Coeffs[c][1] = hrtfparams->Coeffs[c][1];
+        Coeffs[c][0] = hrtfparams->Coeffs[c][0] - (hrtfparams->CoeffStep[c][0]*Counter);
+        Coeffs[c][1] = hrtfparams->Coeffs[c][1] - (hrtfparams->CoeffStep[c][1]*Counter);
     }
-    Delay[0] = hrtfparams->Delay[0];
-    Delay[1] = hrtfparams->Delay[1];
+    Delay[0] = hrtfparams->Delay[0] - (hrtfparams->DelayStep[0]*Counter);
+    Delay[1] = hrtfparams->Delay[1] - (hrtfparams->DelayStep[1]*Counter);
 
-    for(pos = 0;pos < BufferSize;pos++)
+    pos = 0;
+    for(;pos < BufferSize && pos < Counter;pos++)
+    {
+        hrtfstate->History[Offset&HRTF_HISTORY_MASK] = data[pos];
+        left  = lerp(hrtfstate->History[(Offset-(Delay[0]>>HRTFDELAY_BITS))&HRTF_HISTORY_MASK],
+                     hrtfstate->History[(Offset-(Delay[0]>>HRTFDELAY_BITS)-1)&HRTF_HISTORY_MASK],
+                     (Delay[0]&HRTFDELAY_MASK)*(1.0f/HRTFDELAY_FRACONE));
+        right = lerp(hrtfstate->History[(Offset-(Delay[1]>>HRTFDELAY_BITS))&HRTF_HISTORY_MASK],
+                     hrtfstate->History[(Offset-(Delay[1]>>HRTFDELAY_BITS)-1)&HRTF_HISTORY_MASK],
+                     (Delay[1]&HRTFDELAY_MASK)*(1.0f/HRTFDELAY_FRACONE));
+
+        Delay[0] += hrtfparams->DelayStep[0];
+        Delay[1] += hrtfparams->DelayStep[1];
+
+        hrtfstate->Values[(Offset+IrSize)&HRIR_MASK][0] = 0.0f;
+        hrtfstate->Values[(Offset+IrSize)&HRIR_MASK][1] = 0.0f;
+        Offset++;
+
+        ApplyCoeffsStep(Offset, hrtfstate->Values, IrSize, Coeffs, hrtfparams->CoeffStep, left, right);
+        OutBuffer[0][OutPos] += hrtfstate->Values[Offset&HRIR_MASK][0];
+        OutBuffer[1][OutPos] += hrtfstate->Values[Offset&HRIR_MASK][1];
+        OutPos++;
+    }
+
+    Delay[0] >>= HRTFDELAY_BITS;
+    Delay[1] >>= HRTFDELAY_BITS;
+    for(;pos < BufferSize;pos++)
     {
         hrtfstate->History[Offset&HRTF_HISTORY_MASK] = data[pos];
         left = hrtfstate->History[(Offset-Delay[0])&HRTF_HISTORY_MASK];
@@ -49,8 +80,9 @@ void MixHrtf(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
         Offset++;
 
         ApplyCoeffs(Offset, hrtfstate->Values, IrSize, Coeffs, left, right);
-        OutBuffer[0][pos] += hrtfstate->Values[Offset&HRIR_MASK][0];
-        OutBuffer[1][pos] += hrtfstate->Values[Offset&HRIR_MASK][1];
+        OutBuffer[0][OutPos] += hrtfstate->Values[Offset&HRIR_MASK][0];
+        OutBuffer[1][OutPos] += hrtfstate->Values[Offset&HRIR_MASK][1];
+        OutPos++;
     }
 }
 
diff --git a/Alc/mixer_neon.c b/Alc/mixer_neon.c
index 413bd627..8ce17644 100644
--- a/Alc/mixer_neon.c
+++ b/Alc/mixer_neon.c
@@ -9,6 +9,38 @@
 #include "hrtf.h"
 
 
+static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
+                                   const ALuint IrSize,
+                                   ALfloat (*restrict Coeffs)[2],
+                                   const ALfloat (*restrict CoeffStep)[2],
+                                   ALfloat left, ALfloat right)
+{
+    ALuint c;
+    float32x4_t leftright4;
+    {
+        float32x2_t leftright2 = vdup_n_f32(0.0);
+        leftright2 = vset_lane_f32(left, leftright2, 0);
+        leftright2 = vset_lane_f32(right, leftright2, 1);
+        leftright4 = vcombine_f32(leftright2, leftright2);
+    }
+    for(c = 0;c < IrSize;c += 2)
+    {
+        const ALuint o0 = (Offset+c)&HRIR_MASK;
+        const ALuint o1 = (o0+1)&HRIR_MASK;
+        float32x4_t vals = vcombine_f32(vld1_f32((float32_t*)&Values[o0][0]),
+                                        vld1_f32((float32_t*)&Values[o1][0]));
+        float32x4_t coefs = vld1q_f32((float32_t*)&Coeffs[c][0]);
+        float32x4_t deltas = vld1q_f32(&CoeffStep[c][0]);
+
+        vals = vmlaq_f32(vals, coefs, leftright4);
+        coefs = vaddq_f32(coefs, deltas);
+
+        vst1_f32((float32_t*)&Values[o0][0], vget_low_f32(vals));
+        vst1_f32((float32_t*)&Values[o1][0], vget_high_f32(vals));
+        vst1q_f32(&Coeffs[c][0], coefs);
+    }
+}
+
 static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
                                const ALuint IrSize,
                                ALfloat (*restrict Coeffs)[2],
diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c
index d0dca40e..d86cf749 100644
--- a/Alc/mixer_sse.c
+++ b/Alc/mixer_sse.c
@@ -19,6 +19,68 @@
 #include "mixer_defs.h"
 
 
+static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
+                                   const ALuint IrSize,
+                                   ALfloat (*restrict Coeffs)[2],
+                                   const ALfloat (*restrict CoeffStep)[2],
+                                   ALfloat left, ALfloat right)
+{
+    const __m128 lrlr = _mm_setr_ps(left, right, left, right);
+    __m128 coeffs, deltas, imp0, imp1;
+    __m128 vals = _mm_setzero_ps();
+    ALuint i;
+
+    if((Offset&1))
+    {
+        const ALuint o0 = Offset&HRIR_MASK;
+        const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
+
+        coeffs = _mm_load_ps(&Coeffs[0][0]);
+        deltas = _mm_load_ps(&CoeffStep[0][0]);
+        vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
+        imp0 = _mm_mul_ps(lrlr, coeffs);
+        coeffs = _mm_add_ps(coeffs, deltas);
+        vals = _mm_add_ps(imp0, vals);
+        _mm_store_ps(&Coeffs[0][0], coeffs);
+        _mm_storel_pi((__m64*)&Values[o0][0], vals);
+        for(i = 1;i < IrSize-1;i += 2)
+        {
+            const ALuint o2 = (Offset+i)&HRIR_MASK;
+
+            coeffs = _mm_load_ps(&Coeffs[i+1][0]);
+            deltas = _mm_load_ps(&CoeffStep[i+1][0]);
+            vals = _mm_load_ps(&Values[o2][0]);
+            imp1 = _mm_mul_ps(lrlr, coeffs);
+            coeffs = _mm_add_ps(coeffs, deltas);
+            imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
+            vals = _mm_add_ps(imp0, vals);
+            _mm_store_ps(&Coeffs[i+1][0], coeffs);
+            _mm_store_ps(&Values[o2][0], vals);
+            imp0 = imp1;
+        }
+        vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
+        imp0 = _mm_movehl_ps(imp0, imp0);
+        vals = _mm_add_ps(imp0, vals);
+        _mm_storel_pi((__m64*)&Values[o1][0], vals);
+    }
+    else
+    {
+        for(i = 0;i < IrSize;i += 2)
+        {
+            const ALuint o = (Offset + i)&HRIR_MASK;
+
+            coeffs = _mm_load_ps(&Coeffs[i][0]);
+            deltas = _mm_load_ps(&CoeffStep[i][0]);
+            vals = _mm_load_ps(&Values[o][0]);
+            imp0 = _mm_mul_ps(lrlr, coeffs);
+            coeffs = _mm_add_ps(coeffs, deltas);
+            vals = _mm_add_ps(imp0, vals);
+            _mm_store_ps(&Coeffs[i][0], coeffs);
+            _mm_store_ps(&Values[o][0], vals);
+        }
+    }
+}
+
 static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
                                const ALuint IrSize,
                                ALfloat (*restrict Coeffs)[2],
diff --git a/Alc/panning.c b/Alc/panning.c
index 64be6f4b..81398b1b 100644
--- a/Alc/panning.c
+++ b/Alc/panning.c
@@ -387,8 +387,10 @@ ALvoid aluInitPanning(ALCdevice *device)
             device->ChannelName[i] = VirtualChans[i].channel;
         SetChannelMap(device, chanmap, count);
         for(i = 0;i < count;i++)
-            GetLerpedHrtfCoeffs(device->Hrtf, VirtualChans[i].elevation, VirtualChans[i].angle,
-                                device->Hrtf_Params[i].Coeffs, device->Hrtf_Params[i].Delay);
+            GetLerpedHrtfCoeffs(
+                device->Hrtf, VirtualChans[i].elevation, VirtualChans[i].angle, 1.0f, 1.0f,
+                device->Hrtf_Params[i].Coeffs, device->Hrtf_Params[i].Delay
+            );
 
         return;
     }
diff --git a/OpenAL32/Include/alMain.h b/OpenAL32/Include/alMain.h
index 56df4db3..1afeb1e6 100644
--- a/OpenAL32/Include/alMain.h
+++ b/OpenAL32/Include/alMain.h
@@ -617,7 +617,9 @@ typedef struct HrtfState {
 
 typedef struct HrtfParams {
     alignas(16) ALfloat Coeffs[HRIR_LENGTH][2];
+    alignas(16) ALfloat CoeffStep[HRIR_LENGTH][2]; /* Same layout as Coeffs. NOTE(review): looks like the delta the stepping mixers add to Coeffs -- confirm. */
     ALuint Delay[2];
+    ALint DelayStep[2]; /* Signed, unlike Delay. NOTE(review): presumably the change applied to Delay per step -- confirm. */
 } HrtfParams;
 
 
diff --git a/OpenAL32/Include/alu.h b/OpenAL32/Include/alu.h
index 56c37fe8..0462fda8 100644
--- a/OpenAL32/Include/alu.h
+++ b/OpenAL32/Include/alu.h
@@ -71,6 +71,13 @@ typedef struct DirectParams {
         ALfilterState HighPass;
     } Filters[MAX_INPUT_CHANNELS];
 
+    struct { /* HRTF data kept alongside the other direct-path parameters */
+        HrtfParams Params[MAX_INPUT_CHANNELS]; /* one parameter set per input channel */
+        HrtfState State[MAX_INPUT_CHANNELS]; /* one filter state per input channel */
+        ALuint IrSize; /* NOTE(review): presumably the impulse response length in use -- confirm */
+        ALfloat Gain; /* NOTE(review): presumably the gain last used to build Params -- confirm */
+        ALfloat Dir[3]; /* NOTE(review): presumably the source direction last used to build Params -- confirm */
+    } Hrtf;
     MixGains Gains[MAX_INPUT_CHANNELS][MAX_OUTPUT_CHANNELS];
 } DirectParams;
 
@@ -99,7 +106,8 @@ typedef void (*MixerFunc)(const ALfloat *data, ALuint OutChans,
                           ALfloat (*restrict OutBuffer)[BUFFERSIZE], struct MixGains *Gains,
                           ALuint Counter, ALuint OutPos, ALuint BufferSize);
 typedef void (*HrtfMixerFunc)(ALfloat (*restrict OutBuffer)[BUFFERSIZE], const ALfloat *data,
-                              ALuint Offset, const ALuint IrSize, const HrtfParams *hrtfparams,
+                              ALuint Counter, ALuint Offset, ALuint OutPos, /* Counter/OutPos share names with MixerFunc's parameters; NOTE(review): presumably same meaning -- confirm */
+                              const ALuint IrSize, const HrtfParams *hrtfparams,
                               HrtfState *hrtfstate, ALuint BufferSize);
 
 
diff --git a/OpenAL32/alSource.c b/OpenAL32/alSource.c
index 12bd9436..be3768f3 100644
--- a/OpenAL32/alSource.c
+++ b/OpenAL32/alSource.c
@@ -2599,6 +2599,17 @@ ALvoid SetSourceState(ALsource *Source, ALCcontext *Context, ALenum state)
 
         voice->Direct.Moving  = AL_FALSE;
         voice->Direct.Counter = 0;
+        for(i = 0;i < MAX_INPUT_CHANNELS;i++)
+        {
+            ALsizei j;
+            for(j = 0;j < HRTF_HISTORY_LENGTH;j++)
+                voice->Direct.Hrtf.State[i].History[j] = 0.0f;
+            for(j = 0;j < HRIR_LENGTH;j++)
+            {
+                voice->Direct.Hrtf.State[i].Values[j][0] = 0.0f;
+                voice->Direct.Hrtf.State[i].Values[j][1] = 0.0f;
+            }
+        }
         for(i = 0;i < (ALsizei)device->NumAuxSends;i++)
         {
             voice->Send[i].Moving  = AL_FALSE;
-- 
2.11.4.GIT