From d236065b3690e8c7d9508a1483f3353d188d0786 Mon Sep 17 00:00:00 2001
From: jethead71 <jethead71@a1c6a512-1295-4272-9138-f99709370657>
Date: Sun, 2 May 2010 02:44:45 +0000
Subject: [PATCH] Do some SPC codec optimizing for ARMv6 (as a training
 exercise), tweak realtime BRR for all CPUs that use it, add Gaussian ASM
 optimization for all ARM targets that can use it. Add some LIKELY/UNLIKELY
 branch hints. On Gigabeat-S this gives a +22% speedup; on Gigabeat F, about
 +5%. On less-powerful players, no real change aside possibly from the branch
 hints.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@25771 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/libspc/spc_codec.h |  32 ++-
 apps/codecs/libspc/spc_dsp.c   | 510 ++++++++++++++++++++++++++++++++++-------
 2 files changed, 452 insertions(+), 90 deletions(-)

diff --git a/apps/codecs/libspc/spc_codec.h b/apps/codecs/libspc/spc_codec.h
index cf72f90af..95d09fa09 100644
--- a/apps/codecs/libspc/spc_codec.h
+++ b/apps/codecs/libspc/spc_codec.h
@@ -37,6 +37,10 @@
 
 /** Basic configuration options **/
 
+#ifndef ARM_ARCH
+#define ARM_ARCH 0
+#endif
+
 #define SPC_DUAL_CORE 1
 
 #if !defined(SPC_DUAL_CORE) || NUM_CORES == 1
@@ -293,6 +297,15 @@ enum
     FIR_BUF_MASK  = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) - 1))
 };
 #elif defined (CPU_ARM)
+#if ARM_ARCH >= 6
+enum
+{
+    FIR_BUF_CNT   = FIR_BUF_HALF * 2,
+    FIR_BUF_SIZE  = FIR_BUF_CNT * sizeof ( int32_t ),
+    FIR_BUF_ALIGN = FIR_BUF_SIZE,
+    FIR_BUF_MASK  = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) - 1))
+};
+#else
 enum
 {
     FIR_BUF_CNT   = FIR_BUF_HALF * 2 * 2,
@@ -300,6 +313,7 @@ enum
     FIR_BUF_ALIGN = FIR_BUF_SIZE,
     FIR_BUF_MASK  = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) * 2 - 1))
 };
+#endif /* ARM_ARCH */
 #endif /* CPU_* */
 
 struct Spc_Dsp
@@ -318,7 +332,8 @@ struct Spc_Dsp
     uint16_t noise; /* also read as int16_t */
     
 #if defined(CPU_COLDFIRE)
-    /* circularly hardware masked address */
+    /* FIR history is interleaved. Hardware handles wrapping by mask.
+     * |LR|LR|LR|LR|LR|LR|LR|LR| */
     int32_t *fir_ptr;
     /* wrapped address just behind current position -
        allows mac.w to increment and mask fir_ptr */
@@ -328,9 +343,22 @@ struct Spc_Dsp
 #elif defined (CPU_ARM)
    /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */
     int32_t *fir_ptr;
+#if ARM_ARCH >= 6
+    /* FIR history is interleaved with guard to eliminate wrap checking
+     * when convolving.
+     * |LR|LR|LR|LR|LR|LR|LR|LR|--|--|--|--|--|--|--|--| */
+    /* copy of echo FIR constants as int16_t, loaded as int32 for
+     * packed halfword multiplies */
+    int16_t fir_coeff [VOICE_COUNT];
+#else
+    /* FIR history is interleaved with guard to eliminate wrap checking
+     * when convolving.
+     * |LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|LL|RR|...
+     * |--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--| */
     /* copy of echo FIR constants as int32_t, for faster access */
     int32_t fir_coeff [VOICE_COUNT]; 
-#else
+#endif /* ARM_ARCH */
+#else /* Unoptimized CPU */
     /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */
     int fir_pos; /* (0 to 7) */
     int fir_buf [FIR_BUF_HALF * 2] [2];
diff --git a/apps/codecs/libspc/spc_dsp.c b/apps/codecs/libspc/spc_dsp.c
index 5ea651478..0d07e5f04 100644
--- a/apps/codecs/libspc/spc_dsp.c
+++ b/apps/codecs/libspc/spc_dsp.c
@@ -57,6 +57,16 @@ void DSP_write( struct Spc_Dsp* this, int i, int data )
     }
 }
 
+#if ARM_ARCH >= 6
+/* if ( n < -32768 ) out = -32768; */
+/* if ( n >  32767 ) out =  32767; */
+#define CLAMP16( n ) \
+    ({ \
+       asm ("ssat %0, #16, %1" \
+            : "=r" ( n ) : "r"( n ) ); \
+       n; \
+    })
+#else
 /* if ( n < -32768 ) out = -32768; */
 /* if ( n >  32767 ) out =  32767; */
 #define CLAMP16( n ) \
@@ -65,6 +75,7 @@ void DSP_write( struct Spc_Dsp* this, int i, int data )
         n = 0x7FFF ^ (n >> 31); \
     n;                          \
 })
+#endif
 
 #if SPC_BRRCACHE
 static void decode_brr( struct Spc_Dsp* this, unsigned start_addr,
@@ -418,7 +429,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
             /* Key on events are delayed */
             int key_on_delay = voice->key_on_delay;
 
-            if ( --key_on_delay >= 0 ) /* <1% of the time */
+            if ( UNLIKELY ( --key_on_delay >= 0 ) ) /* <1% of the time */
             {
                 key_on(this,voice,sd,raw_voice,key_on_delay,vbit);
             }
@@ -438,13 +449,13 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
                 int env_mode = voice->env_mode;
                 int adsr0 = raw_voice->adsr [0];
                 int env_timer;
-                if ( env_mode != state_release ) /* 99% of the time */
+                if ( LIKELY ( env_mode != state_release ) ) /* 99% of the time */
                 {
                     env_timer = voice->env_timer;
-                    if ( adsr0 & 0x80 ) /* 79% of the time */
+                    if ( LIKELY ( adsr0 & 0x80 ) ) /* 79% of the time */
                     {
                         int adsr1 = raw_voice->adsr [1];
-                        if ( env_mode == state_sustain ) /* 74% of the time */
+                        if ( LIKELY ( env_mode == state_sustain ) ) /* 74% of the time */
                         {
                             if ( (env_timer -= env_rates [adsr1 & 0x1F]) > 0 )
                                 goto write_env_timer;
@@ -607,25 +618,12 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
                         goto skip_decode;
                     }
                 }
-                
+
                 /* header */
                 int const block_header = *addr;
                 addr += 9;
                 voice->addr = addr;
                 voice->block_header = block_header;
-                int const filter = (block_header & 0x0C) - 0x08;
-                
-                /* scaling (invalid scaling gives -4096 for neg nybble,
-                   0 for pos) */
-                static unsigned char const right_shifts [16] = {
-                    5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  4,  4, 29, 29, 29,
-                };
-                static unsigned char const left_shifts  [16] = {
-                    0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11
-                };
-                int const scale = block_header >> 4;
-                int const right_shift = right_shifts [scale];
-                int const left_shift  = left_shifts  [scale];
                 
                 /* previous samples */
                 int smp2 = voice->samples [BRR_BLOCK_SIZE + 1];
@@ -650,54 +648,117 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
                     /* force sample to end on next decode */
                     voice->block_header = 1;
                 }
-                
-                do /* decode and filter 16 samples */
+
+                int const filter = block_header & 0x0c;
+                int const scale = block_header >> 4;
+
+                if ( filter == 0x08 ) /* filter 2 (30-90% of the time) */
                 {
-                    /* Get nybble, sign-extend, then scale
-                       get byte, select which nybble, sign-extend, then shift
-                       based on scaling. also handles invalid scaling values.*/
-                    int delta = (int) (int8_t) (addr [offset >> 3] <<
-                            (offset & 4)) >> right_shift << left_shift;
-                    
-                    out [offset >> 2] = smp2;
-                    
-                    if ( filter == 0 ) /* mode 0x08 (30-90% of the time) */
+                    /* y[n] = x[n] + 61/32 * y[n-1] - 15/16 * y[n-2] */
+                    do /* decode and filter 16 samples */
                     {
+                        /* Get nybble, sign-extend, then scale
+                           get byte, select which nybble, sign-extend, then shift
+                           based on scaling. */
+                        int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
+                        delta = (delta << scale) >> 1;
+
+                        if (scale > 0xc)
+                            delta = (delta >> 17) << 11;
+
+                        out [offset >> 2] = smp2;
+
                         delta -= smp2 >> 1;
                         delta += smp2 >> 5;
-                        smp2 = smp1;
                         delta += smp1;
                         delta += (-smp1 - (smp1 >> 1)) >> 5;
+
+                        delta = CLAMP16( delta );
+                        smp2 = smp1;
+                        smp1 = (int16_t) (delta * 2); /* sign-extend */
                     }
-                    else
+                    while ( (offset += 4) != 0 );
+                }
+                else if ( filter == 0x04 ) /* filter 1 */
+                {
+                    /* y[n] = x[n] + 15/16 * y[n-1] */
+                    do /* decode and filter 16 samples */
                     {
-                        if ( filter == -4 ) /* mode 0x04 */
-                        {
-                            delta += smp1 >> 1;
-                            delta += (-smp1) >> 5;
-                        }
-                        else if ( filter > -4 ) /* mode 0x0C */
-                        {
-                            delta -= smp2 >> 1;
-                            delta += (smp2 + (smp2 >> 1)) >> 4;
-                            delta += smp1;
-                            delta += (-smp1 * 13) >> 7;
-                        }
+                        /* Get nybble, sign-extend, then scale
+                           get byte, select which nybble, sign-extend, then shift
+                           based on scaling. */
+                        int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
+                        delta = (delta << scale) >> 1;
+
+                        if (scale > 0xc)
+                            delta = (delta >> 17) << 11;
+                    
+                        out [offset >> 2] = smp2;
+
+                        delta += smp1 >> 1;
+                        delta += (-smp1) >> 5;
+
+                        delta = CLAMP16( delta );
                         smp2 = smp1;
+                        smp1 = (int16_t) (delta * 2); /* sign-extend */
                     }
-                    
-                    delta = CLAMP16( delta );
-                    smp1 = (int16_t) (delta * 2); /* sign-extend */
+                    while ( (offset += 4) != 0 );
                 }
-                while ( (offset += 4) != 0 );
-                
+                else if ( filter == 0x0c ) /* filter 3 */
+                {
+                    /* y[n] = x[n] + 115/64 * y[n-1] - 13/16 * y[n-2] */
+                    do /* decode and filter 16 samples */
+                    {
+                        /* Get nybble, sign-extend, then scale
+                           get byte, select which nybble, sign-extend, then shift
+                           based on scaling. */
+                        int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
+                        delta = (delta << scale) >> 1;
+
+                        if (scale > 0xc)
+                            delta = (delta >> 17) << 11;
+
+                        out [offset >> 2] = smp2;
+
+                        delta -= smp2 >> 1;
+                        delta += (smp2 + (smp2 >> 1)) >> 4;
+                        delta += smp1;
+                        delta += (-smp1 * 13) >> 7;
+
+                        delta = CLAMP16( delta );
+                        smp2 = smp1;
+                        smp1 = (int16_t) (delta * 2); /* sign-extend */
+                    }
+                    while ( (offset += 4) != 0 );
+                }
+                else /* filter 0 */
+                {
+                    /* y[n] = x[n] */
+                    do /* decode and filter 16 samples */
+                    {
+                        /* Get nybble, sign-extend, then scale
+                           get byte, select which nybble, sign-extend, then shift
+                           based on scaling. */
+                        int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
+                        delta = (delta << scale) >> 1;
+
+                        if (scale > 0xc)
+                            delta = (delta >> 17) << 11;
+
+                        out [offset >> 2] = smp2;
+
+                        smp2 = smp1;
+                        smp1 = delta * 2;
+                    }
+                    while ( (offset += 4) != 0 );
+                }
+
                 out [0] = smp2;
                 out [1] = smp1;
                 
             skip_decode:;
             }
-            #endif
-
+        #endif /* !SPC_BRRCACHE */
             /* Get rate (with possible modulation) */
             int rate = VOICE_RATE(vr);
             if ( this->r.g.pitch_mods & vbit )
@@ -754,13 +815,87 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
             
             /* Use faster gaussian interpolation when exact result isn't needed
                by pitch modulator of next channel */
-            int amp_0, amp_1;
-            if ( !(slow_gaussian & vbit) ) /* 99% of the time */
+            int amp_0, amp_1; /* Also serve as temps _0, and _1 */
+            if ( LIKELY ( !(slow_gaussian & vbit) ) ) /* 99% of the time */
             {
                 /* Main optimization is lack of clamping. Not a problem since
                    output never goes more than +/- 16 outside 16-bit range and
                    things are clamped later anyway. Other optimization is to
                    preserve fractional accuracy, eliminating several masks. */
+            #if defined (CPU_ARM)
+                int output;
+                int _2, _3; /* All-purpose temps */
+                /* Multiple ASM blocks keep regs free and reduce result
+                 * latency issues. */
+            #if ARM_ARCH >= 6
+                /* Interpolate */
+                asm volatile (
+                "ldr     %[_0], [%[interp]]           \r\n" /* _0=i0i1            */
+                "ldr     %[_2], [%[fwd]]              \r\n" /* _2=f0f1            */
+                "ldr     %[_1], [%[interp], #4]       \r\n" /* _1=i2i3            */
+                "ldr     %[_3], [%[rev]]              \r\n" /* _3=r0r1            */
+                "smuad   %[out], %[_0], %[_2]         \r\n" /* out=f0*i0 + f1*i1  */
+                "smladx  %[out], %[_1], %[_3], %[out] \r\n" /* out+=r1*i2 + r0*i3 */
+                : [out]"=&r"(output),
+                  [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
+                  [_2]"=&r"(_2), [_3]"=&r"(_3)
+                : [fwd]"r"(fwd), [rev]"r"(rev),
+                  [interp]"r"(interp));
+                /* Apply voice envelope */
+                asm volatile (
+                "mov     %[_2], %[out], asr #(11-5)   \r\n" /* To do >> 16 later */
+                "mul     %[out], %[_2], %[envx]       \r\n" /* and avoid exp. shift */
+                : [out]"+r"(output), [_2]"=&r"(_2)
+                : [envx]"r"((int)voice->envx));
+                /* Apply left and right volume */
+                asm volatile (
+                "smulwb  %[amp_0], %[out], %[vvol_0]  \r\n" /* (32x16->48)[47:16]->[31:0] */
+                "smulwb  %[amp_1], %[out], %[vvol_1]  \r\n"
+                : [out]"+r"(output),
+                  [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)
+                : [vvol_0]"r"(voice->volume[0]),
+                  [vvol_1]"r"(voice->volume[1]));
+
+                raw_voice->outx = output >> (8+5); /* 'output' still 5 bits too big */
+            #else /* ARM_ARCH < 6 */
+                /* Perform gaussian interpolation on four samples */
+                asm volatile (
+                "ldrsh   %[_0], [%[interp]]           \r\n"
+                "ldrsh   %[_2], [%[fwd]]              \r\n"
+                "ldrsh   %[_1], [%[interp], #2]       \r\n"
+                "ldrsh   %[_3], [%[fwd], #2]          \r\n"
+                "mul     %[out], %[_0], %[_2]         \r\n" /* out= fwd[0]*interp[0] */
+                "ldrsh   %[_0], [%[interp], #4]       \r\n"
+                "ldrsh   %[_2], [%[rev], #2]          \r\n"
+                "mla     %[out], %[_1], %[_3], %[out] \r\n" /* out+=fwd[1]*interp[1] */
+                "ldrsh   %[_1], [%[interp], #6]       \r\n"
+                "ldrsh   %[_3], [%[rev]]              \r\n"
+                "mla     %[out], %[_0], %[_2], %[out] \r\n" /* out+=rev[1]*interp[2] */
+                "mla     %[out], %[_1], %[_3], %[out] \r\n" /* out+=rev[0]*interp[3] */
+                : [out]"=&r"(output),
+                  [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
+                  [_2]"=&r"(_2), [_3]"=&r"(_3)
+                : [fwd]"r"(fwd), [rev]"r"(rev),
+                  [interp]"r"(interp));
+                /* Apply voice envelope */
+                asm volatile (
+                "mov     %[_2], %[out], asr #11       \r\n"
+                "mul     %[out], %[_2], %[envx]       \r\n"
+                : [out]"+r"(output), [_2]"=&r"(_2)
+                : [envx]"r"((int)voice->envx));
+                /* Reduce and apply left and right volume */
+                asm volatile (
+                "mov    %[out], %[out], asr #11       \r\n"
+                "mul    %[amp_0], %[out], %[vvol_0]   \r\n"
+                "mul    %[amp_1], %[out], %[vvol_1]   \r\n"
+                : [out]"+r"(output),
+                  [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)
+                : [vvol_0]"r"((int)voice->volume[0]),
+                  [vvol_1]"r"((int)voice->volume[1]));
+
+                raw_voice->outx = output >> 8;
+            #endif /* ARM_ARCH */
+            #else /* Unoptimized CPU */
                 int output = (((fwd [0] * interp [0] +
                          fwd [1] * interp [1] +
                          rev [1] * interp [2] +
@@ -769,11 +904,121 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
                 /* duplicated here to give compiler more to run in parallel */
                 amp_0 = voice->volume [0] * output;
                 amp_1 = voice->volume [1] * output;
+
                 raw_voice->outx = output >> 8;
+            #endif /* CPU_* */
             }
-            else
+            else /* slow gaussian */
             {
+            #if defined(CPU_ARM)
+            #if ARM_ARCH >= 6
+                int output = *(int16_t*) &this->noise;
+
+                if ( !(this->r.g.noise_enables & vbit) )
+                {
+                    /* Interpolate */
+                    int _2, _3;
+                    asm volatile (
+                    /* NOTE: often-unaligned accesses */
+                    "ldr     %[_0], [%[interp]]            \r\n" /* _0=i0i1   */
+                    "ldr     %[_2], [%[fwd]]               \r\n" /* _2=f0f1   */
+                    "ldr     %[_1], [%[interp], #4]        \r\n" /* _1=i2i3   */
+                    "ldr     %[_3], [%[rev]]               \r\n" /* _3=r0r1   */
+                    "smulbb  %[out], %[_0], %[_2]          \r\n" /* out=f0*i0 */
+                    "smultt  %[_0],  %[_0], %[_2]          \r\n" /* _0=f1*i1  */
+                    "smulbt  %[_2],  %[_1], %[_3]          \r\n" /* _2=r1*i2  */
+                    "smultb  %[_3],  %[_1], %[_3]          \r\n" /* _3=r0*i3  */
+                    : [out]"=r"(output),
+                      [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
+                      [_2]"=&r"(_2), [_3]"=&r"(_3)
+                    : [fwd]"r"(fwd), [rev]"r"(rev),
+                      [interp]"r"(interp));
+                    asm volatile (
+                    "mov     %[out], %[out], asr#12        \r\n"
+                    "add     %[_0], %[out], %[_0], asr #12 \r\n"
+                    "add     %[_2], %[_0], %[_2], asr #12  \r\n"
+                    "pkhbt   %[_0], %[_2], %[_3], asl #4   \r\n" /* _3[31:16], _2[15:0] */
+                    "sadd16  %[_0], %[_0], %[_0]           \r\n" /* _3[31:16]*2, _2[15:0]*2 */
+                    "qsubaddx %[out], %[_0], %[_0]         \r\n" /* out[15:0]=
+                                                                  * sat16(_3[31:16]+_2[15:0]) */
+                    : [out]"+r"(output),
+                      [_0]"+r"(amp_0), [_2]"+r"(_2), [_3]"+r"(_3));
+                }
+                /* Apply voice envelope */
+                asm volatile (
+                "smulbb  %[out], %[out], %[envx]       \r\n"
+                : [out]"+r"(output)
+                : [envx]"r"(voice->envx));
+                /* Reduce and apply left and right volume */
+                asm volatile (
+                "mov     %[out], %[out], asr #11       \r\n"
+                "bic     %[out], %[out], #0x1          \r\n"
+                "mul     %[amp_0], %[out], %[vvol_0]   \r\n"
+                "mul     %[amp_1], %[out], %[vvol_1]   \r\n"
+                : [out]"+r"(output),
+                  [amp_1]"=r"(amp_1), [amp_0]"=r"(amp_0)
+                : [vvol_0]"r"((int)voice->volume[0]),
+                  [vvol_1]"r"((int)voice->volume[1]));
+
+                prev_outx = output;
+                raw_voice->outx = output >> 8;
+            #else /* ARM_ARCH < 6 */
+                int output = *(int16_t*) &this->noise;
+
+                if ( !(this->r.g.noise_enables & vbit) )
+                {
+                    /* Interpolate */
+                    int _2, _3;
+                    asm volatile (
+                    "ldrsh   %[_0], [%[interp]]            \r\n"
+                    "ldrsh   %[_2], [%[fwd]]               \r\n"
+                    "ldrsh   %[_1], [%[interp], #2]        \r\n"
+                    "ldrsh   %[_3], [%[fwd], #2]           \r\n"
+                    "mul     %[out], %[_2], %[_0]          \r\n" /* fwd[0]*interp[0] */
+                    "ldrsh   %[_2], [%[rev], #2]           \r\n"
+                    "mul     %[_0], %[_3], %[_1]           \r\n" /* fwd[1]*interp[1] */
+                    "ldrsh   %[_1], [%[interp], #4]        \r\n"
+                    "mov     %[out], %[out], asr #12       \r\n"
+                    "ldrsh   %[_3], [%[rev]]               \r\n"
+                    "mul     %[_2], %[_1], %[_2]           \r\n" /* rev[1]*interp[2] */
+                    "ldrsh   %[_1], [%[interp], #6]        \r\n"
+                    "add     %[_0], %[out], %[_0], asr #12 \r\n"
+                    "mul     %[_3], %[_1], %[_3]           \r\n" /* rev[0]*interp[3] */
+                    "add     %[_2], %[_0], %[_2], asr #12  \r\n"
+                    "mov     %[_2], %[_2], lsl #17         \r\n"
+                    "mov     %[_3], %[_3], asr #12         \r\n"
+                    "mov     %[_3], %[_3], asl #1          \r\n"
+                    "add     %[out], %[_3], %[_2], asr #16 \r\n"
+                    : [out]"=r"(output),
+                      [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
+                      [_2]"=&r"(_2), [_3]"=&r"(_3)
+                    : [fwd]"r"(fwd), [rev]"r"(rev),
+                      [interp]"r"(interp));
+
+                    output = CLAMP16(output);
+                }
+                /* Apply voice envelope */
+                asm volatile (
+                    "mul     %[_0], %[out], %[envx]        \r\n"
+                    : [_0]"=r"(amp_0)
+                    : [out]"r"(output), [envx]"r"((int)voice->envx));
+                /* Reduce and apply left and right volume */
+                asm volatile (
+                    "mov     %[out], %[amp_0], asr #11     \r\n" /* amp_0 = _0 */
+                    "bic     %[out], %[out], #0x1          \r\n"
+                    "mul     %[amp_0], %[out], %[vvol_0]   \r\n"
+                    "mul     %[amp_1], %[out], %[vvol_1]   \r\n"
+                : [out]"+r"(output), [amp_0]"+r"(amp_0),
+                  [amp_1]"=r"(amp_1)
+                : [vvol_0]"r"((int)voice->volume[0]),
+                  [vvol_1]"r"((int)voice->volume[1]));
+
+                prev_outx = output;
+                raw_voice->outx = output >> 8;
+            #endif /* ARM_ARCH >= 6 */    
+            #else /* Unoptimized CPU */
                 int output = *(int16_t*) &this->noise;
+
                 if ( !(this->r.g.noise_enables & vbit) )
                 {
                     output = (fwd [0] * interp [0]) & ~0xFFF;
@@ -788,8 +1033,10 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
                 /* duplicated here to give compiler more to run in parallel */
                 amp_0 = voice->volume [0] * output;
                 amp_1 = voice->volume [1] * output;
+
                 prev_outx = output;
-                raw_voice->outx = (int8_t) (output >> 8);
+                raw_voice->outx = output >> 8;
+            #endif /* CPU_* */
             }
         #else /* SPCNOINTERP */
         /* two-point linear interpolation */
@@ -826,16 +1073,14 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
                 "asr.l      %[sh], %[y1]              \r\n"
                 "add.l      %[y0], %[y1]              \r\n"
                 : [f]"+d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0)
-                : [s]"a"(voice->samples), [sh]"d"(12)
-                    );
+                : [s]"a"(voice->samples), [sh]"d"(12));
             }
 
             /* apply voice envelope to output */
             asm volatile (
-            "mac.w %[output]l, %[envx]l, %%acc0 \r\n"
+            "mac.w %[out]l, %[envx]l, %%acc0 \r\n"
             :
-            : [output]"r"(amp_0), [envx]"r"(voice->envx)
-            );
+            : [out]"r"(amp_0), [envx]"r"(voice->envx));
 
             /* advance voice position */
             voice->position += rate;
@@ -843,15 +1088,14 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
             /* fetch output, scale and apply left and right
                voice volume */
             asm volatile (
-            "movclr.l %%acc0,    %[output]         \r\n"
-            "asr.l    %[sh],     %[output]         \r\n"
-            "mac.l    %[vvol_0], %[output], %%acc0 \r\n"
-            "mac.l    %[vvol_1], %[output], %%acc1 \r\n"
-            : [output]"=&d"(amp_0)
+            "movclr.l %%acc0,    %[out]         \r\n"
+            "asr.l    %[sh],     %[out]         \r\n"
+            "mac.l    %[vvol_0], %[out], %%acc0 \r\n"
+            "mac.l    %[vvol_1], %[out], %%acc1 \r\n"
+            : [out]"=&d"(amp_0)
             : [vvol_0]"r"((int)voice->volume[0]),
               [vvol_1]"r"((int)voice->volume[1]),
-              [sh]"d"(11)
-            );
+              [sh]"d"(11));
 
             /* save this output into previous, scale and save in
                output register */
@@ -862,14 +1106,16 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
             asm volatile (
             "movclr.l %%acc0, %[amp_0] \r\n"
             "movclr.l %%acc1, %[amp_1] \r\n"
-            : [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)
-            );
+            : [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1));
         #elif defined (CPU_ARM)
             int amp_0, amp_1;
             
-            if ( (this->r.g.noise_enables & vbit) != 0 ) {
+            if ( (this->r.g.noise_enables & vbit) != 0 )
+            {
                 amp_0 = *(int16_t *)&this->noise;
-            } else {
+            }
+            else
+            {
                 uint32_t f = voice->position;
                 amp_0 = (uint32_t)voice->samples;
 
@@ -882,8 +1128,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
                 "sub    %[y1], %[y1], %[y0]         \r\n"
                 "mul    %[f], %[y1], %[f]           \r\n"
                 "add    %[y0], %[y0], %[f], asr #12 \r\n"
-                : [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1)
-                );
+                : [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1));
             }
 
             voice->position += rate;
@@ -893,8 +1138,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
             "mov    %[amp_0], %[amp_1], asr #11 \r\n"
             "mov    %[amp_1], %[amp_0], asr #8  \r\n"
             : [amp_0]"+r"(amp_0), [amp_1]"=&r"(amp_1)
-            : [envx]"r"(voice->envx)
-            );
+            : [envx]"r"(voice->envx));
 
             prev_outx = amp_0;
             raw_voice->outx = (int8_t)amp_1;
@@ -904,8 +1148,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
             "mul    %[amp_0], %[vol_0], %[amp_0] \r\n"
             : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1)
             : [vol_0]"r"((int)voice->volume[0]),
-              [vol_1]"r"((int)voice->volume[1])
-            );
+              [vol_1]"r"((int)voice->volume[1]));
         #else /* Unoptimized CPU */
             int output;
             
@@ -1089,25 +1332,116 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
             echo_pos = 0;
         this->echo_pos = echo_pos;
 
+    #if ARM_ARCH >= 6
+        int32_t *fir_ptr, *fir_coeff;
+        int fb_0, fb_1;
+
+        /* Apply FIR */
+        fb_0 = *(uint32_t *)echo_ptr;
+
+        /* Keep last 8 samples */
+        asm volatile (
+        "add    %[fir_p], %[t_fir_p], #4      \r\n"
+        "bic    %[t_fir_p], %[fir_p], %[mask] \r\n"
+        "str    %[fb_0], [%[fir_p], #-4]      \r\n"
+        /* duplicate at +8 eliminates wrap checking below */
+        "str    %[fb_0], [%[fir_p], #28]      \r\n"
+        : [fir_p]"=&r"(fir_ptr), [t_fir_p]"+r"(this->fir_ptr)
+        : [fb_0]"r"(fb_0), [mask]"i"(~FIR_BUF_MASK));
+
+        fir_coeff = (int32_t *)this->fir_coeff;
+
+        /* Fugly, but the best version found. */
+        int _0;
+        asm volatile (                             /* L0R0 = acc0          */
+        "ldmia   %[fir_p]!, { r2-r5 }        \r\n" /* L1R1-L4R4 = r2-r5    */
+        "ldmia   %[fir_c]!, { r0-r1 }        \r\n" /* C0C1-C2C3 = r0-r1    */
+        "pkhbt   %[_0], %[acc0], r2, asl #16 \r\n" /* L0R0,L1R1->L0L1,R0R1 */
+        "pkhtb   r2, r2, %[acc0], asr #16    \r\n"
+        "smuad   %[acc0], %[_0], r0          \r\n" /* acc0=L0*C0+L1*C1     */
+        "smuad   %[acc1], r2, r0             \r\n" /* acc1=R0*C0+R1*C1     */
+        "pkhbt   %[_0], r3, r4, asl #16      \r\n" /* L2R2,L3R3->L2L3,R2R3 */
+        "pkhtb   r4, r4, r3, asr #16         \r\n"
+        "smlad   %[acc0], %[_0], r1, %[acc0] \r\n" /* acc0+=L2*C2+L3*C3    */
+        "smlad   %[acc1], r4, r1, %[acc1]    \r\n" /* acc1+=R2*C2+R3*C3    */
+        "ldmia   %[fir_p], { r2-r4 }         \r\n" /* L5R5-L7R7 = r2-r4    */
+        "ldmia   %[fir_c], { r0-r1 }         \r\n" /* C4C5-C6C7 = r0-r1    */
+        "pkhbt   %[_0], r5, r2, asl #16      \r\n" /* L4R4,L5R5->L4L5,R4R5 */
+        "pkhtb   r2, r2, r5, asr #16         \r\n"
+        "smlad   %[acc0], %[_0], r0, %[acc0] \r\n" /* acc0+=L4*C4+L5*C5    */
+        "smlad   %[acc1], r2, r0, %[acc1]    \r\n" /* acc1+=R4*C4+R5*C5    */
+        "pkhbt   %[_0], r3, r4, asl #16      \r\n" /* L6R6,L7R7->L6L7,R6R7 */
+        "pkhtb   r4, r4, r3, asr #16         \r\n"
+        "smlad   %[acc0], %[_0], r1, %[acc0] \r\n" /* acc0+=L6*C6+L7*C7    */
+        "smlad   %[acc1], r4, r1, %[acc1]    \r\n" /* acc1+=R6*C6+R7*C7    */
+        : [acc0]"+r"(fb_0), [acc1]"=&r"(fb_1), [_0]"=&r"(_0),
+          [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff)
+        :
+        : "r0", "r1", "r2", "r3", "r4", "r5");
+
+        /* Generate output */
+        int amp_0, amp_1;
+
+        asm volatile (
+        "mul     %[amp_0], %[gvol_0], %[chans_0] \r\n"
+        "mul     %[amp_1], %[gvol_1], %[chans_1] \r\n"
+        : [amp_0]"=&r"(amp_0), [amp_1]"=&r"(amp_1)
+        : [gvol_0]"r"(global_vol_0), [gvol_1]"r"(global_vol_1),
+          [chans_0]"r"(chans_0), [chans_1]"r"(chans_1));
+        asm volatile (
+        "mla     %[amp_0], %[fb_0], %[ev_0], %[amp_0] \r\n"
+        "mla     %[amp_1], %[fb_1], %[ev_1], %[amp_1] \r\n"
+        : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1)
+        : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1),
+          [ev_0]"r"((int)this->r.g.echo_volume_0),
+          [ev_1]"r"((int)this->r.g.echo_volume_1));
+
+        out_buf [             0] = amp_0 >> global_muting;
+        out_buf [WAV_CHUNK_SIZE] = amp_1 >> global_muting;
+        out_buf ++;
+
+        if ( !(this->r.g.flags & 0x20) )
+        {
+            /* Feedback into echo buffer */
+            int e0, e1;
+
+            asm volatile (
+            "mov     %[e0], %[echo_0], asl #7      \r\n"
+            "mov     %[e1], %[echo_1], asl #7      \r\n"
+            "mla     %[e0], %[fb_0], %[efb], %[e0] \r\n"
+            "mla     %[e1], %[fb_1], %[efb], %[e1] \r\n"
+            : [e0]"=&r"(e0), [e1]"=&r"(e1)
+            : [echo_0]"r"(echo_0), [echo_1]"r"(echo_1),
+              [fb_0]"r"(fb_0), [fb_1]"r"(fb_1),
+              [efb]"r"((int)this->r.g.echo_feedback));
+            asm volatile (
+            "ssat    %[e0], #16, %[e0], asr #14    \r\n"
+            "ssat    %[e1], #16, %[e1], asr #14    \r\n"
+            "pkhbt   %[e0], %[e0], %[e1], lsl #16  \r\n"
+            "str     %[e0], [%[echo_p]]            \r\n"
+            : [e0]"+r"(e0), [e1]"+r"(e1)
+            : [echo_p]"r"(echo_ptr));
+        }
+    #else /* ARM_ARCH < 6 */
         int fb_0 = GET_LE16SA( echo_ptr     );
         int fb_1 = GET_LE16SA( echo_ptr + 2 );
+        int32_t *fir_ptr, *fir_coeff;
 
         /* Keep last 8 samples */
-        int32_t *fir_ptr = this->fir_ptr;
 
         /* Apply FIR */
         asm volatile (
-        "str    %[fb_0], [%[fir_p]], #4  \r\n"
-        "str    %[fb_1], [%[fir_p]], #4  \r\n"
+        "add    %[fir_p], %[t_fir_p], #8      \r\n"
+        "bic    %[t_fir_p], %[fir_p], %[mask] \r\n"
+        "str    %[fb_0], [%[fir_p], #-8]      \r\n"
+        "str    %[fb_1], [%[fir_p], #-4]      \r\n"
         /* duplicate at +8 eliminates wrap checking below */
-        "str    %[fb_0], [%[fir_p], #56] \r\n"
-        "str    %[fb_1], [%[fir_p], #60] \r\n"
-        : [fir_p]"+r"(fir_ptr)
-        : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1)
-        );
+        "str    %[fb_0], [%[fir_p], #56]      \r\n"
+        "str    %[fb_1], [%[fir_p], #60]      \r\n"
+        : [fir_p]"=&r"(fir_ptr), [t_fir_p]"+r"(this->fir_ptr)
+        : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1), [mask]"i"(~FIR_BUF_MASK));
 
-        this->fir_ptr = (int32_t *)((intptr_t)fir_ptr & FIR_BUF_MASK);
-        int32_t *fir_coeff = this->fir_coeff;
+        fir_coeff = this->fir_coeff;
 
         asm volatile (
         "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
@@ -1137,8 +1471,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
         : [fb_0]"+r"(fb_0), [fb_1]"+r"(fb_1),
           [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff)
         :
-        : "r0", "r1", "r2", "r3", "r4", "r5"
-        );
+        : "r0", "r1", "r2", "r3", "r4", "r5");
 
         /* Generate output */
         int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0)
@@ -1160,6 +1493,7 @@ void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
             e1 = CLAMP16( e1 );
             SET_LE16A( echo_ptr + 2, e1 );
         }
+    #endif /* ARM_ARCH */
     #else /* Unoptimized CPU */
         /* Read feedback from echo buffer */
         int echo_pos = this->echo_pos;
-- 
2.11.4.GIT