From 67ff5d0a2e46ebc267b86442641c86935deef65a Mon Sep 17 00:00:00 2001
From: Buschel <Buschel@a1c6a512-1295-4272-9138-f99709370657>
Date: Sun, 30 Aug 2009 14:14:22 +0000
Subject: [PATCH] Further performance optimization of the atrac3 decoder.
 Rework the internal sample representation and usage of dsp routines. For now
 a quick and dirty solution is used to add a fractional part of 2 bits. This
 allowed several buffers and functions as well as copy loops to be removed.
 Furthermore add some ASM for Coldfire and place some additional data in IRAM
 on PP5022/24 and X5/M5. Speedup on ARM: +3%, speedup on Coldfire: +639%. Both
 ARM and Coldfire can decode in realtime now.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@22561 a1c6a512-1295-4272-9138-f99709370657
---
 apps/codecs/atrac3_rm.c          |  9 +++----
 apps/codecs/libatrac/atrac3.c    | 46 ++++++++++++-------------------
 apps/codecs/libatrac/atrac3.h    | 11 ++++++--
 apps/codecs/libatrac/fixp_math.h | 58 +++++++++++++++++++++++-----------------
 4 files changed, 64 insertions(+), 60 deletions(-)

diff --git a/apps/codecs/atrac3_rm.c b/apps/codecs/atrac3_rm.c
index a8610f5764..f3bfa2f801 100644
--- a/apps/codecs/atrac3_rm.c
+++ b/apps/codecs/atrac3_rm.c
@@ -41,7 +41,6 @@ enum codec_status codec_main(void)
     static size_t buff_size;
     int datasize, res, consumed, i, time_offset;
     uint8_t *bit_buffer;
-    int16_t outbuf[2048] __attribute__((aligned(32)));
     uint16_t fs,sps,h;
     uint32_t packet_count;
     int scrambling_unit_size, num_units, elapsed = 0;
@@ -62,9 +61,9 @@ next_track:
     init_rm(&rmctx);
  
     ci->configure(DSP_SET_FREQUENCY, ci->id3->frequency);
-    ci->configure(DSP_SET_SAMPLE_DEPTH, 16);
+    ci->configure(DSP_SET_SAMPLE_DEPTH, 17); /* Remark: atrac3 uses s15.0 by default, s15.2 was hacked. */
     ci->configure(DSP_SET_STEREO_MODE, rmctx.nb_channels == 1 ?
-                  STEREO_MONO : STEREO_INTERLEAVED);
+        STEREO_MONO : STEREO_NONINTERLEAVED);
 
     packet_count = rmctx.nb_packets;
     rmctx.audio_framesize = rmctx.block_align;
@@ -145,7 +144,7 @@ seek_start :
                 ci->seek_complete(); 
             }
             if(pkt.length)    
-                res = atrac3_decode_frame(&rmctx,&q, outbuf, &datasize, pkt.frames[i], rmctx.block_align);
+                res = atrac3_decode_frame(&rmctx, &q, &datasize, pkt.frames[i], rmctx.block_align);
             else /* indicates that there are no remaining frames */
                 goto done;
 
@@ -155,7 +154,7 @@ seek_start :
             }
 
             if(datasize)
-                ci->pcmbuf_insert(outbuf, NULL, q.samples_per_frame / rmctx.nb_channels);
+                ci->pcmbuf_insert(q.outSamples, q.outSamples + 1024, q.samples_per_frame / rmctx.nb_channels);
             elapsed = rmctx.audiotimestamp+(1000*8*sps/rmctx.bit_rate)*i;
             ci->set_elapsed(elapsed);
             rmctx.frame_number++;
diff --git a/apps/codecs/libatrac/atrac3.c b/apps/codecs/libatrac/atrac3.c
index dd00224e48..cd49aec348 100644
--- a/apps/codecs/libatrac/atrac3.c
+++ b/apps/codecs/libatrac/atrac3.c
@@ -55,18 +55,9 @@
 #define FFMIN(a,b) ((a) > (b) ? (b) : (a))
 #define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0)
 
-/**
- * Clips a signed integer value into the -32768,32767 range.
- */
-static inline int16_t av_clip_int16(int a)
-{
-    if ((a+32768) & ~65535) return (a>>31) ^ 32767;
-    else                    return a;
-}
-
 static int32_t          qmf_window[48] IBSS_ATTR;
 static VLC              spectral_coeff_tab[7];
-static channel_unit     channel_units[2];
+static channel_unit     channel_units[2] IBSS_ATTR_LARGE_IRAM;
 
 /**
  * Matrixing within quadrature mirror synthesis filter.
@@ -516,7 +507,6 @@ static void gainCompensateAndOverlap (int32_t *pIn, int32_t *pPrev, int32_t *pOu
     int32_t  gain1, gain2, gain_inc;
     int   cnt, numdata, nsample, startLoc, endLoc;
 
-
     if (pGain2->num_gain_data == 0)
         gain1 = ONE_16;
     else
@@ -735,7 +725,16 @@ static int decodeChannelSoundUnit (GetBitContext *gb, channel_unit *pSnd, int32_
     numBands = (subbandTab[numSubbands] - 1) >> 8;
     if (lastTonal >= 0)
         numBands = FFMAX((lastTonal + 256) >> 8, numBands);
-
+        
+    /* Remark: Hardcoded hack to add an (empty) 2-bit fractional part to the
+     * internal sample representation. Needed for higher accuracy in internal
+     * calculations as well as for DSP configuration. See also: ../atrac3_rm.c,
+     * DSP_SET_SAMPLE_DEPTH. Todo: Check spectral requantisation for using and
+     * outputting samples with a fractional part. */
+    int32_t i;
+    for (i=0; i<1024; ++i) {
+        pSnd->spectrum[i] <<= 2;
+    }
 
     /* Reconstruct time domain samples. */
     for (band=0; band<4; band++) {
@@ -863,11 +862,9 @@ static int decodeFrame(ATRAC3Context *q, const uint8_t* databuf, int off)
  */
 
 int atrac3_decode_frame(RMContext *rmctx, ATRAC3Context *q,
-            void *data, int *data_size,
-            const uint8_t *buf, int buf_size) {
-    int result = 0, off = 0, i;
+            int *data_size, const uint8_t *buf, int buf_size) {
+    int result = 0, off = 0;
     const uint8_t* databuf;
-    int16_t* samples = data;
 
     if (buf_size < rmctx->block_align)
         return buf_size;
@@ -887,19 +884,10 @@ int atrac3_decode_frame(RMContext *rmctx, ATRAC3Context *q,
         return -1;
     }
 
-    if (q->channels == 1) {
-        /* mono */
-        for (i = 0; i<1024; i++)
-            samples[i] = av_clip_int16(q->outSamples[i]);
-        *data_size = 1024 * sizeof(int16_t);
-    } else {
-        /* stereo */
-        for (i = 0; i < 1024; i++) {
-            samples[i*2] = av_clip_int16(q->outSamples[i]);
-            samples[i*2+1] = av_clip_int16(q->outSamples[1024+i]);
-        }
-        *data_size = 2048 * sizeof(int16_t);
-    }
+    if (q->channels == 1)
+        *data_size = 1024 * sizeof(int32_t);
+    else
+        *data_size = 2048 * sizeof(int32_t);
 
     return rmctx->block_align;
 }
diff --git a/apps/codecs/libatrac/atrac3.h b/apps/codecs/libatrac/atrac3.h
index f81fc0a734..a817db2b55 100644
--- a/apps/codecs/libatrac/atrac3.h
+++ b/apps/codecs/libatrac/atrac3.h
@@ -1,6 +1,14 @@
 #include "ffmpeg_bitstream.h"
 #include "../librm/rm.h"
 
+#if (CONFIG_CPU == PP5022) || (CONFIG_CPU == PP5024) || (CONFIG_CPU == MCF5250)
+/* PP5022/24 and MCF5250 have larger IRAM: alias the attribute to IBSS_ATTR. */
+#define IBSS_ATTR_LARGE_IRAM IBSS_ATTR
+#else
+/* On other CPUs the IRAM is not large enough: the attribute expands to nothing. */
+#define IBSS_ATTR_LARGE_IRAM
+#endif
+
 /* These structures are needed to store the parsed gain control data. */
 typedef struct {
     int   num_gain_data;
@@ -75,6 +83,5 @@ typedef struct {
 int atrac3_decode_init(ATRAC3Context *q, RMContext *rmctx);
 
 int atrac3_decode_frame(RMContext *rmctx, ATRAC3Context *q,
-                        void *data, int *data_size, 
-                        const uint8_t *buf, int buf_size);
+                        int *data_size, const uint8_t *buf, int buf_size);
 
diff --git a/apps/codecs/libatrac/fixp_math.h b/apps/codecs/libatrac/fixp_math.h
index 88cb5e4b66..5174cc7cc6 100644
--- a/apps/codecs/libatrac/fixp_math.h
+++ b/apps/codecs/libatrac/fixp_math.h
@@ -36,17 +36,38 @@
            : "r"(X),"r"(Y)); \
         low; \
      })
-     
-    #define fixmul32(X,Y) \
-     ({ \
-        int32_t low; \
-        int32_t high; \
-        asm volatile (                   /* calculates: result = (X*Y)>>32 */ \
-           "smull  %0,%1,%2,%3 \n\t"     /* 64 = 32x32 multiply */ \
-           : "=&r"(low), "=&r" (high) \
-           : "r"(X),"r"(Y)); \
-        high; \
-     })
+#elif defined(CPU_COLDFIRE)
+    #define fixmul16(X,Y) \
+    ({ \
+        int32_t t1, t2, fm16_x = (X); /* private copy: the asm overwrites %[x] */ \
+        asm volatile ( \
+            "mac.l   %[x],%[y],%%acc0\n\t" /* multiply */ \
+            "mulu.l  %[y],%[x]   \n\t"     /* get lower half, avoid emac stall */ \
+            "movclr.l %%acc0,%[t1]   \n\t" /* get higher half */ \
+            "moveq.l #15,%[t2]   \n\t" \
+            "asl.l   %[t2],%[t1] \n\t"     /* hi <<= 15, plus one free */ \
+            "moveq.l #16,%[t2]   \n\t" \
+            "lsr.l   %[t2],%[x]  \n\t"     /* (unsigned)lo >>= 16 */ \
+            "or.l    %[x],%[t1]  \n\t"     /* combine result */ \
+            : /* outputs */ \
+            [t1]"=&d"(t1), \
+            [t2]"=&d"(t2), \
+            [x] "+d" (fm16_x) /* read-write: clobbered by mulu.l and lsr.l */ \
+            : /* inputs */ \
+            [y] "d" ((Y))); \
+        t1; \
+    })
+
+    #define fixmul31(X,Y) \
+    ({ \
+       int32_t t; \
+       asm volatile ( \
+          "mac.l %[x], %[y], %%acc0\n\t"   /* multiply into EMAC accumulator acc0 */ \
+          "movclr.l %%acc0, %[t]\n\t"      /* get higher half as result, clearing acc0 */ \
+          : [t] "=d" (t) \
+          : [x] "r" ((X)), [y] "r" ((Y))); \
+       t; \
+    })
 #else
     static inline int32_t fixmul16(int32_t x, int32_t y)
     {
@@ -69,17 +90,6 @@
     
         return (int32_t)temp;
     }
-    
-    static inline int32_t fixmul32(int32_t x, int32_t y)
-    {
-        int64_t temp;
-        temp = x;
-        temp *= y;
-    
-        temp >>= 32;        //16+31-16 = 31 bits
-    
-        return (int32_t)temp;
-    }
 #endif
 
 static inline int32_t fixdiv16(int32_t x, int32_t y)
@@ -104,13 +114,13 @@ static inline int32_t fastSqrt(int32_t n)
    /*
     * Logically, these are unsigned. 
     * We need the sign bit to test
-    *	whether (op - res - one) underflowed.
+    * whether (op - res - one) underflowed.
     */
     int32_t op, res, one;
     op = n;
     res = 0;
     /* "one" starts at the highest power of four <= than the argument. */
-    one = 1 << 30;	/* second-to-top bit set */
+    one = 1 << 30; /* second-to-top bit set */
     while (one > op) one >>= 2;
     while (one != 0) 
     {
-- 
2.11.4.GIT