apps/codecs/libspc/spc_dsp.c

   1 /***************************************************************************
   2  *             __________               __   ___.
   3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7  *                     \/            \/     \/    \/            \/
   8  * $Id$
   9  *
  10  * Copyright (C) 2007-2008 Michael Sevakis (jhMikeS)
  11  * Copyright (C) 2006-2007 Adam Gashlin (hcs)
  12  * Copyright (C) 2004-2007 Shay Green (blargg)
  13  * Copyright (C) 2002 Brad Martin
  14  *
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  21  * KIND, either express or implied.
  22  *
  23  ****************************************************************************/
  24
  25 /* The DSP portion (awe!) */
  26 #include "codeclib.h"
  27 #include "spc_codec.h"
  28 #include "spc_profiler.h"
  29
  30 #if defined(CPU_COLDFIRE) || defined (CPU_ARM)
  31 int32_t fir_buf[FIR_BUF_CNT]
  32     __attribute__ ((aligned (FIR_BUF_ALIGN*1))) IBSS_ATTR;
  33 #endif
  34 #if SPC_BRRCACHE
  35 /* a little extra for samples that go past end */
  36 int16_t BRRcache [BRR_CACHE_SIZE] CACHEALIGN_ATTR;
  37 #endif
  38
  39 void DSP_write( struct Spc_Dsp* this, int i, int data )
  40 {
  41     assert( (unsigned) i < REGISTER_COUNT );
  42
  43     this->r.reg [i] = data;
  44     int high = i >> 4;
  45     int low  = i & 0x0F;
  46     if ( low < 2 ) /* voice volumes */
  47     {
  48         int left  = *(int8_t const*) &this->r.reg [i & ~1];
  49         int right = *(int8_t const*) &this->r.reg [i |  1];
  50         struct voice_t* v = this->voice_state + high;
  51         v->volume [0] = left;
  52         v->volume [1] = right;
  53     }
  54     else if ( low == 0x0F ) /* fir coefficients */
  55     {
  56         this->fir_coeff [7 - high] = (int8_t) data; /* sign-extend */
  57     }
  58 }
  59
  60 /* if ( n < -32768 ) out = -32768; */
  61 /* if ( n >  32767 ) out =  32767; */
  62 #define CLAMP16( n ) \
  63 ({                              \
  64     if ( (int16_t) n != n )     \
  65         n = 0x7FFF ^ (n >> 31); \
  66     n;                          \
  67 })
  68
  69 #if SPC_BRRCACHE
  70 static void decode_brr( struct Spc_Dsp* this, unsigned start_addr,
  71                         struct voice_t* voice,
  72                         struct raw_voice_t const* const raw_voice ) ICODE_ATTR;
  73 static void decode_brr( struct Spc_Dsp* this, unsigned start_addr,
  74                         struct voice_t* voice,
  75                         struct raw_voice_t const* const raw_voice )
  76 {
  77     /* setup same variables as where decode_brr() is called from */
  78     #undef RAM
  79     #define RAM ram.ram
  80     struct src_dir const* const sd =
  81         (struct src_dir*) &RAM [this->r.g.wave_page * 0x100];
  82     struct cache_entry_t* const wave_entry =
  83         &this->wave_entry [raw_voice->waveform];
  84
  85     /* the following block can be put in place of the call to
  86        decode_brr() below
  87     */
  88     {
  89         DEBUGF( "decode at %08x (wave #%d)\n",
  90                 start_addr, raw_voice->waveform );
  91
  92         /* see if in cache */
  93         int i;
  94         for ( i = 0; i < this->oldsize; i++ )
  95         {
  96             struct cache_entry_t* e = &this->wave_entry_old [i];
  97             if ( e->start_addr == start_addr )
  98             {
  99                 DEBUGF( "found in wave_entry_old (oldsize=%d)\n",
 100                     this->oldsize );
 101                 *wave_entry = *e;
 102                 goto wave_in_cache;
 103             }
 104         }
 105
 106         wave_entry->start_addr = start_addr;
 107
 108         uint8_t const* const loop_ptr =
 109             RAM + GET_LE16A( sd [raw_voice->waveform].loop );
 110         short* loop_start = 0;
 111
 112         short* out = BRRcache + start_addr * 2;
 113         wave_entry->samples = out;
 114         *out++ = 0;
 115         int smp1 = 0;
 116         int smp2 = 0;
 117
 118         uint8_t const* addr = RAM + start_addr;
 119         int block_header;
 120         do
 121         {
 122             if ( addr == loop_ptr )
 123             {
 124                 loop_start = out;
 125                 DEBUGF( "loop at %08lx (wave #%d)\n",
 126                         (unsigned long)(addr - RAM), raw_voice->waveform );
 127             }
 128
 129             /* header */
 130             block_header = *addr;
 131             addr += 9;
 132             voice->addr = addr;
 133             int const filter = (block_header & 0x0C) - 0x08;
 134
 135             /* scaling
 136                (invalid scaling gives -4096 for neg nybble, 0 for pos) */
 137             static unsigned char const right_shifts [16] = {
 138                 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  4,  4, 29, 29, 29,
 139             };
 140             static unsigned char const left_shifts  [16] = {
 141                 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11
 142             };
 143             int const scale = block_header >> 4;
 144             int const right_shift = right_shifts [scale];
 145             int const left_shift  = left_shifts  [scale];
 146
 147             /* output position */
 148             out += BRR_BLOCK_SIZE;
 149             int offset = -BRR_BLOCK_SIZE << 2;
 150
 151             do /* decode and filter 16 samples */
 152             {
 153                 /* Get nybble, sign-extend, then scale
 154                    get byte, select which nybble, sign-extend, then shift based
 155                    on scaling. also handles invalid scaling values. */
 156                 int delta = (int) (int8_t) (addr [offset >> 3] << (offset & 4))
 157                         >> right_shift << left_shift;
 158
 159                 out [offset >> 2] = smp2;
 160
 161                 if ( filter == 0 ) /* mode 0x08 (30-90% of the time) */
 162                 {
 163                     delta -= smp2 >> 1;
 164                     delta += smp2 >> 5;
 165                     smp2 = smp1;
 166                     delta += smp1;
 167                     delta += (-smp1 - (smp1 >> 1)) >> 5;
 168                 }
 169                 else
 170                 {
 171                     if ( filter == -4 ) /* mode 0x04 */
 172                     {
 173                         delta += smp1 >> 1;
 174                         delta += (-smp1) >> 5;
 175                     }
 176                     else if ( filter > -4 ) /* mode 0x0C */
 177                     {
 178                         delta -= smp2 >> 1;
 179                         delta += (smp2 + (smp2 >> 1)) >> 4;
 180                         delta += smp1;
 181                         delta += (-smp1 * 13) >> 7;
 182                     }
 183                     smp2 = smp1;
 184                 }
 185
 186                 delta = CLAMP16( delta );
 187                 smp1 = (int16_t) (delta * 2); /* sign-extend */
 188             }
 189             while ( (offset += 4) != 0 );
 190
 191             /* if next block has end flag set, this block ends early */
 192             /* (verified) */
 193             if ( (block_header & 3) != 3 && (*addr & 3) == 1 )
 194             {
 195                 /* skip last 9 samples */
 196                 out -= 9;
 197                 goto early_end;
 198             }
 199         }
 200         while ( !(block_header & 1) && addr < RAM + 0x10000 );
 201
 202         out [0] = smp2;
 203         out [1] = smp1;
 204
 205     early_end:
 206         wave_entry->end = (out - 1 - wave_entry->samples) << 12;
 207
 208         wave_entry->loop = 0;
 209         if ( (block_header & 2) )
 210         {
 211             if ( loop_start )
 212             {
 213                 int loop = out - loop_start;
 214                 wave_entry->loop = loop;
 215                 wave_entry->end += 0x3000;
 216                 out [2] = loop_start [2];
 217                 out [3] = loop_start [3];
 218                 out [4] = loop_start [4];
 219             }
 220             else
 221             {
 222                 DEBUGF( "loop point outside initial wave\n" );
 223             }
 224         }
 225
 226         DEBUGF( "end at %08lx (wave #%d)\n",
 227                 (unsigned long)(addr - RAM), raw_voice->waveform );
 228
 229         /* add to cache */
 230         this->wave_entry_old [this->oldsize++] = *wave_entry;
 231 wave_in_cache:;
 232     }
 233 }
 234 #endif
 235
 236 static void key_on(struct Spc_Dsp* const this, struct voice_t* const voice,
 237                    struct src_dir const* const sd,
 238                    struct raw_voice_t const* const raw_voice,
 239                    const int key_on_delay, const int vbit) ICODE_ATTR;
 240 static void key_on(struct Spc_Dsp* const this, struct voice_t* const voice,
 241                    struct src_dir const* const sd,
 242                    struct raw_voice_t const* const raw_voice,
 243                    const int key_on_delay, const int vbit) {
 244     #undef RAM
 245     #define RAM ram.ram
 246     int const env_rate_init = 0x7800;
 247     voice->key_on_delay = key_on_delay;
 248     if ( key_on_delay == 0 )
 249     {
 250         this->keys_down |= vbit;
 251         voice->envx         = 0;
 252         voice->env_mode     = state_attack;
 253         voice->env_timer    = env_rate_init; /* TODO: inaccurate? */
 254         unsigned start_addr = GET_LE16A(sd [raw_voice->waveform].start);
 255         #if !SPC_BRRCACHE
 256         {
 257             voice->addr = RAM + start_addr;
 258             /* BRR filter uses previous samples */
 259             voice->samples [BRR_BLOCK_SIZE + 1] = 0;
 260             voice->samples [BRR_BLOCK_SIZE + 2] = 0;
 261             /* decode three samples immediately */
 262             voice->position     = (BRR_BLOCK_SIZE + 3) * 0x1000 - 1;
 263             voice->block_header = 0; /* "previous" BRR header */
 264         }
 265         #else
 266         {
 267             voice->position = 3 * 0x1000 - 1;
 268             struct cache_entry_t* const wave_entry =
 269                 &this->wave_entry [raw_voice->waveform];
 270
 271             /* predecode BRR if not already */
 272             if ( wave_entry->start_addr != start_addr )
 273             {
 274                 /* the following line can be replaced by the indicated block
 275                    in decode_brr() */
 276                 decode_brr( this, start_addr, voice, raw_voice );
 277             }
 278
 279             voice->samples   = wave_entry->samples;
 280             voice->wave_end  = wave_entry->end;
 281                     voice->wave_loop = wave_entry->loop;
 282         }
 283         #endif
 284     }
 285 }
 286
 287 void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
 288 {
 289     #undef RAM
 290 #ifdef CPU_ARM
 291     uint8_t* const ram_ = ram.ram;
 292     #define RAM ram_
 293 #else
 294     #define RAM ram.ram
 295 #endif
 296 #if 0
 297     EXIT_TIMER(cpu);
 298     ENTER_TIMER(dsp);
 299 #endif
 300
 301     /* Here we check for keys on/off.  Docs say that successive writes
 302        to KON/KOF must be separated by at least 2 Ts periods or risk
 303        being neglected.  Therefore DSP only looks at these during an
 304        update, and not at the time of the write.  Only need to do this
 305        once however, since the regs haven't changed over the whole
 306        period we need to catch up with. */
 307
 308     {
 309         int key_ons  = this->r.g.key_ons;
 310         int key_offs = this->r.g.key_offs;
 311         /* keying on a voice resets that bit in ENDX */
 312         this->r.g.wave_ended &= ~key_ons;
 313         /* key_off bits prevent key_on from being acknowledged */
 314         this->r.g.key_ons = key_ons & key_offs;
 315
 316         /* process key events outside loop, since they won't re-occur */
 317         struct voice_t* voice = this->voice_state + 8;
 318         int vbit = 0x80;
 319         do
 320         {
 321             --voice;
 322             if ( key_offs & vbit )
 323             {
 324                 voice->env_mode     = state_release;
 325                 voice->key_on_delay = 0;
 326             }
 327             else if ( key_ons & vbit )
 328             {
 329                 voice->key_on_delay = 8;
 330             }
 331         }
 332         while ( (vbit >>= 1) != 0 );
 333     }
 334
 335     struct src_dir const* const sd =
 336         (struct src_dir*) &RAM [this->r.g.wave_page * 0x100];
 337
 338     #ifdef ROCKBOX_BIG_ENDIAN
        /* Convert endianness before entering the loops - these
           get used a lot */
 341         const uint32_t rates[VOICE_COUNT] =
 342         {
 343             GET_LE16A( this->r.voice[0].rate ) & 0x3FFF,
 344             GET_LE16A( this->r.voice[1].rate ) & 0x3FFF,
 345             GET_LE16A( this->r.voice[2].rate ) & 0x3FFF,
 346             GET_LE16A( this->r.voice[3].rate ) & 0x3FFF,
 347             GET_LE16A( this->r.voice[4].rate ) & 0x3FFF,
 348             GET_LE16A( this->r.voice[5].rate ) & 0x3FFF,
 349             GET_LE16A( this->r.voice[6].rate ) & 0x3FFF,
 350             GET_LE16A( this->r.voice[7].rate ) & 0x3FFF,
 351         };
 352         #define VOICE_RATE(x) *(x)
 353         #define IF_RBE(...) __VA_ARGS__
 354     #ifdef CPU_COLDFIRE
 355         /* Initialize mask register with the buffer address mask */
 356         asm volatile ("move.l %[m], %%mask" : : [m]"i"(FIR_BUF_MASK));
 357         const int echo_wrap  = (this->r.g.echo_delay & 15) * 0x800;
 358         const int echo_start = this->r.g.echo_page * 0x100;
 359     #endif /* CPU_COLDFIRE */
 360     #else
 361         #define VOICE_RATE(x) (INT16A(raw_voice->rate) & 0x3FFF)
 362         #define IF_RBE(...)
 363     #endif /* ROCKBOX_BIG_ENDIAN */
 364
 365 #if !SPC_NOINTERP
 366     int const slow_gaussian = (this->r.g.pitch_mods >> 1) |
 367         this->r.g.noise_enables;
 368 #endif
 369     /* (g.flags & 0x40) ? 30 : 14 */
 370     int const global_muting = ((this->r.g.flags & 0x40) >> 2) + 14 - 8;
 371     int const global_vol_0  = this->r.g.volume_0;
 372     int const global_vol_1  = this->r.g.volume_1;
 373
 374     /* each rate divides exactly into 0x7800 without remainder */
 375     int const env_rate_init = 0x7800;
 376     static unsigned short const env_rates [0x20] ICONST_ATTR =
 377     {
 378         0x0000, 0x000F, 0x0014, 0x0018, 0x001E, 0x0028, 0x0030, 0x003C,
 379         0x0050, 0x0060, 0x0078, 0x00A0, 0x00C0, 0x00F0, 0x0140, 0x0180,
 380         0x01E0, 0x0280, 0x0300, 0x03C0, 0x0500, 0x0600, 0x0780, 0x0A00,
 381         0x0C00, 0x0F00, 0x1400, 0x1800, 0x1E00, 0x2800, 0x3C00, 0x7800
 382     };
 383
 384     do /* one pair of output samples per iteration */
 385     {
 386         /* Noise */
 387         if ( this->r.g.noise_enables )
 388         {
 389             if ( (this->noise_count -=
 390                  env_rates [this->r.g.flags & 0x1F]) <= 0 )
 391             {
 392                 this->noise_count = env_rate_init;
 393                 int feedback = (this->noise << 13) ^ (this->noise << 14);
 394                 this->noise = (feedback & 0x8000) ^ (this->noise >> 1 & ~1);
 395             }
 396         }
 397
 398 #if !SPC_NOECHO
 399         int echo_0 = 0;
 400         int echo_1 = 0;
 401 #endif
 402         long prev_outx = 0; /* TODO: correct value for first channel? */
 403         int chans_0 = 0;
 404         int chans_1 = 0;
 405         /* TODO: put raw_voice pointer in voice_t? */
 406         struct raw_voice_t * raw_voice = this->r.voice;
 407         struct voice_t* voice = this->voice_state;
 408         int vbit = 1;
 409         IF_RBE( const uint32_t* vr = rates; )
 410         for ( ; vbit < 0x100; vbit <<= 1, ++voice, ++raw_voice IF_RBE( , ++vr ) )
 411         {
 412             /* pregen involves checking keyon, etc */
 413 #if 0
 414             ENTER_TIMER(dsp_pregen);
 415 #endif
 416
 417             /* Key on events are delayed */
 418             int key_on_delay = voice->key_on_delay;
 419
 420             if ( --key_on_delay >= 0 ) /* <1% of the time */
 421             {
 422                 key_on(this,voice,sd,raw_voice,key_on_delay,vbit);
 423             }
 424
 425             if ( !(this->keys_down & vbit) ) /* Silent channel */
 426             {
 427         silent_chan:
 428                 raw_voice->envx = 0;
 429                 raw_voice->outx = 0;
 430                 prev_outx = 0;
 431                 continue;
 432             }
 433
 434             /* Envelope */
 435             {
 436                 int const ENV_RANGE = 0x800;
 437                 int env_mode = voice->env_mode;
 438                 int adsr0 = raw_voice->adsr [0];
 439                 int env_timer;
 440                 if ( env_mode != state_release ) /* 99% of the time */
 441                 {
 442                     env_timer = voice->env_timer;
 443                     if ( adsr0 & 0x80 ) /* 79% of the time */
 444                     {
 445                         int adsr1 = raw_voice->adsr [1];
 446                         if ( env_mode == state_sustain ) /* 74% of the time */
 447                         {
 448                             if ( (env_timer -= env_rates [adsr1 & 0x1F]) > 0 )
 449                                 goto write_env_timer;
 450
 451                             int envx = voice->envx;
 452                             envx--; /* envx *= 255 / 256 */
 453                             envx -= envx >> 8;
 454                             voice->envx = envx;
 455                             /* TODO: should this be 8? */
 456                             raw_voice->envx = envx >> 4;
 457                             goto init_env_timer;
 458                         }
 459                         else if ( env_mode < 0 ) /* 25% state_decay */
 460                         {
 461                             int envx = voice->envx;
 462                             if ( (env_timer -=
 463                                 env_rates [(adsr0 >> 3 & 0x0E) + 0x10]) <= 0 )
 464                             {
 465                                 envx--; /* envx *= 255 / 256 */
 466                                 envx -= envx >> 8;
 467                                 voice->envx = envx;
 468                                 /* TODO: should this be 8? */
 469                                 raw_voice->envx = envx >> 4;
 470                                 env_timer = env_rate_init;
 471                             }
 472
 473                             int sustain_level = adsr1 >> 5;
 474                             if ( envx <= (sustain_level + 1) * 0x100 )
 475                                 voice->env_mode = state_sustain;
 476
 477                             goto write_env_timer;
 478                         }
 479                         else /* state_attack */
 480                         {
 481                             int t = adsr0 & 0x0F;
 482                             if ( (env_timer -= env_rates [t * 2 + 1]) > 0 )
 483                                 goto write_env_timer;
 484
 485                             int envx = voice->envx;
 486
 487                             int const step = ENV_RANGE / 64;
 488                             envx += step;
 489                             if ( t == 15 )
 490                                 envx += ENV_RANGE / 2 - step;
 491
 492                             if ( envx >= ENV_RANGE )
 493                             {
 494                                 envx = ENV_RANGE - 1;
 495                                 voice->env_mode = state_decay;
 496                             }
 497                             voice->envx = envx;
 498                             /* TODO: should this be 8? */
 499                             raw_voice->envx = envx >> 4;
 500                             goto init_env_timer;
 501                         }
 502                     }
 503                     else /* gain mode */
 504                     {
 505                         int t = raw_voice->gain;
 506                         if ( t < 0x80 )
 507                         {
 508                             raw_voice->envx = t;
 509                             voice->envx = t << 4;
 510                             goto env_end;
 511                         }
 512                         else
 513                         {
 514                             if ( (env_timer -= env_rates [t & 0x1F]) > 0 )
 515                                 goto write_env_timer;
 516
 517                             int envx = voice->envx;
 518                             int mode = t >> 5;
 519                             if ( mode <= 5 ) /* decay */
 520                             {
 521                                 int step = ENV_RANGE / 64;
 522                                 if ( mode == 5 ) /* exponential */
 523                                 {
 524                                     envx--; /* envx *= 255 / 256 */
 525                                     step = envx >> 8;
 526                                 }
 527                                 if ( (envx -= step) < 0 )
 528                                 {
 529                                     envx = 0;
 530                                     if ( voice->env_mode == state_attack )
 531                                         voice->env_mode = state_decay;
 532                                 }
 533                             }
 534                             else /* attack */
 535                             {
 536                                 int const step = ENV_RANGE / 64;
 537                                 envx += step;
 538                                 if ( mode == 7 &&
 539                                      envx >= ENV_RANGE * 3 / 4 + step )
 540                                     envx += ENV_RANGE / 256 - step;
 541
 542                                 if ( envx >= ENV_RANGE )
 543                                     envx = ENV_RANGE - 1;
 544                             }
 545                             voice->envx = envx;
 546                             /* TODO: should this be 8? */
 547                             raw_voice->envx = envx >> 4;
 548                             goto init_env_timer;
 549                         }
 550                     }
 551                 }
 552                 else /* state_release */
 553                 {
 554                     int envx = voice->envx;
 555                     if ( (envx -= ENV_RANGE / 256) > 0 )
 556                     {
 557                         voice->envx = envx;
 558                         raw_voice->envx = envx >> 8;
 559                         goto env_end;
 560                     }
 561                     else
 562                     {
 563                         /* bit was set, so this clears it */
 564                         this->keys_down ^= vbit;
 565                         voice->envx = 0;
 566                         goto silent_chan;
 567                     }
 568                 }
 569             init_env_timer:
 570                 env_timer = env_rate_init;
 571             write_env_timer:
 572                 voice->env_timer = env_timer;
 573             env_end:;
 574             }
 575 #if 0
 576             EXIT_TIMER(dsp_pregen);
 577
 578             ENTER_TIMER(dsp_gen);
 579 #endif
 580             #if !SPC_BRRCACHE
 581             /* Decode BRR block */
 582             if ( voice->position >= BRR_BLOCK_SIZE * 0x1000 )
 583             {
 584                 voice->position -= BRR_BLOCK_SIZE * 0x1000;
 585
 586                 uint8_t const* addr = voice->addr;
 587                 if ( addr >= RAM + 0x10000 )
 588                     addr -= 0x10000;
 589
 590                 /* action based on previous block's header */
 591                 if ( voice->block_header & 1 )
 592                 {
 593                     addr = RAM + GET_LE16A( sd [raw_voice->waveform].loop );
 594                     this->r.g.wave_ended |= vbit;
 595                     if ( !(voice->block_header & 2) ) /* 1% of the time */
 596                     {
 597                         /* first block was end block;
 598                            don't play anything (verified) */
 599                         /* bit was set, so this clears it */
 600                         this->keys_down ^= vbit;
 601
 602                         /* since voice->envx is 0,
 603                            samples and position don't matter */
 604                         raw_voice->envx = 0;
 605                         voice->envx = 0;
 606                         goto skip_decode;
 607                     }
 608                 }
 609
 610                 /* header */
 611                 int const block_header = *addr;
 612                 addr += 9;
 613                 voice->addr = addr;
 614                 voice->block_header = block_header;
 615                 int const filter = (block_header & 0x0C) - 0x08;
 616
 617                 /* scaling (invalid scaling gives -4096 for neg nybble,
 618                    0 for pos) */
 619                 static unsigned char const right_shifts [16] = {
 620                     5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  4,  4, 29, 29, 29,
 621                 };
 622                 static unsigned char const left_shifts  [16] = {
 623                     0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11
 624                 };
 625                 int const scale = block_header >> 4;
 626                 int const right_shift = right_shifts [scale];
 627                 int const left_shift  = left_shifts  [scale];
 628
 629                 /* previous samples */
 630                 int smp2 = voice->samples [BRR_BLOCK_SIZE + 1];
 631                 int smp1 = voice->samples [BRR_BLOCK_SIZE + 2];
 632                 voice->samples [0] = voice->samples [BRR_BLOCK_SIZE];
 633
 634                 /* output position */
 635                 short* out = voice->samples + (1 + BRR_BLOCK_SIZE);
 636                 int offset = -BRR_BLOCK_SIZE << 2;
 637
 638                 /* if next block has end flag set,
 639                    this block ends early (verified) */
 640                 if ( (block_header & 3) != 3 && (*addr & 3) == 1 )
 641                 {
 642                     /* arrange for last 9 samples to be skipped */
 643                     int const skip = 9;
 644                     out += (skip & 1);
 645                     voice->samples [skip] = voice->samples [BRR_BLOCK_SIZE];
 646                     voice->position += skip * 0x1000;
 647                     offset = (-BRR_BLOCK_SIZE + (skip & ~1)) << 2;
 648                     addr -= skip / 2;
 649                     /* force sample to end on next decode */
 650                     voice->block_header = 1;
 651                 }
 652
 653                 do /* decode and filter 16 samples */
 654                 {
 655                     /* Get nybble, sign-extend, then scale
 656                        get byte, select which nybble, sign-extend, then shift
 657                        based on scaling. also handles invalid scaling values.*/
 658                     int delta = (int) (int8_t) (addr [offset >> 3] <<
 659                             (offset & 4)) >> right_shift << left_shift;
 660
 661                     out [offset >> 2] = smp2;
 662
 663                     if ( filter == 0 ) /* mode 0x08 (30-90% of the time) */
 664                     {
 665                         delta -= smp2 >> 1;
 666                         delta += smp2 >> 5;
 667                         smp2 = smp1;
 668                         delta += smp1;
 669                         delta += (-smp1 - (smp1 >> 1)) >> 5;
 670                     }
 671                     else
 672                     {
 673                         if ( filter == -4 ) /* mode 0x04 */
 674                         {
 675                             delta += smp1 >> 1;
 676                             delta += (-smp1) >> 5;
 677                         }
 678                         else if ( filter > -4 ) /* mode 0x0C */
 679                         {
 680                             delta -= smp2 >> 1;
 681                             delta += (smp2 + (smp2 >> 1)) >> 4;
 682                             delta += smp1;
 683                             delta += (-smp1 * 13) >> 7;
 684                         }
 685                         smp2 = smp1;
 686                     }
 687
 688                     delta = CLAMP16( delta );
 689                     smp1 = (int16_t) (delta * 2); /* sign-extend */
 690                 }
 691                 while ( (offset += 4) != 0 );
 692
 693                 out [0] = smp2;
 694                 out [1] = smp1;
 695
 696             skip_decode:;
 697             }
 698             #endif
 699
 700             /* Get rate (with possible modulation) */
 701             int rate = VOICE_RATE(vr);
 702             if ( this->r.g.pitch_mods & vbit )
 703                 rate = (rate * (prev_outx + 32768)) >> 15;
 704
 705         #if !SPC_NOINTERP
            /* Interleaved gauss table (to improve cache coherency). */
 707             /* gauss [i * 2 + j] = normal_gauss [(1 - j) * 256 + i] */
 708             static short const gauss [512] =
 709             {
 710 370,1305, 366,1305, 362,1304, 358,1304, 354,1304, 351,1304, 347,1304, 343,1303,
 711 339,1303, 336,1303, 332,1302, 328,1302, 325,1301, 321,1300, 318,1300, 314,1299,
 712 311,1298, 307,1297, 304,1297, 300,1296, 297,1295, 293,1294, 290,1293, 286,1292,
 713 283,1291, 280,1290, 276,1288, 273,1287, 270,1286, 267,1284, 263,1283, 260,1282,
 714 257,1280, 254,1279, 251,1277, 248,1275, 245,1274, 242,1272, 239,1270, 236,1269,
 715 233,1267, 230,1265, 227,1263, 224,1261, 221,1259, 218,1257, 215,1255, 212,1253,
 716 210,1251, 207,1248, 204,1246, 201,1244, 199,1241, 196,1239, 193,1237, 191,1234,
 717 188,1232, 186,1229, 183,1227, 180,1224, 178,1221, 175,1219, 173,1216, 171,1213,
 718 168,1210, 166,1207, 163,1205, 161,1202, 159,1199, 156,1196, 154,1193, 152,1190,
 719 150,1186, 147,1183, 145,1180, 143,1177, 141,1174, 139,1170, 137,1167, 134,1164,
 720 132,1160, 130,1157, 128,1153, 126,1150, 124,1146, 122,1143, 120,1139, 118,1136,
 721 117,1132, 115,1128, 113,1125, 111,1121, 109,1117, 107,1113, 106,1109, 104,1106,
 722 102,1102, 100,1098,  99,1094,  97,1090,  95,1086,  94,1082,  92,1078,  90,1074,
 723  89,1070,  87,1066,  86,1061,  84,1057,  83,1053,  81,1049,  80,1045,  78,1040,
 724  77,1036,  76,1032,  74,1027,  73,1023,  71,1019,  70,1014,  69,1010,  67,1005,
 725  66,1001,  65, 997,  64, 992,  62, 988,  61, 983,  60, 978,  59, 974,  58, 969,
 726  56, 965,  55, 960,  54, 955,  53, 951,  52, 946,  51, 941,  50, 937,  49, 932,
 727  48, 927,  47, 923,  46, 918,  45, 913,  44, 908,  43, 904,  42, 899,  41, 894,
 728  40, 889,  39, 884,  38, 880,  37, 875,  36, 870,  36, 865,  35, 860,  34, 855,
 729  33, 851,  32, 846,  32, 841,  31, 836,  30, 831,  29, 826,  29, 821,  28, 816,
 730  27, 811,  27, 806,  26, 802,  25, 797,  24, 792,  24, 787,  23, 782,  23, 777,
 731  22, 772,  21, 767,  21, 762,  20, 757,  20, 752,  19, 747,  19, 742,  18, 737,
 732  17, 732,  17, 728,  16, 723,  16, 718,  15, 713,  15, 708,  15, 703,  14, 698,
 733  14, 693,  13, 688,  13, 683,  12, 678,  12, 674,  11, 669,  11, 664,  11, 659,
 734  10, 654,  10, 649,  10, 644,   9, 640,   9, 635,   9, 630,   8, 625,   8, 620,
 735   8, 615,   7, 611,   7, 606,   7, 601,   6, 596,   6, 592,   6, 587,   6, 582,
 736   5, 577,   5, 573,   5, 568,   5, 563,   4, 559,   4, 554,   4, 550,   4, 545,
 737   4, 540,   3, 536,   3, 531,   3, 527,   3, 522,   3, 517,   2, 513,   2, 508,
 738   2, 504,   2, 499,   2, 495,   2, 491,   2, 486,   1, 482,   1, 477,   1, 473,
 739   1, 469,   1, 464,   1, 460,   1, 456,   1, 451,   1, 447,   1, 443,   1, 439,
 740   0, 434,   0, 430,   0, 426,   0, 422,   0, 418,   0, 414,   0, 410,   0, 405,
 741   0, 401,   0, 397,   0, 393,   0, 389,   0, 385,   0, 381,   0, 378,   0, 374,
 742             };
 743             /* Gaussian interpolation using most recent 4 samples */
 744             long position = voice->position;
 745             voice->position += rate;
 746             short const* interp = voice->samples + (position >> 12);
 747             int offset = position >> 4 & 0xFF;
 748
 749             /* Only left half of gaussian kernel is in table, so we must mirror
 750                for right half */
 751             short const* fwd = gauss       + offset * 2;
 752             short const* rev = gauss + 510 - offset * 2;
 753
 754             /* Use faster gaussian interpolation when exact result isn't needed
 755                by pitch modulator of next channel */
 756             int amp_0, amp_1;
 757             if ( !(slow_gaussian & vbit) ) /* 99% of the time */
 758             {
 759                 /* Main optimization is lack of clamping. Not a problem since
 760                    output never goes more than +/- 16 outside 16-bit range and
 761                    things are clamped later anyway. Other optimization is to
 762                    preserve fractional accuracy, eliminating several masks. */
 763                 int output = (((fwd [0] * interp [0] +
 764                          fwd [1] * interp [1] +
 765                          rev [1] * interp [2] +
 766                          rev [0] * interp [3]    ) >> 11) * voice->envx) >> 11;
 767
 768                 /* duplicated here to give compiler more to run in parallel */
 769                 amp_0 = voice->volume [0] * output;
 770                 amp_1 = voice->volume [1] * output;
 771                 raw_voice->outx = output >> 8;
 772             }
 773             else
 774             {
 775                 int output = *(int16_t*) &this->noise;
 776                 if ( !(this->r.g.noise_enables & vbit) )
 777                 {
 778                     output = (fwd [0] * interp [0]) & ~0xFFF;
 779                     output = (output + fwd [1] * interp [1]) & ~0xFFF;
 780                     output = (output + rev [1] * interp [2]) >> 12;
 781                     output = (int16_t) (output * 2);
 782                     output += ((rev [0] * interp [3]) >> 12) * 2;
 783                     output = CLAMP16( output );
 784                 }
 785                 output = (output * voice->envx) >> 11 & ~1;
 786
 787                 /* duplicated here to give compiler more to run in parallel */
 788                 amp_0 = voice->volume [0] * output;
 789                 amp_1 = voice->volume [1] * output;
 790                 prev_outx = output;
 791                 raw_voice->outx = (int8_t) (output >> 8);
 792             }
 793         #else /* SPCNOINTERP */
 794         /* two-point linear interpolation */
 795         #ifdef CPU_COLDFIRE
 796             int amp_0 = (int16_t)this->noise;
 797             int amp_1;
 798
 799             if ( (this->r.g.noise_enables & vbit) == 0 )
 800             {
 801                 uint32_t f = voice->position;
 802                 int32_t y0;
 803
 804                 /**
 805                  * Formula (fastest found so far of MANY):
 806                  * output = y0 + f*y1 - f*y0
 807                  */
 808                 asm volatile (
 809                 /* separate fractional and whole parts   */
 810                 "move.l     %[f], %[y1]               \r\n"
 811                 "and.l      #0xfff, %[f]              \r\n"
 812                 "lsr.l      %[sh], %[y1]              \r\n"
 813                 /* load samples y0 (upper) & y1 (lower)  */
 814                 "move.l     2(%[s], %[y1].l*2), %[y1] \r\n"
 815                 /* %acc0 = f*y1                          */
 816                 "mac.w      %[f]l, %[y1]l, %%acc0     \r\n"
 817                 /* %acc0 -= f*y0                         */
 818                 "msac.w     %[f]l, %[y1]u, %%acc0     \r\n"
 819                 /* separate out y0 and sign extend       */
 820                 "swap       %[y1]                     \r\n"
 821                 "movea.w    %[y1], %[y0]              \r\n"
 822                 /* fetch result, scale down and add y0   */
 823                 "movclr.l   %%acc0, %[y1]             \r\n"
 824                 /* output = y0 + (result >> 12)          */
 825                 "asr.l      %[sh], %[y1]              \r\n"
 826                 "add.l      %[y0], %[y1]              \r\n"
 827                 : [f]"+d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0)
 828                 : [s]"a"(voice->samples), [sh]"d"(12)
 829                     );
 830             }
 831
 832             /* apply voice envelope to output */
 833             asm volatile (
 834             "mac.w %[output]l, %[envx]l, %%acc0 \r\n"
 835             :
 836             : [output]"r"(amp_0), [envx]"r"(voice->envx)
 837             );
 838
 839             /* advance voice position */
 840             voice->position += rate;
 841
 842             /* fetch output, scale and apply left and right
 843                voice volume */
 844             asm volatile (
 845             "movclr.l %%acc0,    %[output]         \r\n"
 846             "asr.l    %[sh],     %[output]         \r\n"
 847             "mac.l    %[vvol_0], %[output], %%acc0 \r\n"
 848             "mac.l    %[vvol_1], %[output], %%acc1 \r\n"
 849             : [output]"=&d"(amp_0)
 850             : [vvol_0]"r"((int)voice->volume[0]),
 851               [vvol_1]"r"((int)voice->volume[1]),
 852               [sh]"d"(11)
 853             );
 854
 855             /* save this output into previous, scale and save in
 856                output register */
 857             prev_outx = amp_0;
 858             raw_voice->outx = amp_0 >> 8;
 859
 860             /* fetch final voice output */
 861             asm volatile (
 862             "movclr.l %%acc0, %[amp_0] \r\n"
 863             "movclr.l %%acc1, %[amp_1] \r\n"
 864             : [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)
 865             );
 866         #elif defined (CPU_ARM)
 867             int amp_0, amp_1;
 868
 869             if ( (this->r.g.noise_enables & vbit) != 0 ) {
 870                 amp_0 = *(int16_t *)&this->noise;
 871             } else {
 872                 uint32_t f = voice->position;
 873                 amp_0 = (uint32_t)voice->samples;
 874
 875                 asm volatile(
 876                 "mov    %[y1], %[f], lsr #12        \r\n"
 877                 "eor    %[f], %[f], %[y1], lsl #12  \r\n"
 878                 "add    %[y1], %[y0], %[y1], lsl #1 \r\n"
 879                 "ldrsh  %[y0], [%[y1], #2]          \r\n"
 880                 "ldrsh  %[y1], [%[y1], #4]          \r\n"
 881                 "sub    %[y1], %[y1], %[y0]         \r\n"
 882                 "mul    %[f], %[y1], %[f]           \r\n"
 883                 "add    %[y0], %[y0], %[f], asr #12 \r\n"
 884                 : [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1)
 885                 );
 886             }
 887
 888             voice->position += rate;
 889
 890             asm volatile(
 891             "mul    %[amp_1], %[amp_0], %[envx] \r\n"
 892             "mov    %[amp_0], %[amp_1], asr #11 \r\n"
 893             "mov    %[amp_1], %[amp_0], asr #8  \r\n"
 894             : [amp_0]"+r"(amp_0), [amp_1]"=&r"(amp_1)
 895             : [envx]"r"(voice->envx)
 896             );
 897
 898             prev_outx = amp_0;
 899             raw_voice->outx = (int8_t)amp_1;
 900
 901             asm volatile(
 902             "mul    %[amp_1], %[amp_0], %[vol_1] \r\n"
 903             "mul    %[amp_0], %[vol_0], %[amp_0] \r\n"
 904             : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1)
 905             : [vol_0]"r"((int)voice->volume[0]),
 906               [vol_1]"r"((int)voice->volume[1])
 907             );
 908         #else /* Unoptimized CPU */
 909             int output;
 910
 911             if ( (this->r.g.noise_enables & vbit) == 0 )
 912             {
 913                 int const fraction = voice->position & 0xfff;
 914                 short const* const pos = (voice->samples + (voice->position >> 12)) + 1;
 915                 output = pos[0] + ((fraction * (pos[1] - pos[0])) >> 12);
 916             } else {
 917                 output = *(int16_t *)&this->noise;
 918             }
 919
 920             voice->position += rate;
 921
 922             output = (output * voice->envx) >> 11;
 923
 924             /* duplicated here to give compiler more to run in parallel */
 925             int amp_0 = voice->volume [0] * output;
 926             int amp_1 = voice->volume [1] * output;
 927
 928             prev_outx = output;
 929             raw_voice->outx = (int8_t) (output >> 8);
 930         #endif /* CPU_* */
 931         #endif /* SPCNOINTERP */
 932
 933         #if SPC_BRRCACHE
 934             if ( voice->position >= voice->wave_end )
 935             {
 936                 long loop_len = voice->wave_loop << 12;
 937                 voice->position -= loop_len;
 938                 this->r.g.wave_ended |= vbit;
 939                 if ( !loop_len )
 940                 {
 941                     this->keys_down ^= vbit;
 942                     raw_voice->envx = 0;
 943                     voice->envx = 0;
 944                 }
 945             }
 946         #endif
 947 #if 0
 948             EXIT_TIMER(dsp_gen);
 949
 950             ENTER_TIMER(dsp_mix);
 951 #endif
 952             chans_0 += amp_0;
 953             chans_1 += amp_1;
 954             #if !SPC_NOECHO
 955                 if ( this->r.g.echo_ons & vbit )
 956                 {
 957                     echo_0 += amp_0;
 958                     echo_1 += amp_1;
 959                 }
 960             #endif
 961 #if 0
 962             EXIT_TIMER(dsp_mix);
 963 #endif
 964         }
 965         /* end of voice loop */
 966
 967     #if !SPC_NOECHO
 968     #ifdef CPU_COLDFIRE
 969         /* Read feedback from echo buffer */
 970         int echo_pos = this->echo_pos;
 971         uint8_t* const echo_ptr = RAM + ((echo_start + echo_pos) & 0xFFFF);
 972         echo_pos += 4;
 973         if ( echo_pos >= echo_wrap )
 974             echo_pos = 0;
 975         this->echo_pos = echo_pos;
 976         int fb = swap_odd_even32(*(int32_t *)echo_ptr);
 977         int out_0, out_1;
 978
 979         /* Keep last 8 samples */
 980         *this->last_fir_ptr = fb;
 981         this->last_fir_ptr  = this->fir_ptr;
 982
 983         /* Apply echo FIR filter to output samples read from echo buffer -
 984            circular buffer is hardware incremented and masked; FIR
 985            coefficients and buffer history are loaded in parallel with
 986            multiply accumulate operations. Shift left by one here and once
 987            again when calculating feedback to have sample values justified
 988            to bit 31 in the output to ease endian swap, interleaving and
 989            clamping before placing result in the program's echo buffer. */
 990         int _0, _1, _2;
 991         asm volatile (
 992         "move.l                           (%[fir_c])  , %[_2]         \r\n"
 993         "mac.w      %[fb]u, %[_2]u, <<,   (%[fir_p])+&, %[_0], %%acc0 \r\n"
 994         "mac.w      %[fb]l, %[_2]u, <<,   (%[fir_p])& , %[_1], %%acc1 \r\n"
 995         "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
 996         "mac.w      %[_0]l, %[_2]l, <<,  4(%[fir_c])  , %[_2], %%acc1 \r\n"
 997         "mac.w      %[_1]u, %[_2]u, <<,  4(%[fir_p])& , %[_0], %%acc0 \r\n"
 998         "mac.w      %[_1]l, %[_2]u, <<,  8(%[fir_p])& , %[_1], %%acc1 \r\n"
 999         "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
1000         "mac.w      %[_0]l, %[_2]l, <<,  8(%[fir_c])  , %[_2], %%acc1 \r\n"
1001         "mac.w      %[_1]u, %[_2]u, <<, 12(%[fir_p])& , %[_0], %%acc0 \r\n"
1002         "mac.w      %[_1]l, %[_2]u, <<, 16(%[fir_p])& , %[_1], %%acc1 \r\n"
1003         "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
1004         "mac.w      %[_0]l, %[_2]l, <<, 12(%[fir_c])  , %[_2], %%acc1 \r\n"
1005         "mac.w      %[_1]u, %[_2]u, <<, 20(%[fir_p])& , %[_0], %%acc0 \r\n"
1006         "mac.w      %[_1]l, %[_2]u, <<                       , %%acc1 \r\n"
1007         "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
1008         "mac.w      %[_0]l, %[_2]l, <<                       , %%acc1 \r\n"
1009         : [_0]"=&r"(_0), [_1]"=&r"(_1), [_2]"=&r"(_2),
1010           [fir_p]"+a"(this->fir_ptr)
1011         : [fir_c]"a"(this->fir_coeff), [fb]"r"(fb)
1012         );
1013
1014         /* Generate output */
1015         asm volatile (
1016         /* fetch filter results _after_ gcc loads asm
1017            block parameters to eliminate emac stalls   */
1018         "movclr.l   %%acc0, %[out_0]                \r\n"
1019         "movclr.l   %%acc1, %[out_1]                \r\n"
1020         /* apply global volume                         */
1021         "mac.l      %[chans_0], %[gv_0]    , %%acc2 \r\n"
1022         "mac.l      %[chans_1], %[gv_1]    , %%acc3 \r\n"
1023         /* apply echo volume and add to final output   */
1024         "mac.l      %[ev_0],   %[out_0], >>, %%acc2 \r\n"
1025         "mac.l      %[ev_1],   %[out_1], >>, %%acc3 \r\n"
1026         : [out_0]"=&r"(out_0), [out_1]"=&r"(out_1)
1027         : [chans_0]"r"(chans_0), [gv_0]"r"(global_vol_0),
1028           [ev_0]"r"((int)this->r.g.echo_volume_0),
1029           [chans_1]"r"(chans_1), [gv_1]"r"(global_vol_1),
1030           [ev_1]"r"((int)this->r.g.echo_volume_1)
1031         );
1032
1033         /* Feedback into echo buffer */
1034         if ( !(this->r.g.flags & 0x20) )
1035         {
1036             asm volatile (
1037             /* scale echo voices; saturate if overflow */
1038             "mac.l      %[sh], %[e1]       , %%acc1 \r\n"
1039             "mac.l      %[sh], %[e0]       , %%acc0 \r\n"
1040             /* add scaled output from FIR filter       */
1041             "mac.l      %[out_1], %[ef], <<, %%acc1 \r\n"
1042             "mac.l      %[out_0], %[ef], <<, %%acc0 \r\n"
1043             /* swap and fetch feedback results - simply
1044                swap_odd_even32 mixed in between macs and
1045                movclrs to mitigate stall issues        */
1046             "move.l     #0x00ff00ff, %[sh]          \r\n"
1047             "movclr.l   %%acc1, %[e1]               \r\n"
1048             "swap       %[e1]                       \r\n"
1049             "movclr.l   %%acc0, %[e0]               \r\n"
1050             "move.w     %[e1], %[e0]                \r\n"
1051             "and.l      %[e0], %[sh]                \r\n"
1052             "eor.l      %[sh], %[e0]                \r\n"
1053             "lsl.l      #8, %[sh]                   \r\n"
1054             "lsr.l      #8, %[e0]                   \r\n"
1055             "or.l       %[sh], %[e0]                \r\n"
1056             /* save final feedback into echo buffer    */
1057             "move.l     %[e0], (%[echo_ptr])        \r\n"
1058             : [e0]"+d"(echo_0), [e1]"+d"(echo_1)
1059             : [out_0]"r"(out_0), [out_1]"r"(out_1),
1060               [ef]"r"((int)this->r.g.echo_feedback),
1061               [echo_ptr]"a"((int32_t *)echo_ptr),
1062               [sh]"d"(1 << 9)
1063             );
1064         }
1065
1066         /* Output final samples */
1067         asm volatile (
1068         /* fetch output saved in %acc2 and %acc3 */
1069         "movclr.l   %%acc2, %[out_0] \r\n"
1070         "movclr.l   %%acc3, %[out_1] \r\n"
1071         /* scale right by global_muting shift    */
1072         "asr.l      %[gm],  %[out_0] \r\n"
1073         "asr.l      %[gm],  %[out_1] \r\n"
1074         : [out_0]"=&d"(out_0), [out_1]"=&d"(out_1)
1075         : [gm]"d"(global_muting)
1076         );
1077
1078         out_buf [             0] = out_0;
1079         out_buf [WAV_CHUNK_SIZE] = out_1;
1080         out_buf ++;
1081     #elif defined (CPU_ARM)
1082         /* Read feedback from echo buffer */
1083         int echo_pos = this->echo_pos;
1084         uint8_t* const echo_ptr = RAM +
1085                 ((this->r.g.echo_page * 0x100 + echo_pos) & 0xFFFF);
1086         echo_pos += 4;
1087         if ( echo_pos >= (this->r.g.echo_delay & 15) * 0x800 )
1088             echo_pos = 0;
1089         this->echo_pos = echo_pos;
1090
1091         int fb_0 = GET_LE16SA( echo_ptr     );
1092         int fb_1 = GET_LE16SA( echo_ptr + 2 );
1093
1094         /* Keep last 8 samples */
1095         int32_t *fir_ptr = this->fir_ptr;
1096
1097         /* Apply FIR */
1098         asm volatile (
1099         "str    %[fb_0], [%[fir_p]], #4  \r\n"
1100         "str    %[fb_1], [%[fir_p]], #4  \r\n"
1101         /* duplicate at +8 eliminates wrap checking below */
1102         "str    %[fb_0], [%[fir_p], #56] \r\n"
1103         "str    %[fb_1], [%[fir_p], #60] \r\n"
1104         : [fir_p]"+r"(fir_ptr)
1105         : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1)
1106         );
1107
1108         this->fir_ptr = (int32_t *)((intptr_t)fir_ptr & FIR_BUF_MASK);
1109         int32_t *fir_coeff = this->fir_coeff;
1110
1111         asm volatile (
1112         "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
1113         "ldmia  %[fir_p]!, { r4-r5 }     \r\n"
1114         "mul    %[fb_0],     r0, %[fb_0] \r\n"
1115         "mul    %[fb_1],     r0, %[fb_1] \r\n"
1116         "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
1117         "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
1118         "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
1119         "ldmia  %[fir_p]!, { r2-r5 }     \r\n"
1120         "mla    %[fb_0], r2, r0, %[fb_0] \r\n"
1121         "mla    %[fb_1], r3, r0, %[fb_1] \r\n"
1122         "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
1123         "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
1124         "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
1125         "ldmia  %[fir_p]!, { r2-r5 }     \r\n"
1126         "mla    %[fb_0], r2, r0, %[fb_0] \r\n"
1127         "mla    %[fb_1], r3, r0, %[fb_1] \r\n"
1128         "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
1129         "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
1130         "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
1131         "ldmia  %[fir_p]!, { r2-r5 }     \r\n"
1132         "mla    %[fb_0], r2, r0, %[fb_0] \r\n"
1133         "mla    %[fb_1], r3, r0, %[fb_1] \r\n"
1134         "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
1135         "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
1136         : [fb_0]"+r"(fb_0), [fb_1]"+r"(fb_1),
1137           [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff)
1138         :
1139         : "r0", "r1", "r2", "r3", "r4", "r5"
1140         );
1141
1142         /* Generate output */
1143         int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0)
1144                     >> global_muting;
1145         int amp_1 = (chans_1 * global_vol_1 + fb_1 * this->r.g.echo_volume_1)
1146                     >> global_muting;
1147
1148         out_buf [             0] = amp_0;
1149         out_buf [WAV_CHUNK_SIZE] = amp_1;
1150         out_buf ++;
1151
1152         if ( !(this->r.g.flags & 0x20) )
1153         {
1154             /* Feedback into echo buffer */
1155             int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
1156             int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
1157             e0 = CLAMP16( e0 );
1158             SET_LE16A( echo_ptr    , e0 );
1159             e1 = CLAMP16( e1 );
1160             SET_LE16A( echo_ptr + 2, e1 );
1161         }
1162     #else /* Unoptimized CPU */
1163         /* Read feedback from echo buffer */
1164         int echo_pos = this->echo_pos;
1165         uint8_t* const echo_ptr = RAM +
1166                 ((this->r.g.echo_page * 0x100 + echo_pos) & 0xFFFF);
1167         echo_pos += 4;
1168         if ( echo_pos >= (this->r.g.echo_delay & 15) * 0x800 )
1169             echo_pos = 0;
1170         this->echo_pos = echo_pos;
1171         int fb_0 = GET_LE16SA( echo_ptr     );
1172         int fb_1 = GET_LE16SA( echo_ptr + 2 );
1173
1174         /* Keep last 8 samples */
1175         int (* const fir_ptr) [2] = this->fir_buf + this->fir_pos;
1176         this->fir_pos = (this->fir_pos + 1) & (FIR_BUF_HALF - 1);
1177         fir_ptr [           0] [0] = fb_0;
1178         fir_ptr [           0] [1] = fb_1;
1179         /* duplicate at +8 eliminates wrap checking below */
1180         fir_ptr [FIR_BUF_HALF] [0] = fb_0;
1181         fir_ptr [FIR_BUF_HALF] [1] = fb_1;
1182
1183         /* Apply FIR */
1184         fb_0 *= this->fir_coeff [0];
1185         fb_1 *= this->fir_coeff [0];
1186
1187         #define DO_PT( i )\
1188             fb_0 += fir_ptr [i] [0] * this->fir_coeff [i];\
1189             fb_1 += fir_ptr [i] [1] * this->fir_coeff [i];
1190
1191         DO_PT( 1 )
1192         DO_PT( 2 )
1193         DO_PT( 3 )
1194         DO_PT( 4 )
1195         DO_PT( 5 )
1196         DO_PT( 6 )
1197         DO_PT( 7 )
1198
1199         /* Generate output */
1200         int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0)
1201                     >> global_muting;
1202         int amp_1 = (chans_1 * global_vol_1 + fb_1 * this->r.g.echo_volume_1)
1203                     >> global_muting;
1204         out_buf [             0] = amp_0;
1205         out_buf [WAV_CHUNK_SIZE] = amp_1;
1206         out_buf ++;
1207
1208         if ( !(this->r.g.flags & 0x20) )
1209         {
1210             /* Feedback into echo buffer */
1211             int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
1212             int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
1213             e0 = CLAMP16( e0 );
1214             SET_LE16A( echo_ptr    , e0 );
1215             e1 = CLAMP16( e1 );
1216             SET_LE16A( echo_ptr + 2, e1 );
1217         }
1218     #endif /* CPU_* */
1219     #else /* SPCNOECHO == 1*/
1220         /* Generate output  */
1221         int amp_0 = (chans_0 * global_vol_0) >> global_muting;
1222         int amp_1 = (chans_1 * global_vol_1) >> global_muting;
1223         out_buf [             0] = amp_0;
1224         out_buf [WAV_CHUNK_SIZE] = amp_1;
1225         out_buf ++;
1226     #endif /* SPCNOECHO */
1227     }
1228     while ( --count );
1229 #if 0
1230     EXIT_TIMER(dsp);
1231     ENTER_TIMER(cpu);
1232 #endif
1233 }
1234
1235 void DSP_reset( struct Spc_Dsp* this )
1236 {
1237     this->keys_down   = 0;
1238     this->echo_pos    = 0;
1239     this->noise_count = 0;
1240     this->noise       = 2;
1241
1242     this->r.g.flags   = 0xE0; /* reset, mute, echo off */
1243     this->r.g.key_ons = 0;
1244
1245     ci->memset( this->voice_state, 0, sizeof this->voice_state );
1246
1247     int i;
1248     for ( i = VOICE_COUNT; --i >= 0; )
1249     {
1250         struct voice_t* v = this->voice_state + i;
1251         v->env_mode = state_release;
1252         v->addr     = ram.ram;
1253     }
1254
1255     #if SPC_BRRCACHE
1256         this->oldsize = 0;
1257         for ( i = 0; i < 256; i++ )
1258             this->wave_entry [i].start_addr = -1;
1259     #endif
1260
1261 #if defined(CPU_COLDFIRE)
1262     this->fir_ptr = fir_buf;
1263     this->last_fir_ptr = &fir_buf [7];
1264     ci->memset( fir_buf, 0, sizeof fir_buf );
1265 #elif defined (CPU_ARM)
1266     this->fir_ptr = fir_buf;
1267     ci->memset( fir_buf, 0, sizeof fir_buf );
1268 #else
1269     this->fir_pos = 0;
1270     ci->memset( this->fir_buf, 0, sizeof this->fir_buf );
1271 #endif
1272
1273     assert( offsetof (struct globals_t,unused9 [2]) == REGISTER_COUNT );
1274     assert( sizeof (this->r.voice) == REGISTER_COUNT );
1275 }