apps/codecs/libspc/spc_dsp.c

   1 /***************************************************************************
   2  *             __________               __   ___.
   3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7  *                     \/            \/     \/    \/            \/
   8  * $Id$
   9  *
  10  * Copyright (C) 2007-2008 Michael Sevakis (jhMikeS)
  11  * Copyright (C) 2006-2007 Adam Gashlin (hcs)
  12  * Copyright (C) 2004-2007 Shay Green (blargg)
  13  * Copyright (C) 2002 Brad Martin
  14  *
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  21  * KIND, either express or implied.
  22  *
  23  ****************************************************************************/
  24
  25 /* The DSP portion (awe!) */
  26 #include "codeclib.h"
  27 #include "spc_codec.h"
  28 #include "spc_profiler.h"
  29
#if defined(CPU_COLDFIRE) || defined (CPU_ARM)
/* FIR working buffer of FIR_BUF_CNT 32-bit entries; only built for Coldfire
   and ARM targets. It is aligned to FIR_BUF_ALIGN bytes and placed with
   IBSS_ATTR.
   NOTE(review): the alignment presumably lets the buffer index be wrapped
   with an address mask (cf. FIR_BUF_MASK) - confirm against spc_codec.h. */
int32_t fir_buf[FIR_BUF_CNT]
    __attribute__ ((aligned (FIR_BUF_ALIGN*1))) IBSS_ATTR;
#endif
#if SPC_BRRCACHE
/* Storage for pre-decoded BRR waveforms (16-bit samples); decode_brr()
   writes each waveform at BRRcache + start_addr * 2.
   BRR_CACHE_SIZE includes a little extra for samples that go past end */
int16_t BRRcache [BRR_CACHE_SIZE] CACHEALIGN_ATTR;
#endif
  38
  39 void DSP_write( struct Spc_Dsp* this, int i, int data )
  40 {
  41     assert( (unsigned) i < REGISTER_COUNT );
  42
  43     this->r.reg [i] = data;
  44     int high = i >> 4;
  45     int low  = i & 0x0F;
  46     if ( low < 2 ) /* voice volumes */
  47     {
  48         int left  = *(int8_t const*) &this->r.reg [i & ~1];
  49         int right = *(int8_t const*) &this->r.reg [i |  1];
  50         struct voice_t* v = this->voice_state + high;
  51         v->volume [0] = left;
  52         v->volume [1] = right;
  53     }
  54     else if ( low == 0x0F ) /* fir coefficients */
  55     {
  56         this->fir_coeff [7 - high] = (int8_t) data; /* sign-extend */
  57     }
  58 }
  59
  60 /* if ( n < -32768 ) out = -32768; */
  61 /* if ( n >  32767 ) out =  32767; */
  62 #define CLAMP16( n ) \
  63 ({                              \
  64     if ( (int16_t) n != n )     \
  65         n = 0x7FFF ^ (n >> 31); \
  66     n;                          \
  67 })
  68
  69 #if SPC_BRRCACHE
  70 static void decode_brr( struct Spc_Dsp* this, unsigned start_addr,
  71                         struct voice_t* voice,
  72                         struct raw_voice_t const* const raw_voice ) ICODE_ATTR;
  73 static void decode_brr( struct Spc_Dsp* this, unsigned start_addr,
  74                         struct voice_t* voice,
  75                         struct raw_voice_t const* const raw_voice )
  76 {
  77     /* setup same variables as where decode_brr() is called from */
  78     #undef RAM
  79     #define RAM ram.ram
  80
  81     struct src_dir const* const sd =
  82         &ram.sd[this->r.g.wave_page * 0x100/sizeof(struct src_dir)];
  83     struct cache_entry_t* const wave_entry =
  84         &this->wave_entry [raw_voice->waveform];
  85
  86    /* the following block can be put in place of the call to
  87        decode_brr() below
  88     */
  89     {
  90         DEBUGF( "decode at %08x (wave #%d)\n",
  91                 start_addr, raw_voice->waveform );
  92
  93         /* see if in cache */
  94         int i;
  95         for ( i = 0; i < this->oldsize; i++ )
  96         {
  97             struct cache_entry_t* e = &this->wave_entry_old [i];
  98             if ( e->start_addr == start_addr )
  99             {
 100                 DEBUGF( "found in wave_entry_old (oldsize=%d)\n",
 101                     this->oldsize );
 102                 *wave_entry = *e;
 103                 goto wave_in_cache;
 104             }
 105         }
 106
 107         wave_entry->start_addr = start_addr;
 108
 109         uint8_t const* const loop_ptr =
 110             RAM + letoh16(sd[raw_voice->waveform].loop);
 111         short* loop_start = 0;
 112
 113         short* out = BRRcache + start_addr * 2;
 114         wave_entry->samples = out;
 115         *out++ = 0;
 116         int smp1 = 0;
 117         int smp2 = 0;
 118
 119         uint8_t const* addr = RAM + start_addr;
 120         int block_header;
 121         do
 122         {
 123             if ( addr == loop_ptr )
 124             {
 125                 loop_start = out;
 126                 DEBUGF( "loop at %08lx (wave #%d)\n",
 127                         (unsigned long)(addr - RAM), raw_voice->waveform );
 128             }
 129
 130             /* header */
 131             block_header = *addr;
 132             addr += 9;
 133             voice->addr = addr;
 134             int const filter = (block_header & 0x0C) - 0x08;
 135
 136             /* scaling
 137                (invalid scaling gives -4096 for neg nybble, 0 for pos) */
 138             static unsigned char const right_shifts [16] = {
 139                 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  4,  4, 29, 29, 29,
 140             };
 141             static unsigned char const left_shifts  [16] = {
 142                 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11
 143             };
 144             int const scale = block_header >> 4;
 145             int const right_shift = right_shifts [scale];
 146             int const left_shift  = left_shifts  [scale];
 147
 148             /* output position */
 149             out += BRR_BLOCK_SIZE;
 150             int offset = -BRR_BLOCK_SIZE << 2;
 151
 152             do /* decode and filter 16 samples */
 153             {
 154                 /* Get nybble, sign-extend, then scale
 155                    get byte, select which nybble, sign-extend, then shift based
 156                    on scaling. also handles invalid scaling values. */
 157                 int delta = (int) (int8_t) (addr [offset >> 3] << (offset & 4))
 158                         >> right_shift << left_shift;
 159
 160                 out [offset >> 2] = smp2;
 161
 162                 if ( filter == 0 ) /* mode 0x08 (30-90% of the time) */
 163                 {
 164                     delta -= smp2 >> 1;
 165                     delta += smp2 >> 5;
 166                     smp2 = smp1;
 167                     delta += smp1;
 168                     delta += (-smp1 - (smp1 >> 1)) >> 5;
 169                 }
 170                 else
 171                 {
 172                     if ( filter == -4 ) /* mode 0x04 */
 173                     {
 174                         delta += smp1 >> 1;
 175                         delta += (-smp1) >> 5;
 176                     }
 177                     else if ( filter > -4 ) /* mode 0x0C */
 178                     {
 179                         delta -= smp2 >> 1;
 180                         delta += (smp2 + (smp2 >> 1)) >> 4;
 181                         delta += smp1;
 182                         delta += (-smp1 * 13) >> 7;
 183                     }
 184                     smp2 = smp1;
 185                 }
 186
 187                 delta = CLAMP16( delta );
 188                 smp1 = (int16_t) (delta * 2); /* sign-extend */
 189             }
 190             while ( (offset += 4) != 0 );
 191
 192             /* if next block has end flag set, this block ends early */
 193             /* (verified) */
 194             if ( (block_header & 3) != 3 && (*addr & 3) == 1 )
 195             {
 196                 /* skip last 9 samples */
 197                 out -= 9;
 198                 goto early_end;
 199             }
 200         }
 201         while ( !(block_header & 1) && addr < RAM + 0x10000 );
 202
 203         out [0] = smp2;
 204         out [1] = smp1;
 205
 206     early_end:
 207         wave_entry->end = (out - 1 - wave_entry->samples) << 12;
 208
 209         wave_entry->loop = 0;
 210         if ( (block_header & 2) )
 211         {
 212             if ( loop_start )
 213             {
 214                 int loop = out - loop_start;
 215                 wave_entry->loop = loop;
 216                 wave_entry->end += 0x3000;
 217                 out [2] = loop_start [2];
 218                 out [3] = loop_start [3];
 219                 out [4] = loop_start [4];
 220             }
 221             else
 222             {
 223                 DEBUGF( "loop point outside initial wave\n" );
 224             }
 225         }
 226
 227         DEBUGF( "end at %08lx (wave #%d)\n",
 228                 (unsigned long)(addr - RAM), raw_voice->waveform );
 229
 230         /* add to cache */
 231         this->wave_entry_old [this->oldsize++] = *wave_entry;
 232 wave_in_cache:;
 233     }
 234 }
 235 #endif
 236
 237 static void key_on(struct Spc_Dsp* const this, struct voice_t* const voice,
 238                    struct src_dir const* const sd,
 239                    struct raw_voice_t const* const raw_voice,
 240                    const int key_on_delay, const int vbit) ICODE_ATTR;
 241 static void key_on(struct Spc_Dsp* const this, struct voice_t* const voice,
 242                    struct src_dir const* const sd,
 243                    struct raw_voice_t const* const raw_voice,
 244                    const int key_on_delay, const int vbit) {
 245     #undef RAM
 246     #define RAM ram.ram
 247     int const env_rate_init = 0x7800;
 248     voice->key_on_delay = key_on_delay;
 249     if ( key_on_delay == 0 )
 250     {
 251         this->keys_down |= vbit;
 252         voice->envx         = 0;
 253         voice->env_mode     = state_attack;
 254         voice->env_timer    = env_rate_init; /* TODO: inaccurate? */
 255         unsigned start_addr = letoh16(sd[raw_voice->waveform].start);
 256         #if !SPC_BRRCACHE
 257         {
 258             voice->addr = RAM + start_addr;
 259             /* BRR filter uses previous samples */
 260             voice->samples [BRR_BLOCK_SIZE + 1] = 0;
 261             voice->samples [BRR_BLOCK_SIZE + 2] = 0;
 262             /* decode three samples immediately */
 263             voice->position     = (BRR_BLOCK_SIZE + 3) * 0x1000 - 1;
 264             voice->block_header = 0; /* "previous" BRR header */
 265         }
 266         #else
 267         {
 268             voice->position = 3 * 0x1000 - 1;
 269             struct cache_entry_t* const wave_entry =
 270                 &this->wave_entry [raw_voice->waveform];
 271
 272             /* predecode BRR if not already */
 273             if ( wave_entry->start_addr != start_addr )
 274             {
 275                 /* the following line can be replaced by the indicated block
 276                    in decode_brr() */
 277                 decode_brr( this, start_addr, voice, raw_voice );
 278             }
 279
 280             voice->samples   = wave_entry->samples;
 281             voice->wave_end  = wave_entry->end;
 282                     voice->wave_loop = wave_entry->loop;
 283         }
 284         #endif
 285     }
 286 }
 287
 288 void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
 289 {
 290     #undef RAM
 291 #if defined(CPU_ARM) && !SPC_BRRCACHE
 292     uint8_t* const ram_ = ram.ram;
 293     #define RAM ram_
 294 #else
 295     #define RAM ram.ram
 296 #endif
 297 #if 0
 298     EXIT_TIMER(cpu);
 299     ENTER_TIMER(dsp);
 300 #endif
 301
 302     /* Here we check for keys on/off.  Docs say that successive writes
 303        to KON/KOF must be separated by at least 2 Ts periods or risk
 304        being neglected.  Therefore DSP only looks at these during an
 305        update, and not at the time of the write.  Only need to do this
 306        once however, since the regs haven't changed over the whole
 307        period we need to catch up with. */
 308
 309     {
 310         int key_ons  = this->r.g.key_ons;
 311         int key_offs = this->r.g.key_offs;
 312         /* keying on a voice resets that bit in ENDX */
 313         this->r.g.wave_ended &= ~key_ons;
 314         /* key_off bits prevent key_on from being acknowledged */
 315         this->r.g.key_ons = key_ons & key_offs;
 316
 317         /* process key events outside loop, since they won't re-occur */
 318         struct voice_t* voice = this->voice_state + 8;
 319         int vbit = 0x80;
 320         do
 321         {
 322             --voice;
 323             if ( key_offs & vbit )
 324             {
 325                 voice->env_mode     = state_release;
 326                 voice->key_on_delay = 0;
 327             }
 328             else if ( key_ons & vbit )
 329             {
 330                 voice->key_on_delay = 8;
 331             }
 332         }
 333         while ( (vbit >>= 1) != 0 );
 334     }
 335
 336     struct src_dir const* const sd =
 337         &ram.sd[this->r.g.wave_page * 0x100/sizeof(struct src_dir)];
 338
 339     #ifdef ROCKBOX_BIG_ENDIAN
 340         /* Convert endiannesses before entering loops - these
 341            get used alot */
 342         const uint32_t rates[VOICE_COUNT] =
 343         {
 344             GET_LE16A( this->r.voice[0].rate ) & 0x3FFF,
 345             GET_LE16A( this->r.voice[1].rate ) & 0x3FFF,
 346             GET_LE16A( this->r.voice[2].rate ) & 0x3FFF,
 347             GET_LE16A( this->r.voice[3].rate ) & 0x3FFF,
 348             GET_LE16A( this->r.voice[4].rate ) & 0x3FFF,
 349             GET_LE16A( this->r.voice[5].rate ) & 0x3FFF,
 350             GET_LE16A( this->r.voice[6].rate ) & 0x3FFF,
 351             GET_LE16A( this->r.voice[7].rate ) & 0x3FFF,
 352         };
 353         #define VOICE_RATE(x) *(x)
 354         #define IF_RBE(...) __VA_ARGS__
 355     #ifdef CPU_COLDFIRE
 356         /* Initialize mask register with the buffer address mask */
 357         asm volatile ("move.l %[m], %%mask" : : [m]"i"(FIR_BUF_MASK));
 358         const int echo_wrap  = (this->r.g.echo_delay & 15) * 0x800;
 359         const int echo_start = this->r.g.echo_page * 0x100;
 360     #endif /* CPU_COLDFIRE */
 361     #else
 362         #define VOICE_RATE(x) (GET_LE16(raw_voice->rate) & 0x3FFF)
 363         #define IF_RBE(...)
 364     #endif /* ROCKBOX_BIG_ENDIAN */
 365
 366 #if !SPC_NOINTERP
 367     int const slow_gaussian = (this->r.g.pitch_mods >> 1) |
 368         this->r.g.noise_enables;
 369 #endif
 370     /* (g.flags & 0x40) ? 30 : 14 */
 371     int const global_muting = ((this->r.g.flags & 0x40) >> 2) + 14 - 8;
 372     int const global_vol_0  = this->r.g.volume_0;
 373     int const global_vol_1  = this->r.g.volume_1;
 374
 375     /* each rate divides exactly into 0x7800 without remainder */
 376     int const env_rate_init = 0x7800;
 377     static unsigned short const env_rates [0x20] ICONST_ATTR =
 378     {
 379         0x0000, 0x000F, 0x0014, 0x0018, 0x001E, 0x0028, 0x0030, 0x003C,
 380         0x0050, 0x0060, 0x0078, 0x00A0, 0x00C0, 0x00F0, 0x0140, 0x0180,
 381         0x01E0, 0x0280, 0x0300, 0x03C0, 0x0500, 0x0600, 0x0780, 0x0A00,
 382         0x0C00, 0x0F00, 0x1400, 0x1800, 0x1E00, 0x2800, 0x3C00, 0x7800
 383     };
 384
 385     do /* one pair of output samples per iteration */
 386     {
 387         /* Noise */
 388         if ( this->r.g.noise_enables )
 389         {
 390             if ( (this->noise_count -=
 391                  env_rates [this->r.g.flags & 0x1F]) <= 0 )
 392             {
 393                 this->noise_count = env_rate_init;
 394                 int feedback = (this->noise << 13) ^ (this->noise << 14);
 395                 this->noise = (feedback & 0x8000) ^ (this->noise >> 1 & ~1);
 396             }
 397         }
 398
 399 #if !SPC_NOECHO
 400         int echo_0 = 0;
 401         int echo_1 = 0;
 402 #endif
 403         long prev_outx = 0; /* TODO: correct value for first channel? */
 404         int chans_0 = 0;
 405         int chans_1 = 0;
 406         /* TODO: put raw_voice pointer in voice_t? */
 407         struct raw_voice_t * raw_voice = this->r.voice;
 408         struct voice_t* voice = this->voice_state;
 409         int vbit = 1;
 410         IF_RBE( const uint32_t* vr = rates; )
 411         for ( ; vbit < 0x100; vbit <<= 1, ++voice, ++raw_voice IF_RBE( , ++vr ) )
 412         {
 413             /* pregen involves checking keyon, etc */
 414 #if 0
 415             ENTER_TIMER(dsp_pregen);
 416 #endif
 417
 418             /* Key on events are delayed */
 419             int key_on_delay = voice->key_on_delay;
 420
 421             if ( --key_on_delay >= 0 ) /* <1% of the time */
 422             {
 423                 key_on(this,voice,sd,raw_voice,key_on_delay,vbit);
 424             }
 425
 426             if ( !(this->keys_down & vbit) ) /* Silent channel */
 427             {
 428         silent_chan:
 429                 raw_voice->envx = 0;
 430                 raw_voice->outx = 0;
 431                 prev_outx = 0;
 432                 continue;
 433             }
 434
 435             /* Envelope */
 436             {
 437                 int const ENV_RANGE = 0x800;
 438                 int env_mode = voice->env_mode;
 439                 int adsr0 = raw_voice->adsr [0];
 440                 int env_timer;
 441                 if ( env_mode != state_release ) /* 99% of the time */
 442                 {
 443                     env_timer = voice->env_timer;
 444                     if ( adsr0 & 0x80 ) /* 79% of the time */
 445                     {
 446                         int adsr1 = raw_voice->adsr [1];
 447                         if ( env_mode == state_sustain ) /* 74% of the time */
 448                         {
 449                             if ( (env_timer -= env_rates [adsr1 & 0x1F]) > 0 )
 450                                 goto write_env_timer;
 451
 452                             int envx = voice->envx;
 453                             envx--; /* envx *= 255 / 256 */
 454                             envx -= envx >> 8;
 455                             voice->envx = envx;
 456                             /* TODO: should this be 8? */
 457                             raw_voice->envx = envx >> 4;
 458                             goto init_env_timer;
 459                         }
 460                         else if ( env_mode < 0 ) /* 25% state_decay */
 461                         {
 462                             int envx = voice->envx;
 463                             if ( (env_timer -=
 464                                 env_rates [(adsr0 >> 3 & 0x0E) + 0x10]) <= 0 )
 465                             {
 466                                 envx--; /* envx *= 255 / 256 */
 467                                 envx -= envx >> 8;
 468                                 voice->envx = envx;
 469                                 /* TODO: should this be 8? */
 470                                 raw_voice->envx = envx >> 4;
 471                                 env_timer = env_rate_init;
 472                             }
 473
 474                             int sustain_level = adsr1 >> 5;
 475                             if ( envx <= (sustain_level + 1) * 0x100 )
 476                                 voice->env_mode = state_sustain;
 477
 478                             goto write_env_timer;
 479                         }
 480                         else /* state_attack */
 481                         {
 482                             int t = adsr0 & 0x0F;
 483                             if ( (env_timer -= env_rates [t * 2 + 1]) > 0 )
 484                                 goto write_env_timer;
 485
 486                             int envx = voice->envx;
 487
 488                             int const step = ENV_RANGE / 64;
 489                             envx += step;
 490                             if ( t == 15 )
 491                                 envx += ENV_RANGE / 2 - step;
 492
 493                             if ( envx >= ENV_RANGE )
 494                             {
 495                                 envx = ENV_RANGE - 1;
 496                                 voice->env_mode = state_decay;
 497                             }
 498                             voice->envx = envx;
 499                             /* TODO: should this be 8? */
 500                             raw_voice->envx = envx >> 4;
 501                             goto init_env_timer;
 502                         }
 503                     }
 504                     else /* gain mode */
 505                     {
 506                         int t = raw_voice->gain;
 507                         if ( t < 0x80 )
 508                         {
 509                             raw_voice->envx = t;
 510                             voice->envx = t << 4;
 511                             goto env_end;
 512                         }
 513                         else
 514                         {
 515                             if ( (env_timer -= env_rates [t & 0x1F]) > 0 )
 516                                 goto write_env_timer;
 517
 518                             int envx = voice->envx;
 519                             int mode = t >> 5;
 520                             if ( mode <= 5 ) /* decay */
 521                             {
 522                                 int step = ENV_RANGE / 64;
 523                                 if ( mode == 5 ) /* exponential */
 524                                 {
 525                                     envx--; /* envx *= 255 / 256 */
 526                                     step = envx >> 8;
 527                                 }
 528                                 if ( (envx -= step) < 0 )
 529                                 {
 530                                     envx = 0;
 531                                     if ( voice->env_mode == state_attack )
 532                                         voice->env_mode = state_decay;
 533                                 }
 534                             }
 535                             else /* attack */
 536                             {
 537                                 int const step = ENV_RANGE / 64;
 538                                 envx += step;
 539                                 if ( mode == 7 &&
 540                                      envx >= ENV_RANGE * 3 / 4 + step )
 541                                     envx += ENV_RANGE / 256 - step;
 542
 543                                 if ( envx >= ENV_RANGE )
 544                                     envx = ENV_RANGE - 1;
 545                             }
 546                             voice->envx = envx;
 547                             /* TODO: should this be 8? */
 548                             raw_voice->envx = envx >> 4;
 549                             goto init_env_timer;
 550                         }
 551                     }
 552                 }
 553                 else /* state_release */
 554                 {
 555                     int envx = voice->envx;
 556                     if ( (envx -= ENV_RANGE / 256) > 0 )
 557                     {
 558                         voice->envx = envx;
 559                         raw_voice->envx = envx >> 8;
 560                         goto env_end;
 561                     }
 562                     else
 563                     {
 564                         /* bit was set, so this clears it */
 565                         this->keys_down ^= vbit;
 566                         voice->envx = 0;
 567                         goto silent_chan;
 568                     }
 569                 }
 570             init_env_timer:
 571                 env_timer = env_rate_init;
 572             write_env_timer:
 573                 voice->env_timer = env_timer;
 574             env_end:;
 575             }
 576 #if 0
 577             EXIT_TIMER(dsp_pregen);
 578
 579             ENTER_TIMER(dsp_gen);
 580 #endif
 581             #if !SPC_BRRCACHE
 582             /* Decode BRR block */
 583             if ( voice->position >= BRR_BLOCK_SIZE * 0x1000 )
 584             {
 585                 voice->position -= BRR_BLOCK_SIZE * 0x1000;
 586
 587                 uint8_t const* addr = voice->addr;
 588                 if ( addr >= RAM + 0x10000 )
 589                     addr -= 0x10000;
 590
 591                 /* action based on previous block's header */
 592                 if ( voice->block_header & 1 )
 593                 {
 594                     addr = RAM + letoh16(sd[raw_voice->waveform].loop);
 595                     this->r.g.wave_ended |= vbit;
 596                     if ( !(voice->block_header & 2) ) /* 1% of the time */
 597                     {
 598                         /* first block was end block;
 599                            don't play anything (verified) */
 600                         /* bit was set, so this clears it */
 601                         this->keys_down ^= vbit;
 602
 603                         /* since voice->envx is 0,
 604                            samples and position don't matter */
 605                         raw_voice->envx = 0;
 606                         voice->envx = 0;
 607                         goto skip_decode;
 608                     }
 609                 }
 610
 611                 /* header */
 612                 int const block_header = *addr;
 613                 addr += 9;
 614                 voice->addr = addr;
 615                 voice->block_header = block_header;
 616                 int const filter = (block_header & 0x0C) - 0x08;
 617
 618                 /* scaling (invalid scaling gives -4096 for neg nybble,
 619                    0 for pos) */
 620                 static unsigned char const right_shifts [16] = {
 621                     5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  4,  4, 29, 29, 29,
 622                 };
 623                 static unsigned char const left_shifts  [16] = {
 624                     0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11
 625                 };
 626                 int const scale = block_header >> 4;
 627                 int const right_shift = right_shifts [scale];
 628                 int const left_shift  = left_shifts  [scale];
 629
 630                 /* previous samples */
 631                 int smp2 = voice->samples [BRR_BLOCK_SIZE + 1];
 632                 int smp1 = voice->samples [BRR_BLOCK_SIZE + 2];
 633                 voice->samples [0] = voice->samples [BRR_BLOCK_SIZE];
 634
 635                 /* output position */
 636                 short* out = voice->samples + (1 + BRR_BLOCK_SIZE);
 637                 int offset = -BRR_BLOCK_SIZE << 2;
 638
 639                 /* if next block has end flag set,
 640                    this block ends early (verified) */
 641                 if ( (block_header & 3) != 3 && (*addr & 3) == 1 )
 642                 {
 643                     /* arrange for last 9 samples to be skipped */
 644                     int const skip = 9;
 645                     out += (skip & 1);
 646                     voice->samples [skip] = voice->samples [BRR_BLOCK_SIZE];
 647                     voice->position += skip * 0x1000;
 648                     offset = (-BRR_BLOCK_SIZE + (skip & ~1)) << 2;
 649                     addr -= skip / 2;
 650                     /* force sample to end on next decode */
 651                     voice->block_header = 1;
 652                 }
 653
 654                 do /* decode and filter 16 samples */
 655                 {
 656                     /* Get nybble, sign-extend, then scale
 657                        get byte, select which nybble, sign-extend, then shift
 658                        based on scaling. also handles invalid scaling values.*/
 659                     int delta = (int) (int8_t) (addr [offset >> 3] <<
 660                             (offset & 4)) >> right_shift << left_shift;
 661
 662                     out [offset >> 2] = smp2;
 663
 664                     if ( filter == 0 ) /* mode 0x08 (30-90% of the time) */
 665                     {
 666                         delta -= smp2 >> 1;
 667                         delta += smp2 >> 5;
 668                         smp2 = smp1;
 669                         delta += smp1;
 670                         delta += (-smp1 - (smp1 >> 1)) >> 5;
 671                     }
 672                     else
 673                     {
 674                         if ( filter == -4 ) /* mode 0x04 */
 675                         {
 676                             delta += smp1 >> 1;
 677                             delta += (-smp1) >> 5;
 678                         }
 679                         else if ( filter > -4 ) /* mode 0x0C */
 680                         {
 681                             delta -= smp2 >> 1;
 682                             delta += (smp2 + (smp2 >> 1)) >> 4;
 683                             delta += smp1;
 684                             delta += (-smp1 * 13) >> 7;
 685                         }
 686                         smp2 = smp1;
 687                     }
 688
 689                     delta = CLAMP16( delta );
 690                     smp1 = (int16_t) (delta * 2); /* sign-extend */
 691                 }
 692                 while ( (offset += 4) != 0 );
 693
 694                 out [0] = smp2;
 695                 out [1] = smp1;
 696
 697             skip_decode:;
 698             }
 699             #endif
 700
 701             /* Get rate (with possible modulation) */
 702             int rate = VOICE_RATE(vr);
 703             if ( this->r.g.pitch_mods & vbit )
 704                 rate = (rate * (prev_outx + 32768)) >> 15;
 705
 706         #if !SPC_NOINTERP
 707             /* Interleved gauss table (to improve cache coherency). */
 708             /* gauss [i * 2 + j] = normal_gauss [(1 - j) * 256 + i] */
 709             static short const gauss [512] =
 710             {
 711 370,1305, 366,1305, 362,1304, 358,1304, 354,1304, 351,1304, 347,1304, 343,1303,
 712 339,1303, 336,1303, 332,1302, 328,1302, 325,1301, 321,1300, 318,1300, 314,1299,
 713 311,1298, 307,1297, 304,1297, 300,1296, 297,1295, 293,1294, 290,1293, 286,1292,
 714 283,1291, 280,1290, 276,1288, 273,1287, 270,1286, 267,1284, 263,1283, 260,1282,
 715 257,1280, 254,1279, 251,1277, 248,1275, 245,1274, 242,1272, 239,1270, 236,1269,
 716 233,1267, 230,1265, 227,1263, 224,1261, 221,1259, 218,1257, 215,1255, 212,1253,
 717 210,1251, 207,1248, 204,1246, 201,1244, 199,1241, 196,1239, 193,1237, 191,1234,
 718 188,1232, 186,1229, 183,1227, 180,1224, 178,1221, 175,1219, 173,1216, 171,1213,
 719 168,1210, 166,1207, 163,1205, 161,1202, 159,1199, 156,1196, 154,1193, 152,1190,
 720 150,1186, 147,1183, 145,1180, 143,1177, 141,1174, 139,1170, 137,1167, 134,1164,
 721 132,1160, 130,1157, 128,1153, 126,1150, 124,1146, 122,1143, 120,1139, 118,1136,
 722 117,1132, 115,1128, 113,1125, 111,1121, 109,1117, 107,1113, 106,1109, 104,1106,
 723 102,1102, 100,1098,  99,1094,  97,1090,  95,1086,  94,1082,  92,1078,  90,1074,
 724  89,1070,  87,1066,  86,1061,  84,1057,  83,1053,  81,1049,  80,1045,  78,1040,
 725  77,1036,  76,1032,  74,1027,  73,1023,  71,1019,  70,1014,  69,1010,  67,1005,
 726  66,1001,  65, 997,  64, 992,  62, 988,  61, 983,  60, 978,  59, 974,  58, 969,
 727  56, 965,  55, 960,  54, 955,  53, 951,  52, 946,  51, 941,  50, 937,  49, 932,
 728  48, 927,  47, 923,  46, 918,  45, 913,  44, 908,  43, 904,  42, 899,  41, 894,
 729  40, 889,  39, 884,  38, 880,  37, 875,  36, 870,  36, 865,  35, 860,  34, 855,
 730  33, 851,  32, 846,  32, 841,  31, 836,  30, 831,  29, 826,  29, 821,  28, 816,
 731  27, 811,  27, 806,  26, 802,  25, 797,  24, 792,  24, 787,  23, 782,  23, 777,
 732  22, 772,  21, 767,  21, 762,  20, 757,  20, 752,  19, 747,  19, 742,  18, 737,
 733  17, 732,  17, 728,  16, 723,  16, 718,  15, 713,  15, 708,  15, 703,  14, 698,
 734  14, 693,  13, 688,  13, 683,  12, 678,  12, 674,  11, 669,  11, 664,  11, 659,
 735  10, 654,  10, 649,  10, 644,   9, 640,   9, 635,   9, 630,   8, 625,   8, 620,
 736   8, 615,   7, 611,   7, 606,   7, 601,   6, 596,   6, 592,   6, 587,   6, 582,
 737   5, 577,   5, 573,   5, 568,   5, 563,   4, 559,   4, 554,   4, 550,   4, 545,
 738   4, 540,   3, 536,   3, 531,   3, 527,   3, 522,   3, 517,   2, 513,   2, 508,
 739   2, 504,   2, 499,   2, 495,   2, 491,   2, 486,   1, 482,   1, 477,   1, 473,
 740   1, 469,   1, 464,   1, 460,   1, 456,   1, 451,   1, 447,   1, 443,   1, 439,
 741   0, 434,   0, 430,   0, 426,   0, 422,   0, 418,   0, 414,   0, 410,   0, 405,
 742   0, 401,   0, 397,   0, 393,   0, 389,   0, 385,   0, 381,   0, 378,   0, 374,
 743             };
 744             /* Gaussian interpolation using most recent 4 samples */
 745             long position = voice->position;
 746             voice->position += rate;
 747             short const* interp = voice->samples + (position >> 12);
 748             int offset = position >> 4 & 0xFF;
 749
 750             /* Only left half of gaussian kernel is in table, so we must mirror
 751                for right half */
 752             short const* fwd = gauss       + offset * 2;
 753             short const* rev = gauss + 510 - offset * 2;
 754
 755             /* Use faster gaussian interpolation when exact result isn't needed
 756                by pitch modulator of next channel */
 757             int amp_0, amp_1;
 758             if ( !(slow_gaussian & vbit) ) /* 99% of the time */
 759             {
 760                 /* Main optimization is lack of clamping. Not a problem since
 761                    output never goes more than +/- 16 outside 16-bit range and
 762                    things are clamped later anyway. Other optimization is to
 763                    preserve fractional accuracy, eliminating several masks. */
 764                 int output = (((fwd [0] * interp [0] +
 765                          fwd [1] * interp [1] +
 766                          rev [1] * interp [2] +
 767                          rev [0] * interp [3]    ) >> 11) * voice->envx) >> 11;
 768
 769                 /* duplicated here to give compiler more to run in parallel */
 770                 amp_0 = voice->volume [0] * output;
 771                 amp_1 = voice->volume [1] * output;
 772                 raw_voice->outx = output >> 8;
 773             }
 774             else
 775             {
 776                 int output = *(int16_t*) &this->noise;
 777                 if ( !(this->r.g.noise_enables & vbit) )
 778                 {
 779                     output = (fwd [0] * interp [0]) & ~0xFFF;
 780                     output = (output + fwd [1] * interp [1]) & ~0xFFF;
 781                     output = (output + rev [1] * interp [2]) >> 12;
 782                     output = (int16_t) (output * 2);
 783                     output += ((rev [0] * interp [3]) >> 12) * 2;
 784                     output = CLAMP16( output );
 785                 }
 786                 output = (output * voice->envx) >> 11 & ~1;
 787
 788                 /* duplicated here to give compiler more to run in parallel */
 789                 amp_0 = voice->volume [0] * output;
 790                 amp_1 = voice->volume [1] * output;
 791                 prev_outx = output;
 792                 raw_voice->outx = (int8_t) (output >> 8);
 793             }
 794         #else /* SPCNOINTERP */
 795         /* two-point linear interpolation */
 796         #ifdef CPU_COLDFIRE
 797             int amp_0 = (int16_t)this->noise;
 798             int amp_1;
 799
 800             if ( (this->r.g.noise_enables & vbit) == 0 )
 801             {
 802                 uint32_t f = voice->position;
 803                 int32_t y0;
 804
 805                 /**
 806                  * Formula (fastest found so far of MANY):
 807                  * output = y0 + f*y1 - f*y0
 808                  */
 809                 asm volatile (
 810                 /* separate fractional and whole parts   */
 811                 "move.l     %[f], %[y1]               \r\n"
 812                 "and.l      #0xfff, %[f]              \r\n"
 813                 "lsr.l      %[sh], %[y1]              \r\n"
 814                 /* load samples y0 (upper) & y1 (lower)  */
 815                 "move.l     2(%[s], %[y1].l*2), %[y1] \r\n"
 816                 /* %acc0 = f*y1                          */
 817                 "mac.w      %[f]l, %[y1]l, %%acc0     \r\n"
 818                 /* %acc0 -= f*y0                         */
 819                 "msac.w     %[f]l, %[y1]u, %%acc0     \r\n"
 820                 /* separate out y0 and sign extend       */
 821                 "swap       %[y1]                     \r\n"
 822                 "movea.w    %[y1], %[y0]              \r\n"
 823                 /* fetch result, scale down and add y0   */
 824                 "movclr.l   %%acc0, %[y1]             \r\n"
 825                 /* output = y0 + (result >> 12)          */
 826                 "asr.l      %[sh], %[y1]              \r\n"
 827                 "add.l      %[y0], %[y1]              \r\n"
 828                 : [f]"+d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0)
 829                 : [s]"a"(voice->samples), [sh]"d"(12)
 830                     );
 831             }
 832
 833             /* apply voice envelope to output */
 834             asm volatile (
 835             "mac.w %[output]l, %[envx]l, %%acc0 \r\n"
 836             :
 837             : [output]"r"(amp_0), [envx]"r"(voice->envx)
 838             );
 839
 840             /* advance voice position */
 841             voice->position += rate;
 842
 843             /* fetch output, scale and apply left and right
 844                voice volume */
 845             asm volatile (
 846             "movclr.l %%acc0,    %[output]         \r\n"
 847             "asr.l    %[sh],     %[output]         \r\n"
 848             "mac.l    %[vvol_0], %[output], %%acc0 \r\n"
 849             "mac.l    %[vvol_1], %[output], %%acc1 \r\n"
 850             : [output]"=&d"(amp_0)
 851             : [vvol_0]"r"((int)voice->volume[0]),
 852               [vvol_1]"r"((int)voice->volume[1]),
 853               [sh]"d"(11)
 854             );
 855
 856             /* save this output into previous, scale and save in
 857                output register */
 858             prev_outx = amp_0;
 859             raw_voice->outx = amp_0 >> 8;
 860
 861             /* fetch final voice output */
 862             asm volatile (
 863             "movclr.l %%acc0, %[amp_0] \r\n"
 864             "movclr.l %%acc1, %[amp_1] \r\n"
 865             : [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)
 866             );
 867         #elif defined (CPU_ARM)
 868             int amp_0, amp_1;
 869
 870             if ( (this->r.g.noise_enables & vbit) != 0 ) {
 871                 amp_0 = *(int16_t *)&this->noise;
 872             } else {
 873                 uint32_t f = voice->position;
 874                 amp_0 = (uint32_t)voice->samples;
 875
 876                 asm volatile(
 877                 "mov    %[y1], %[f], lsr #12        \r\n"
 878                 "eor    %[f], %[f], %[y1], lsl #12  \r\n"
 879                 "add    %[y1], %[y0], %[y1], lsl #1 \r\n"
 880                 "ldrsh  %[y0], [%[y1], #2]          \r\n"
 881                 "ldrsh  %[y1], [%[y1], #4]          \r\n"
 882                 "sub    %[y1], %[y1], %[y0]         \r\n"
 883                 "mul    %[f], %[y1], %[f]           \r\n"
 884                 "add    %[y0], %[y0], %[f], asr #12 \r\n"
 885                 : [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1)
 886                 );
 887             }
 888
 889             voice->position += rate;
 890
 891             asm volatile(
 892             "mul    %[amp_1], %[amp_0], %[envx] \r\n"
 893             "mov    %[amp_0], %[amp_1], asr #11 \r\n"
 894             "mov    %[amp_1], %[amp_0], asr #8  \r\n"
 895             : [amp_0]"+r"(amp_0), [amp_1]"=&r"(amp_1)
 896             : [envx]"r"(voice->envx)
 897             );
 898
 899             prev_outx = amp_0;
 900             raw_voice->outx = (int8_t)amp_1;
 901
 902             asm volatile(
 903             "mul    %[amp_1], %[amp_0], %[vol_1] \r\n"
 904             "mul    %[amp_0], %[vol_0], %[amp_0] \r\n"
 905             : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1)
 906             : [vol_0]"r"((int)voice->volume[0]),
 907               [vol_1]"r"((int)voice->volume[1])
 908             );
 909         #else /* Unoptimized CPU */
 910             int output;
 911
 912             if ( (this->r.g.noise_enables & vbit) == 0 )
 913             {
 914                 int const fraction = voice->position & 0xfff;
 915                 short const* const pos = (voice->samples + (voice->position >> 12)) + 1;
 916                 output = pos[0] + ((fraction * (pos[1] - pos[0])) >> 12);
 917             } else {
 918                 output = *(int16_t *)&this->noise;
 919             }
 920
 921             voice->position += rate;
 922
 923             output = (output * voice->envx) >> 11;
 924
 925             /* duplicated here to give compiler more to run in parallel */
 926             int amp_0 = voice->volume [0] * output;
 927             int amp_1 = voice->volume [1] * output;
 928
 929             prev_outx = output;
 930             raw_voice->outx = (int8_t) (output >> 8);
 931         #endif /* CPU_* */
 932         #endif /* SPCNOINTERP */
 933
 934         #if SPC_BRRCACHE
 935             if ( voice->position >= voice->wave_end )
 936             {
 937                 long loop_len = voice->wave_loop << 12;
 938                 voice->position -= loop_len;
 939                 this->r.g.wave_ended |= vbit;
 940                 if ( !loop_len )
 941                 {
 942                     this->keys_down ^= vbit;
 943                     raw_voice->envx = 0;
 944                     voice->envx = 0;
 945                 }
 946             }
 947         #endif
 948 #if 0
 949             EXIT_TIMER(dsp_gen);
 950
 951             ENTER_TIMER(dsp_mix);
 952 #endif
 953             chans_0 += amp_0;
 954             chans_1 += amp_1;
 955             #if !SPC_NOECHO
 956                 if ( this->r.g.echo_ons & vbit )
 957                 {
 958                     echo_0 += amp_0;
 959                     echo_1 += amp_1;
 960                 }
 961             #endif
 962 #if 0
 963             EXIT_TIMER(dsp_mix);
 964 #endif
 965         }
 966         /* end of voice loop */
 967
 968     #if !SPC_NOECHO
 969     #ifdef CPU_COLDFIRE
 970         /* Read feedback from echo buffer */
 971         int echo_pos = this->echo_pos;
 972         uint8_t* const echo_ptr = RAM + ((echo_start + echo_pos) & 0xFFFF);
 973         echo_pos += 4;
 974         if ( echo_pos >= echo_wrap )
 975             echo_pos = 0;
 976         this->echo_pos = echo_pos;
 977         int fb = swap_odd_even32(*(int32_t *)echo_ptr);
 978         int out_0, out_1;
 979
 980         /* Keep last 8 samples */
 981         *this->last_fir_ptr = fb;
 982         this->last_fir_ptr  = this->fir_ptr;
 983
 984         /* Apply echo FIR filter to output samples read from echo buffer -
 985            circular buffer is hardware incremented and masked; FIR
 986            coefficients and buffer history are loaded in parallel with
 987            multiply accumulate operations. Shift left by one here and once
 988            again when calculating feedback to have sample values justified
 989            to bit 31 in the output to ease endian swap, interleaving and
 990            clamping before placing result in the program's echo buffer. */
 991         int _0, _1, _2;
 992         asm volatile (
 993         "move.l                           (%[fir_c])  , %[_2]         \r\n"
 994         "mac.w      %[fb]u, %[_2]u, <<,   (%[fir_p])+&, %[_0], %%acc0 \r\n"
 995         "mac.w      %[fb]l, %[_2]u, <<,   (%[fir_p])& , %[_1], %%acc1 \r\n"
 996         "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
 997         "mac.w      %[_0]l, %[_2]l, <<,  4(%[fir_c])  , %[_2], %%acc1 \r\n"
 998         "mac.w      %[_1]u, %[_2]u, <<,  4(%[fir_p])& , %[_0], %%acc0 \r\n"
 999         "mac.w      %[_1]l, %[_2]u, <<,  8(%[fir_p])& , %[_1], %%acc1 \r\n"
1000         "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
1001         "mac.w      %[_0]l, %[_2]l, <<,  8(%[fir_c])  , %[_2], %%acc1 \r\n"
1002         "mac.w      %[_1]u, %[_2]u, <<, 12(%[fir_p])& , %[_0], %%acc0 \r\n"
1003         "mac.w      %[_1]l, %[_2]u, <<, 16(%[fir_p])& , %[_1], %%acc1 \r\n"
1004         "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
1005         "mac.w      %[_0]l, %[_2]l, <<, 12(%[fir_c])  , %[_2], %%acc1 \r\n"
1006         "mac.w      %[_1]u, %[_2]u, <<, 20(%[fir_p])& , %[_0], %%acc0 \r\n"
1007         "mac.w      %[_1]l, %[_2]u, <<                       , %%acc1 \r\n"
1008         "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
1009         "mac.w      %[_0]l, %[_2]l, <<                       , %%acc1 \r\n"
1010         : [_0]"=&r"(_0), [_1]"=&r"(_1), [_2]"=&r"(_2),
1011           [fir_p]"+a"(this->fir_ptr)
1012         : [fir_c]"a"(this->fir_coeff), [fb]"r"(fb)
1013         );
1014
1015         /* Generate output */
1016         asm volatile (
1017         /* fetch filter results _after_ gcc loads asm
1018            block parameters to eliminate emac stalls   */
1019         "movclr.l   %%acc0, %[out_0]                \r\n"
1020         "movclr.l   %%acc1, %[out_1]                \r\n"
1021         /* apply global volume                         */
1022         "mac.l      %[chans_0], %[gv_0]    , %%acc2 \r\n"
1023         "mac.l      %[chans_1], %[gv_1]    , %%acc3 \r\n"
1024         /* apply echo volume and add to final output   */
1025         "mac.l      %[ev_0],   %[out_0], >>, %%acc2 \r\n"
1026         "mac.l      %[ev_1],   %[out_1], >>, %%acc3 \r\n"
1027         : [out_0]"=&r"(out_0), [out_1]"=&r"(out_1)
1028         : [chans_0]"r"(chans_0), [gv_0]"r"(global_vol_0),
1029           [ev_0]"r"((int)this->r.g.echo_volume_0),
1030           [chans_1]"r"(chans_1), [gv_1]"r"(global_vol_1),
1031           [ev_1]"r"((int)this->r.g.echo_volume_1)
1032         );
1033
1034         /* Feedback into echo buffer */
1035         if ( !(this->r.g.flags & 0x20) )
1036         {
1037             asm volatile (
1038             /* scale echo voices; saturate if overflow */
1039             "mac.l      %[sh], %[e1]       , %%acc1 \r\n"
1040             "mac.l      %[sh], %[e0]       , %%acc0 \r\n"
1041             /* add scaled output from FIR filter       */
1042             "mac.l      %[out_1], %[ef], <<, %%acc1 \r\n"
1043             "mac.l      %[out_0], %[ef], <<, %%acc0 \r\n"
1044             /* swap and fetch feedback results - simply
1045                swap_odd_even32 mixed in between macs and
1046                movclrs to mitigate stall issues        */
1047             "move.l     #0x00ff00ff, %[sh]          \r\n"
1048             "movclr.l   %%acc1, %[e1]               \r\n"
1049             "swap       %[e1]                       \r\n"
1050             "movclr.l   %%acc0, %[e0]               \r\n"
1051             "move.w     %[e1], %[e0]                \r\n"
1052             "and.l      %[e0], %[sh]                \r\n"
1053             "eor.l      %[sh], %[e0]                \r\n"
1054             "lsl.l      #8, %[sh]                   \r\n"
1055             "lsr.l      #8, %[e0]                   \r\n"
1056             "or.l       %[sh], %[e0]                \r\n"
1057             /* save final feedback into echo buffer    */
1058             "move.l     %[e0], (%[echo_ptr])        \r\n"
1059             : [e0]"+d"(echo_0), [e1]"+d"(echo_1)
1060             : [out_0]"r"(out_0), [out_1]"r"(out_1),
1061               [ef]"r"((int)this->r.g.echo_feedback),
1062               [echo_ptr]"a"((int32_t *)echo_ptr),
1063               [sh]"d"(1 << 9)
1064             );
1065         }
1066
1067         /* Output final samples */
1068         asm volatile (
1069         /* fetch output saved in %acc2 and %acc3 */
1070         "movclr.l   %%acc2, %[out_0] \r\n"
1071         "movclr.l   %%acc3, %[out_1] \r\n"
1072         /* scale right by global_muting shift    */
1073         "asr.l      %[gm],  %[out_0] \r\n"
1074         "asr.l      %[gm],  %[out_1] \r\n"
1075         : [out_0]"=&d"(out_0), [out_1]"=&d"(out_1)
1076         : [gm]"d"(global_muting)
1077         );
1078
1079         out_buf [             0] = out_0;
1080         out_buf [WAV_CHUNK_SIZE] = out_1;
1081         out_buf ++;
1082     #elif defined (CPU_ARM)
1083         /* Read feedback from echo buffer */
1084         int echo_pos = this->echo_pos;
1085         uint8_t* const echo_ptr = RAM +
1086                 ((this->r.g.echo_page * 0x100 + echo_pos) & 0xFFFF);
1087         echo_pos += 4;
1088         if ( echo_pos >= (this->r.g.echo_delay & 15) * 0x800 )
1089             echo_pos = 0;
1090         this->echo_pos = echo_pos;
1091
1092         int fb_0 = GET_LE16SA( echo_ptr     );
1093         int fb_1 = GET_LE16SA( echo_ptr + 2 );
1094
1095         /* Keep last 8 samples */
1096         int32_t *fir_ptr = this->fir_ptr;
1097
1098         /* Apply FIR */
1099         asm volatile (
1100         "str    %[fb_0], [%[fir_p]], #4  \r\n"
1101         "str    %[fb_1], [%[fir_p]], #4  \r\n"
1102         /* duplicate at +8 eliminates wrap checking below */
1103         "str    %[fb_0], [%[fir_p], #56] \r\n"
1104         "str    %[fb_1], [%[fir_p], #60] \r\n"
1105         : [fir_p]"+r"(fir_ptr)
1106         : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1)
1107         );
1108
1109         this->fir_ptr = (int32_t *)((intptr_t)fir_ptr & FIR_BUF_MASK);
1110         int32_t *fir_coeff = this->fir_coeff;
1111
1112         asm volatile (
1113         "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
1114         "ldmia  %[fir_p]!, { r4-r5 }     \r\n"
1115         "mul    %[fb_0],     r0, %[fb_0] \r\n"
1116         "mul    %[fb_1],     r0, %[fb_1] \r\n"
1117         "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
1118         "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
1119         "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
1120         "ldmia  %[fir_p]!, { r2-r5 }     \r\n"
1121         "mla    %[fb_0], r2, r0, %[fb_0] \r\n"
1122         "mla    %[fb_1], r3, r0, %[fb_1] \r\n"
1123         "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
1124         "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
1125         "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
1126         "ldmia  %[fir_p]!, { r2-r5 }     \r\n"
1127         "mla    %[fb_0], r2, r0, %[fb_0] \r\n"
1128         "mla    %[fb_1], r3, r0, %[fb_1] \r\n"
1129         "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
1130         "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
1131         "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
1132         "ldmia  %[fir_p]!, { r2-r5 }     \r\n"
1133         "mla    %[fb_0], r2, r0, %[fb_0] \r\n"
1134         "mla    %[fb_1], r3, r0, %[fb_1] \r\n"
1135         "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
1136         "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
1137         : [fb_0]"+r"(fb_0), [fb_1]"+r"(fb_1),
1138           [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff)
1139         :
1140         : "r0", "r1", "r2", "r3", "r4", "r5"
1141         );
1142
1143         /* Generate output */
1144         int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0)
1145                     >> global_muting;
1146         int amp_1 = (chans_1 * global_vol_1 + fb_1 * this->r.g.echo_volume_1)
1147                     >> global_muting;
1148
1149         out_buf [             0] = amp_0;
1150         out_buf [WAV_CHUNK_SIZE] = amp_1;
1151         out_buf ++;
1152
1153         if ( !(this->r.g.flags & 0x20) )
1154         {
1155             /* Feedback into echo buffer */
1156             int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
1157             int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
1158             e0 = CLAMP16( e0 );
1159             SET_LE16A( echo_ptr    , e0 );
1160             e1 = CLAMP16( e1 );
1161             SET_LE16A( echo_ptr + 2, e1 );
1162         }
1163     #else /* Unoptimized CPU */
1164         /* Read feedback from echo buffer */
1165         int echo_pos = this->echo_pos;
1166         uint8_t* const echo_ptr = RAM +
1167                 ((this->r.g.echo_page * 0x100 + echo_pos) & 0xFFFF);
1168         echo_pos += 4;
1169         if ( echo_pos >= (this->r.g.echo_delay & 15) * 0x800 )
1170             echo_pos = 0;
1171         this->echo_pos = echo_pos;
1172         int fb_0 = GET_LE16SA( echo_ptr     );
1173         int fb_1 = GET_LE16SA( echo_ptr + 2 );
1174
1175         /* Keep last 8 samples */
1176         int (* const fir_ptr) [2] = this->fir_buf + this->fir_pos;
1177         this->fir_pos = (this->fir_pos + 1) & (FIR_BUF_HALF - 1);
1178         fir_ptr [           0] [0] = fb_0;
1179         fir_ptr [           0] [1] = fb_1;
1180         /* duplicate at +8 eliminates wrap checking below */
1181         fir_ptr [FIR_BUF_HALF] [0] = fb_0;
1182         fir_ptr [FIR_BUF_HALF] [1] = fb_1;
1183
1184         /* Apply FIR */
1185         fb_0 *= this->fir_coeff [0];
1186         fb_1 *= this->fir_coeff [0];
1187
1188         #define DO_PT( i )\
1189             fb_0 += fir_ptr [i] [0] * this->fir_coeff [i];\
1190             fb_1 += fir_ptr [i] [1] * this->fir_coeff [i];
1191
1192         DO_PT( 1 )
1193         DO_PT( 2 )
1194         DO_PT( 3 )
1195         DO_PT( 4 )
1196         DO_PT( 5 )
1197         DO_PT( 6 )
1198         DO_PT( 7 )
1199
1200         /* Generate output */
1201         int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0)
1202                     >> global_muting;
1203         int amp_1 = (chans_1 * global_vol_1 + fb_1 * this->r.g.echo_volume_1)
1204                     >> global_muting;
1205         out_buf [             0] = amp_0;
1206         out_buf [WAV_CHUNK_SIZE] = amp_1;
1207         out_buf ++;
1208
1209         if ( !(this->r.g.flags & 0x20) )
1210         {
1211             /* Feedback into echo buffer */
1212             int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
1213             int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
1214             e0 = CLAMP16( e0 );
1215             SET_LE16A( echo_ptr    , e0 );
1216             e1 = CLAMP16( e1 );
1217             SET_LE16A( echo_ptr + 2, e1 );
1218         }
1219     #endif /* CPU_* */
1220     #else /* SPCNOECHO == 1*/
1221         /* Generate output  */
1222         int amp_0 = (chans_0 * global_vol_0) >> global_muting;
1223         int amp_1 = (chans_1 * global_vol_1) >> global_muting;
1224         out_buf [             0] = amp_0;
1225         out_buf [WAV_CHUNK_SIZE] = amp_1;
1226         out_buf ++;
1227     #endif /* SPCNOECHO */
1228     }
1229     while ( --count );
1230 #if 0
1231     EXIT_TIMER(dsp);
1232     ENTER_TIMER(cpu);
1233 #endif
1234 }
1235
1236 /* Return the DSP to its reset state: clears key/echo/noise bookkeeping,
1237    reinitializes every voice and empties the echo FIR history */
1238 void DSP_reset( struct Spc_Dsp* this )
1239 {
1240     int idx;
1241
1242     this->r.g.key_ons = 0;
1243     this->r.g.flags   = 0xE0; /* reset, mute, echo off */
1244     this->noise       = 2;
1245     this->noise_count = 0;
1246     this->echo_pos    = 0;
1247     this->keys_down   = 0;
1248
1249     /* zero all voices, then put each in release with addr set to ram.ram */
1250     ci->memset( this->voice_state, 0, sizeof this->voice_state );
1251     for ( idx = 0; idx < VOICE_COUNT; idx++ )
1252     {
1253         this->voice_state [idx].addr     = ram.ram;
1254         this->voice_state [idx].env_mode = state_release;
1255     }
1256
1257 #if SPC_BRRCACHE
1258     this->oldsize = 0;
1259     for ( idx = 256; --idx >= 0; ) /* set every wave entry's start to -1 */
1260         this->wave_entry [idx].start_addr = -1;
1261 #endif
1262
1263 #if defined(CPU_COLDFIRE) || defined (CPU_ARM)
1264     /* these targets keep FIR history in the file-level fir_buf */
1265     ci->memset( fir_buf, 0, sizeof fir_buf );
1266     this->fir_ptr = fir_buf;
1267 #ifdef CPU_COLDFIRE
1268     this->last_fir_ptr = fir_buf + 7;
1269 #endif
1270 #else
1271     ci->memset( this->fir_buf, 0, sizeof this->fir_buf );
1272     this->fir_pos = 0;
1273 #endif
1274     assert( offsetof (struct globals_t,unused9 [2]) == REGISTER_COUNT );
1275     assert( sizeof (this->r.voice) == REGISTER_COUNT );
1276 }