apps/codecs/libspc/spc_dsp.c

   1 /***************************************************************************
   2  *             __________               __   ___.
   3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7  *                     \/            \/     \/    \/            \/
   8  * $Id$
   9  *
  10  * Copyright (C) 2007-2008 Michael Sevakis (jhMikeS)
  11  * Copyright (C) 2006-2007 Adam Gashlin (hcs)
  12  * Copyright (C) 2004-2007 Shay Green (blargg)
  13  * Copyright (C) 2002 Brad Martin
  14  *
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  21  * KIND, either express or implied.
  22  *
  23  ****************************************************************************/
  24
  25 /* The DSP portion (awe!) */
  26 #include "codeclib.h"
  27 #include "spc_codec.h"
  28 #include "spc_profiler.h"
  29
  30 #if defined(CPU_COLDFIRE) || defined (CPU_ARM)
  31 int32_t fir_buf[FIR_BUF_CNT]
  32     __attribute__ ((aligned (FIR_BUF_ALIGN*1))) IBSS_ATTR;
  33 #endif
  34 #if SPC_BRRCACHE
  35 /* a little extra for samples that go past end */
  36 int16_t BRRcache [BRR_CACHE_SIZE] CACHEALIGN_ATTR;
  37 #endif
  38
  39 void DSP_write( struct Spc_Dsp* this, int i, int data )
  40 {
  41     assert( (unsigned) i < REGISTER_COUNT );
  42
  43     this->r.reg [i] = data;
  44     int high = i >> 4;
  45     int low  = i & 0x0F;
  46     if ( low < 2 ) /* voice volumes */
  47     {
  48         int left  = *(int8_t const*) &this->r.reg [i & ~1];
  49         int right = *(int8_t const*) &this->r.reg [i |  1];
  50         struct voice_t* v = this->voice_state + high;
  51         v->volume [0] = left;
  52         v->volume [1] = right;
  53     }
  54     else if ( low == 0x0F ) /* fir coefficients */
  55     {
  56         this->fir_coeff [7 - high] = (int8_t) data; /* sign-extend */
  57     }
  58 }
  59
  60 #if ARM_ARCH >= 6
  61 /* if ( n < -32768 ) out = -32768; */
  62 /* if ( n >  32767 ) out =  32767; */
  63 #define CLAMP16( n ) \
  64     ({ \
  65        asm ("ssat %0, #16, %1" \
  66             : "=r" ( n ) : "r"( n ) ); \
  67        n; \
  68     })
  69 #else
  70 /* if ( n < -32768 ) out = -32768; */
  71 /* if ( n >  32767 ) out =  32767; */
  72 #define CLAMP16( n ) \
  73 ({                              \
  74     if ( (int16_t) n != n )     \
  75         n = 0x7FFF ^ (n >> 31); \
  76     n;                          \
  77 })
  78 #endif
  79
  80 #if SPC_BRRCACHE
  81 static void decode_brr( struct Spc_Dsp* this, unsigned start_addr,
  82                         struct voice_t* voice,
  83                         struct raw_voice_t const* const raw_voice ) ICODE_ATTR;
  84 static void decode_brr( struct Spc_Dsp* this, unsigned start_addr,
  85                         struct voice_t* voice,
  86                         struct raw_voice_t const* const raw_voice )
  87 {
  88     /* setup same variables as where decode_brr() is called from */
  89     #undef RAM
  90     #define RAM ram.ram
  91
  92     struct src_dir const* const sd =
  93         &ram.sd[this->r.g.wave_page * 0x100/sizeof(struct src_dir)];
  94     struct cache_entry_t* const wave_entry =
  95         &this->wave_entry [raw_voice->waveform];
  96
  97    /* the following block can be put in place of the call to
  98        decode_brr() below
  99     */
 100     {
 101         DEBUGF( "decode at %08x (wave #%d)\n",
 102                 start_addr, raw_voice->waveform );
 103
 104         /* see if in cache */
 105         int i;
 106         for ( i = 0; i < this->oldsize; i++ )
 107         {
 108             struct cache_entry_t* e = &this->wave_entry_old [i];
 109             if ( e->start_addr == start_addr )
 110             {
 111                 DEBUGF( "found in wave_entry_old (oldsize=%d)\n",
 112                     this->oldsize );
 113                 *wave_entry = *e;
 114                 goto wave_in_cache;
 115             }
 116         }
 117
 118         wave_entry->start_addr = start_addr;
 119
 120         uint8_t const* const loop_ptr =
 121             RAM + letoh16(sd[raw_voice->waveform].loop);
 122         short* loop_start = 0;
 123
 124         short* out = BRRcache + start_addr * 2;
 125         wave_entry->samples = out;
 126         *out++ = 0;
 127         int smp1 = 0;
 128         int smp2 = 0;
 129
 130         uint8_t const* addr = RAM + start_addr;
 131         int block_header;
 132         do
 133         {
 134             if ( addr == loop_ptr )
 135             {
 136                 loop_start = out;
 137                 DEBUGF( "loop at %08lx (wave #%d)\n",
 138                         (unsigned long)(addr - RAM), raw_voice->waveform );
 139             }
 140
 141             /* header */
 142             block_header = *addr;
 143             addr += 9;
 144             voice->addr = addr;
 145             int const filter = (block_header & 0x0C) - 0x08;
 146
 147             /* scaling
 148                (invalid scaling gives -4096 for neg nybble, 0 for pos) */
 149             static unsigned char const right_shifts [16] = {
 150                 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  4,  4, 29, 29, 29,
 151             };
 152             static unsigned char const left_shifts  [16] = {
 153                 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11
 154             };
 155             int const scale = block_header >> 4;
 156             int const right_shift = right_shifts [scale];
 157             int const left_shift  = left_shifts  [scale];
 158
 159             /* output position */
 160             out += BRR_BLOCK_SIZE;
 161             int offset = -BRR_BLOCK_SIZE << 2;
 162
 163             do /* decode and filter 16 samples */
 164             {
 165                 /* Get nybble, sign-extend, then scale
 166                    get byte, select which nybble, sign-extend, then shift based
 167                    on scaling. also handles invalid scaling values. */
 168                 int delta = (int) (int8_t) (addr [offset >> 3] << (offset & 4))
 169                         >> right_shift << left_shift;
 170
 171                 out [offset >> 2] = smp2;
 172
 173                 if ( filter == 0 ) /* mode 0x08 (30-90% of the time) */
 174                 {
 175                     delta -= smp2 >> 1;
 176                     delta += smp2 >> 5;
 177                     smp2 = smp1;
 178                     delta += smp1;
 179                     delta += (-smp1 - (smp1 >> 1)) >> 5;
 180                 }
 181                 else
 182                 {
 183                     if ( filter == -4 ) /* mode 0x04 */
 184                     {
 185                         delta += smp1 >> 1;
 186                         delta += (-smp1) >> 5;
 187                     }
 188                     else if ( filter > -4 ) /* mode 0x0C */
 189                     {
 190                         delta -= smp2 >> 1;
 191                         delta += (smp2 + (smp2 >> 1)) >> 4;
 192                         delta += smp1;
 193                         delta += (-smp1 * 13) >> 7;
 194                     }
 195                     smp2 = smp1;
 196                 }
 197
 198                 delta = CLAMP16( delta );
 199                 smp1 = (int16_t) (delta * 2); /* sign-extend */
 200             }
 201             while ( (offset += 4) != 0 );
 202
 203             /* if next block has end flag set, this block ends early */
 204             /* (verified) */
 205             if ( (block_header & 3) != 3 && (*addr & 3) == 1 )
 206             {
 207                 /* skip last 9 samples */
 208                 out -= 9;
 209                 goto early_end;
 210             }
 211         }
 212         while ( !(block_header & 1) && addr < RAM + 0x10000 );
 213
 214         out [0] = smp2;
 215         out [1] = smp1;
 216
 217     early_end:
 218         wave_entry->end = (out - 1 - wave_entry->samples) << 12;
 219
 220         wave_entry->loop = 0;
 221         if ( (block_header & 2) )
 222         {
 223             if ( loop_start )
 224             {
 225                 int loop = out - loop_start;
 226                 wave_entry->loop = loop;
 227                 wave_entry->end += 0x3000;
 228                 out [2] = loop_start [2];
 229                 out [3] = loop_start [3];
 230                 out [4] = loop_start [4];
 231             }
 232             else
 233             {
 234                 DEBUGF( "loop point outside initial wave\n" );
 235             }
 236         }
 237
 238         DEBUGF( "end at %08lx (wave #%d)\n",
 239                 (unsigned long)(addr - RAM), raw_voice->waveform );
 240
 241         /* add to cache */
 242         this->wave_entry_old [this->oldsize++] = *wave_entry;
 243 wave_in_cache:;
 244     }
 245 }
 246 #endif
 247
 248 static void key_on(struct Spc_Dsp* const this, struct voice_t* const voice,
 249                    struct src_dir const* const sd,
 250                    struct raw_voice_t const* const raw_voice,
 251                    const int key_on_delay, const int vbit) ICODE_ATTR;
 252 static void key_on(struct Spc_Dsp* const this, struct voice_t* const voice,
 253                    struct src_dir const* const sd,
 254                    struct raw_voice_t const* const raw_voice,
 255                    const int key_on_delay, const int vbit) {
 256     #undef RAM
 257     #define RAM ram.ram
 258     int const env_rate_init = 0x7800;
 259     voice->key_on_delay = key_on_delay;
 260     if ( key_on_delay == 0 )
 261     {
 262         this->keys_down |= vbit;
 263         voice->envx         = 0;
 264         voice->env_mode     = state_attack;
 265         voice->env_timer    = env_rate_init; /* TODO: inaccurate? */
 266         unsigned start_addr = letoh16(sd[raw_voice->waveform].start);
 267         #if !SPC_BRRCACHE
 268         {
 269             voice->addr = RAM + start_addr;
 270             /* BRR filter uses previous samples */
 271             voice->samples [BRR_BLOCK_SIZE + 1] = 0;
 272             voice->samples [BRR_BLOCK_SIZE + 2] = 0;
 273             /* decode three samples immediately */
 274             voice->position     = (BRR_BLOCK_SIZE + 3) * 0x1000 - 1;
 275             voice->block_header = 0; /* "previous" BRR header */
 276         }
 277         #else
 278         {
 279             voice->position = 3 * 0x1000 - 1;
 280             struct cache_entry_t* const wave_entry =
 281                 &this->wave_entry [raw_voice->waveform];
 282
 283             /* predecode BRR if not already */
 284             if ( wave_entry->start_addr != start_addr )
 285             {
 286                 /* the following line can be replaced by the indicated block
 287                    in decode_brr() */
 288                 decode_brr( this, start_addr, voice, raw_voice );
 289             }
 290
 291             voice->samples   = wave_entry->samples;
 292             voice->wave_end  = wave_entry->end;
 293                     voice->wave_loop = wave_entry->loop;
 294         }
 295         #endif
 296     }
 297 }
 298
 299 void DSP_run_( struct Spc_Dsp* this, long count, int32_t* out_buf )
 300 {
 301     #undef RAM
 302 #if defined(CPU_ARM) && !SPC_BRRCACHE
 303     uint8_t* const ram_ = ram.ram;
 304     #define RAM ram_
 305 #else
 306     #define RAM ram.ram
 307 #endif
 308 #if 0
 309     EXIT_TIMER(cpu);
 310     ENTER_TIMER(dsp);
 311 #endif
 312
 313     /* Here we check for keys on/off.  Docs say that successive writes
 314        to KON/KOF must be separated by at least 2 Ts periods or risk
 315        being neglected.  Therefore DSP only looks at these during an
 316        update, and not at the time of the write.  Only need to do this
 317        once however, since the regs haven't changed over the whole
 318        period we need to catch up with. */
 319
 320     {
 321         int key_ons  = this->r.g.key_ons;
 322         int key_offs = this->r.g.key_offs;
 323         /* keying on a voice resets that bit in ENDX */
 324         this->r.g.wave_ended &= ~key_ons;
 325         /* key_off bits prevent key_on from being acknowledged */
 326         this->r.g.key_ons = key_ons & key_offs;
 327
 328         /* process key events outside loop, since they won't re-occur */
 329         struct voice_t* voice = this->voice_state + 8;
 330         int vbit = 0x80;
 331         do
 332         {
 333             --voice;
 334             if ( key_offs & vbit )
 335             {
 336                 voice->env_mode     = state_release;
 337                 voice->key_on_delay = 0;
 338             }
 339             else if ( key_ons & vbit )
 340             {
 341                 voice->key_on_delay = 8;
 342             }
 343         }
 344         while ( (vbit >>= 1) != 0 );
 345     }
 346
 347     struct src_dir const* const sd =
 348         &ram.sd[this->r.g.wave_page * 0x100/sizeof(struct src_dir)];
 349
 350     #ifdef ROCKBOX_BIG_ENDIAN
  351         /* Convert endiannesses before entering loops - these
  352            get used a lot */
 353         const uint32_t rates[VOICE_COUNT] =
 354         {
 355             GET_LE16A( this->r.voice[0].rate ) & 0x3FFF,
 356             GET_LE16A( this->r.voice[1].rate ) & 0x3FFF,
 357             GET_LE16A( this->r.voice[2].rate ) & 0x3FFF,
 358             GET_LE16A( this->r.voice[3].rate ) & 0x3FFF,
 359             GET_LE16A( this->r.voice[4].rate ) & 0x3FFF,
 360             GET_LE16A( this->r.voice[5].rate ) & 0x3FFF,
 361             GET_LE16A( this->r.voice[6].rate ) & 0x3FFF,
 362             GET_LE16A( this->r.voice[7].rate ) & 0x3FFF,
 363         };
 364         #define VOICE_RATE(x) *(x)
 365         #define IF_RBE(...) __VA_ARGS__
 366     #ifdef CPU_COLDFIRE
 367         /* Initialize mask register with the buffer address mask */
 368         asm volatile ("move.l %[m], %%mask" : : [m]"i"(FIR_BUF_MASK));
 369         const int echo_wrap  = (this->r.g.echo_delay & 15) * 0x800;
 370         const int echo_start = this->r.g.echo_page * 0x100;
 371     #endif /* CPU_COLDFIRE */
 372     #else
 373         #define VOICE_RATE(x) (GET_LE16(raw_voice->rate) & 0x3FFF)
 374         #define IF_RBE(...)
 375     #endif /* ROCKBOX_BIG_ENDIAN */
 376
 377 #if !SPC_NOINTERP
 378     int const slow_gaussian = (this->r.g.pitch_mods >> 1) |
 379         this->r.g.noise_enables;
 380 #endif
 381     /* (g.flags & 0x40) ? 30 : 14 */
 382     int const global_muting = ((this->r.g.flags & 0x40) >> 2) + 14 - 8;
 383     int const global_vol_0  = this->r.g.volume_0;
 384     int const global_vol_1  = this->r.g.volume_1;
 385
 386     /* each rate divides exactly into 0x7800 without remainder */
 387     int const env_rate_init = 0x7800;
 388     static unsigned short const env_rates [0x20] ICONST_ATTR =
 389     {
 390         0x0000, 0x000F, 0x0014, 0x0018, 0x001E, 0x0028, 0x0030, 0x003C,
 391         0x0050, 0x0060, 0x0078, 0x00A0, 0x00C0, 0x00F0, 0x0140, 0x0180,
 392         0x01E0, 0x0280, 0x0300, 0x03C0, 0x0500, 0x0600, 0x0780, 0x0A00,
 393         0x0C00, 0x0F00, 0x1400, 0x1800, 0x1E00, 0x2800, 0x3C00, 0x7800
 394     };
 395
 396     do /* one pair of output samples per iteration */
 397     {
 398         /* Noise */
 399         if ( this->r.g.noise_enables )
 400         {
 401             if ( (this->noise_count -=
 402                  env_rates [this->r.g.flags & 0x1F]) <= 0 )
 403             {
 404                 this->noise_count = env_rate_init;
 405                 int feedback = (this->noise << 13) ^ (this->noise << 14);
 406                 this->noise = (feedback & 0x8000) ^ (this->noise >> 1 & ~1);
 407             }
 408         }
 409
 410 #if !SPC_NOECHO
 411         int echo_0 = 0;
 412         int echo_1 = 0;
 413 #endif
 414         long prev_outx = 0; /* TODO: correct value for first channel? */
 415         int chans_0 = 0;
 416         int chans_1 = 0;
 417         /* TODO: put raw_voice pointer in voice_t? */
 418         struct raw_voice_t * raw_voice = this->r.voice;
 419         struct voice_t* voice = this->voice_state;
 420         int vbit = 1;
 421         IF_RBE( const uint32_t* vr = rates; )
 422         for ( ; vbit < 0x100; vbit <<= 1, ++voice, ++raw_voice IF_RBE( , ++vr ) )
 423         {
 424             /* pregen involves checking keyon, etc */
 425 #if 0
 426             ENTER_TIMER(dsp_pregen);
 427 #endif
 428
 429             /* Key on events are delayed */
 430             int key_on_delay = voice->key_on_delay;
 431
 432             if ( UNLIKELY ( --key_on_delay >= 0 ) ) /* <1% of the time */
 433             {
 434                 key_on(this,voice,sd,raw_voice,key_on_delay,vbit);
 435             }
 436
 437             if ( !(this->keys_down & vbit) ) /* Silent channel */
 438             {
 439         silent_chan:
 440                 raw_voice->envx = 0;
 441                 raw_voice->outx = 0;
 442                 prev_outx = 0;
 443                 continue;
 444             }
 445
 446             /* Envelope */
 447             {
 448                 int const ENV_RANGE = 0x800;
 449                 int env_mode = voice->env_mode;
 450                 int adsr0 = raw_voice->adsr [0];
 451                 int env_timer;
 452                 if ( LIKELY ( env_mode != state_release ) ) /* 99% of the time */
 453                 {
 454                     env_timer = voice->env_timer;
 455                     if ( LIKELY ( adsr0 & 0x80 ) ) /* 79% of the time */
 456                     {
 457                         int adsr1 = raw_voice->adsr [1];
 458                         if ( LIKELY ( env_mode == state_sustain ) ) /* 74% of the time */
 459                         {
 460                             if ( (env_timer -= env_rates [adsr1 & 0x1F]) > 0 )
 461                                 goto write_env_timer;
 462
 463                             int envx = voice->envx;
 464                             envx--; /* envx *= 255 / 256 */
 465                             envx -= envx >> 8;
 466                             voice->envx = envx;
 467                             /* TODO: should this be 8? */
 468                             raw_voice->envx = envx >> 4;
 469                             goto init_env_timer;
 470                         }
 471                         else if ( env_mode < 0 ) /* 25% state_decay */
 472                         {
 473                             int envx = voice->envx;
 474                             if ( (env_timer -=
 475                                 env_rates [(adsr0 >> 3 & 0x0E) + 0x10]) <= 0 )
 476                             {
 477                                 envx--; /* envx *= 255 / 256 */
 478                                 envx -= envx >> 8;
 479                                 voice->envx = envx;
 480                                 /* TODO: should this be 8? */
 481                                 raw_voice->envx = envx >> 4;
 482                                 env_timer = env_rate_init;
 483                             }
 484
 485                             int sustain_level = adsr1 >> 5;
 486                             if ( envx <= (sustain_level + 1) * 0x100 )
 487                                 voice->env_mode = state_sustain;
 488
 489                             goto write_env_timer;
 490                         }
 491                         else /* state_attack */
 492                         {
 493                             int t = adsr0 & 0x0F;
 494                             if ( (env_timer -= env_rates [t * 2 + 1]) > 0 )
 495                                 goto write_env_timer;
 496
 497                             int envx = voice->envx;
 498
 499                             int const step = ENV_RANGE / 64;
 500                             envx += step;
 501                             if ( t == 15 )
 502                                 envx += ENV_RANGE / 2 - step;
 503
 504                             if ( envx >= ENV_RANGE )
 505                             {
 506                                 envx = ENV_RANGE - 1;
 507                                 voice->env_mode = state_decay;
 508                             }
 509                             voice->envx = envx;
 510                             /* TODO: should this be 8? */
 511                             raw_voice->envx = envx >> 4;
 512                             goto init_env_timer;
 513                         }
 514                     }
 515                     else /* gain mode */
 516                     {
 517                         int t = raw_voice->gain;
 518                         if ( t < 0x80 )
 519                         {
 520                             raw_voice->envx = t;
 521                             voice->envx = t << 4;
 522                             goto env_end;
 523                         }
 524                         else
 525                         {
 526                             if ( (env_timer -= env_rates [t & 0x1F]) > 0 )
 527                                 goto write_env_timer;
 528
 529                             int envx = voice->envx;
 530                             int mode = t >> 5;
 531                             if ( mode <= 5 ) /* decay */
 532                             {
 533                                 int step = ENV_RANGE / 64;
 534                                 if ( mode == 5 ) /* exponential */
 535                                 {
 536                                     envx--; /* envx *= 255 / 256 */
 537                                     step = envx >> 8;
 538                                 }
 539                                 if ( (envx -= step) < 0 )
 540                                 {
 541                                     envx = 0;
 542                                     if ( voice->env_mode == state_attack )
 543                                         voice->env_mode = state_decay;
 544                                 }
 545                             }
 546                             else /* attack */
 547                             {
 548                                 int const step = ENV_RANGE / 64;
 549                                 envx += step;
 550                                 if ( mode == 7 &&
 551                                      envx >= ENV_RANGE * 3 / 4 + step )
 552                                     envx += ENV_RANGE / 256 - step;
 553
 554                                 if ( envx >= ENV_RANGE )
 555                                     envx = ENV_RANGE - 1;
 556                             }
 557                             voice->envx = envx;
 558                             /* TODO: should this be 8? */
 559                             raw_voice->envx = envx >> 4;
 560                             goto init_env_timer;
 561                         }
 562                     }
 563                 }
 564                 else /* state_release */
 565                 {
 566                     int envx = voice->envx;
 567                     if ( (envx -= ENV_RANGE / 256) > 0 )
 568                     {
 569                         voice->envx = envx;
 570                         raw_voice->envx = envx >> 8;
 571                         goto env_end;
 572                     }
 573                     else
 574                     {
 575                         /* bit was set, so this clears it */
 576                         this->keys_down ^= vbit;
 577                         voice->envx = 0;
 578                         goto silent_chan;
 579                     }
 580                 }
 581             init_env_timer:
 582                 env_timer = env_rate_init;
 583             write_env_timer:
 584                 voice->env_timer = env_timer;
 585             env_end:;
 586             }
 587 #if 0
 588             EXIT_TIMER(dsp_pregen);
 589
 590             ENTER_TIMER(dsp_gen);
 591 #endif
 592             #if !SPC_BRRCACHE
 593             /* Decode BRR block */
 594             if ( voice->position >= BRR_BLOCK_SIZE * 0x1000 )
 595             {
 596                 voice->position -= BRR_BLOCK_SIZE * 0x1000;
 597
 598                 uint8_t const* addr = voice->addr;
 599                 if ( addr >= RAM + 0x10000 )
 600                     addr -= 0x10000;
 601
 602                 /* action based on previous block's header */
 603                 if ( voice->block_header & 1 )
 604                 {
 605                     addr = RAM + letoh16(sd[raw_voice->waveform].loop);
 606                     this->r.g.wave_ended |= vbit;
 607                     if ( !(voice->block_header & 2) ) /* 1% of the time */
 608                     {
 609                         /* first block was end block;
 610                            don't play anything (verified) */
 611                         /* bit was set, so this clears it */
 612                         this->keys_down ^= vbit;
 613
 614                         /* since voice->envx is 0,
 615                            samples and position don't matter */
 616                         raw_voice->envx = 0;
 617                         voice->envx = 0;
 618                         goto skip_decode;
 619                     }
 620                 }
 621
 622                 /* header */
 623                 int const block_header = *addr;
 624                 addr += 9;
 625                 voice->addr = addr;
 626                 voice->block_header = block_header;
 627
 628                 /* previous samples */
 629                 int smp2 = voice->samples [BRR_BLOCK_SIZE + 1];
 630                 int smp1 = voice->samples [BRR_BLOCK_SIZE + 2];
 631                 voice->samples [0] = voice->samples [BRR_BLOCK_SIZE];
 632
 633                 /* output position */
 634                 short* out = voice->samples + (1 + BRR_BLOCK_SIZE);
 635                 int offset = -BRR_BLOCK_SIZE << 2;
 636
 637                 /* if next block has end flag set,
 638                    this block ends early (verified) */
 639                 if ( (block_header & 3) != 3 && (*addr & 3) == 1 )
 640                 {
 641                     /* arrange for last 9 samples to be skipped */
 642                     int const skip = 9;
 643                     out += (skip & 1);
 644                     voice->samples [skip] = voice->samples [BRR_BLOCK_SIZE];
 645                     voice->position += skip * 0x1000;
 646                     offset = (-BRR_BLOCK_SIZE + (skip & ~1)) << 2;
 647                     addr -= skip / 2;
 648                     /* force sample to end on next decode */
 649                     voice->block_header = 1;
 650                 }
 651
 652                 int const filter = block_header & 0x0c;
 653                 int const scale = block_header >> 4;
 654
 655                 if ( filter == 0x08 ) /* filter 2 (30-90% of the time) */
 656                 {
 657                     /* y[n] = x[n] + 61/32 * y[n-1] - 15/16 * y[n-2] */
 658                     do /* decode and filter 16 samples */
 659                     {
 660                         /* Get nybble, sign-extend, then scale
 661                            get byte, select which nybble, sign-extend, then shift
 662                            based on scaling. */
 663                         int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
 664                         delta = (delta << scale) >> 1;
 665
 666                         if (scale > 0xc)
 667                             delta = (delta >> 17) << 11;
 668
 669                         out [offset >> 2] = smp2;
 670
 671                         delta -= smp2 >> 1;
 672                         delta += smp2 >> 5;
 673                         delta += smp1;
 674                         delta += (-smp1 - (smp1 >> 1)) >> 5;
 675
 676                         delta = CLAMP16( delta );
 677                         smp2 = smp1;
 678                         smp1 = (int16_t) (delta * 2); /* sign-extend */
 679                     }
 680                     while ( (offset += 4) != 0 );
 681                 }
 682                 else if ( filter == 0x04 ) /* filter 1 */
 683                 {
 684                     /* y[n] = x[n] + 15/16 * y[n-1] */
 685                     do /* decode and filter 16 samples */
 686                     {
 687                         /* Get nybble, sign-extend, then scale
 688                            get byte, select which nybble, sign-extend, then shift
 689                            based on scaling. */
 690                         int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
 691                         delta = (delta << scale) >> 1;
 692
 693                         if (scale > 0xc)
 694                             delta = (delta >> 17) << 11;
 695
 696                         out [offset >> 2] = smp2;
 697
 698                         delta += smp1 >> 1;
 699                         delta += (-smp1) >> 5;
 700
 701                         delta = CLAMP16( delta );
 702                         smp2 = smp1;
 703                         smp1 = (int16_t) (delta * 2); /* sign-extend */
 704                     }
 705                     while ( (offset += 4) != 0 );
 706                 }
 707                 else if ( filter == 0x0c ) /* filter 3 */
 708                 {
 709                     /* y[n] = x[n] + 115/64 * y[n-1] - 13/16 * y[n-2] */
 710                     do /* decode and filter 16 samples */
 711                     {
 712                         /* Get nybble, sign-extend, then scale
 713                            get byte, select which nybble, sign-extend, then shift
 714                            based on scaling. */
 715                         int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
 716                         delta = (delta << scale) >> 1;
 717
 718                         if (scale > 0xc)
 719                             delta = (delta >> 17) << 11;
 720
 721                         out [offset >> 2] = smp2;
 722
 723                         delta -= smp2 >> 1;
 724                         delta += (smp2 + (smp2 >> 1)) >> 4;
 725                         delta += smp1;
 726                         delta += (-smp1 * 13) >> 7;
 727
 728                         delta = CLAMP16( delta );
 729                         smp2 = smp1;
 730                         smp1 = (int16_t) (delta * 2); /* sign-extend */
 731                     }
 732                     while ( (offset += 4) != 0 );
 733                 }
 734                 else /* filter 0 */
 735                 {
 736                     /* y[n] = x[n] */
 737                     do /* decode and filter 16 samples */
 738                     {
 739                         /* Get nybble, sign-extend, then scale
 740                            get byte, select which nybble, sign-extend, then shift
 741                            based on scaling. */
 742                         int delta = (int8_t)(addr [offset >> 3] << (offset & 4)) >> 4;
 743                         delta = (delta << scale) >> 1;
 744
 745                         if (scale > 0xc)
 746                             delta = (delta >> 17) << 11;
 747
 748                         out [offset >> 2] = smp2;
 749
 750                         smp2 = smp1;
 751                         smp1 = delta * 2;
 752                     }
 753                     while ( (offset += 4) != 0 );
 754                 }
 755
 756                 out [0] = smp2;
 757                 out [1] = smp1;
 758
 759             skip_decode:;
 760             }
 761         #endif /* !SPC_BRRCACHE */
 762             /* Get rate (with possible modulation) */
 763             int rate = VOICE_RATE(vr);
 764             if ( this->r.g.pitch_mods & vbit )
 765                 rate = (rate * (prev_outx + 32768)) >> 15;
 766
 767         #if !SPC_NOINTERP
 768             /* Interleaved gauss table (to improve cache coherency). */
 769             /* gauss [i * 2 + j] = normal_gauss [(1 - j) * 256 + i] */
 770             static short const gauss [512] =
 771             {
 772 370,1305, 366,1305, 362,1304, 358,1304, 354,1304, 351,1304, 347,1304, 343,1303,
 773 339,1303, 336,1303, 332,1302, 328,1302, 325,1301, 321,1300, 318,1300, 314,1299,
 774 311,1298, 307,1297, 304,1297, 300,1296, 297,1295, 293,1294, 290,1293, 286,1292,
 775 283,1291, 280,1290, 276,1288, 273,1287, 270,1286, 267,1284, 263,1283, 260,1282,
 776 257,1280, 254,1279, 251,1277, 248,1275, 245,1274, 242,1272, 239,1270, 236,1269,
 777 233,1267, 230,1265, 227,1263, 224,1261, 221,1259, 218,1257, 215,1255, 212,1253,
 778 210,1251, 207,1248, 204,1246, 201,1244, 199,1241, 196,1239, 193,1237, 191,1234,
 779 188,1232, 186,1229, 183,1227, 180,1224, 178,1221, 175,1219, 173,1216, 171,1213,
 780 168,1210, 166,1207, 163,1205, 161,1202, 159,1199, 156,1196, 154,1193, 152,1190,
 781 150,1186, 147,1183, 145,1180, 143,1177, 141,1174, 139,1170, 137,1167, 134,1164,
 782 132,1160, 130,1157, 128,1153, 126,1150, 124,1146, 122,1143, 120,1139, 118,1136,
 783 117,1132, 115,1128, 113,1125, 111,1121, 109,1117, 107,1113, 106,1109, 104,1106,
 784 102,1102, 100,1098,  99,1094,  97,1090,  95,1086,  94,1082,  92,1078,  90,1074,
 785  89,1070,  87,1066,  86,1061,  84,1057,  83,1053,  81,1049,  80,1045,  78,1040,
 786  77,1036,  76,1032,  74,1027,  73,1023,  71,1019,  70,1014,  69,1010,  67,1005,
 787  66,1001,  65, 997,  64, 992,  62, 988,  61, 983,  60, 978,  59, 974,  58, 969,
 788  56, 965,  55, 960,  54, 955,  53, 951,  52, 946,  51, 941,  50, 937,  49, 932,
 789  48, 927,  47, 923,  46, 918,  45, 913,  44, 908,  43, 904,  42, 899,  41, 894,
 790  40, 889,  39, 884,  38, 880,  37, 875,  36, 870,  36, 865,  35, 860,  34, 855,
 791  33, 851,  32, 846,  32, 841,  31, 836,  30, 831,  29, 826,  29, 821,  28, 816,
 792  27, 811,  27, 806,  26, 802,  25, 797,  24, 792,  24, 787,  23, 782,  23, 777,
 793  22, 772,  21, 767,  21, 762,  20, 757,  20, 752,  19, 747,  19, 742,  18, 737,
 794  17, 732,  17, 728,  16, 723,  16, 718,  15, 713,  15, 708,  15, 703,  14, 698,
 795  14, 693,  13, 688,  13, 683,  12, 678,  12, 674,  11, 669,  11, 664,  11, 659,
 796  10, 654,  10, 649,  10, 644,   9, 640,   9, 635,   9, 630,   8, 625,   8, 620,
 797   8, 615,   7, 611,   7, 606,   7, 601,   6, 596,   6, 592,   6, 587,   6, 582,
 798   5, 577,   5, 573,   5, 568,   5, 563,   4, 559,   4, 554,   4, 550,   4, 545,
 799   4, 540,   3, 536,   3, 531,   3, 527,   3, 522,   3, 517,   2, 513,   2, 508,
 800   2, 504,   2, 499,   2, 495,   2, 491,   2, 486,   1, 482,   1, 477,   1, 473,
 801   1, 469,   1, 464,   1, 460,   1, 456,   1, 451,   1, 447,   1, 443,   1, 439,
 802   0, 434,   0, 430,   0, 426,   0, 422,   0, 418,   0, 414,   0, 410,   0, 405,
 803   0, 401,   0, 397,   0, 393,   0, 389,   0, 385,   0, 381,   0, 378,   0, 374,
 804             };
 805             /* Gaussian interpolation using most recent 4 samples */
 806             long position = voice->position;
 807             voice->position += rate;
 808             short const* interp = voice->samples + (position >> 12);
 809             int offset = position >> 4 & 0xFF;
 810
 811             /* Only left half of gaussian kernel is in table, so we must mirror
 812                for right half */
 813             short const* fwd = gauss       + offset * 2;
 814             short const* rev = gauss + 510 - offset * 2;
 815
 816             /* Use faster gaussian interpolation when exact result isn't needed
 817                by pitch modulator of next channel */
 818             int amp_0, amp_1; /* Also serve as temps _0, and _1 */
 819             if ( LIKELY ( !(slow_gaussian & vbit) ) ) /* 99% of the time */
 820             {
 821                 /* Main optimization is lack of clamping. Not a problem since
 822                    output never goes more than +/- 16 outside 16-bit range and
 823                    things are clamped later anyway. Other optimization is to
 824                    preserve fractional accuracy, eliminating several masks. */
 825             #if defined (CPU_ARM)
 826                 int output;
 827                 int _2, _3; /* All-purpose temps */
 828                 /* Multiple ASM blocks keep regs free and reduce result
 829                  * latency issues. */
 830             #if ARM_ARCH >= 6
 831                 /* Interpolate */
 832                 asm volatile (
 833                 "ldr     %[_0], [%[interp]]           \r\n" /* _0=i0i1            */
 834                 "ldr     %[_2], [%[fwd]]              \r\n" /* _2=f0f1            */
 835                 "ldr     %[_1], [%[interp], #4]       \r\n" /* _1=i2i3            */
 836                 "ldr     %[_3], [%[rev]]              \r\n" /* _3=r0r1            */
 837                 "smuad   %[out], %[_0], %[_2]         \r\n" /* out=f0*i0 + f1*i1  */
 838                 "smladx  %[out], %[_1], %[_3], %[out] \r\n" /* out+=r1*i2 + r0*i3 */
 839                 : [out]"=r"(output),
 840                   [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
 841                   [_2]"=&r"(_2), [_3]"=r"(_3)
 842                 : [fwd]"r"(fwd), [rev]"r"(rev),
 843                   [interp]"r"(interp));
 844                 /* Apply voice envelope */
 845                 asm volatile (
 846                 "mov     %[_2], %[out], asr #(11-5)   \r\n" /* To do >> 16 later */
 847                 "mul     %[out], %[_2], %[envx]       \r\n" /* and avoid exp. shift */
 848                 : [out]"+r"(output), [_2]"=&r"(_2)
 849                 : [envx]"r"((int)voice->envx));
 850                 /* Apply left and right volume */
 851                 asm volatile (
 852                 "smulwb  %[amp_0], %[out], %[vvol_0]  \r\n" /* (32x16->48)[47:16]->[31:0] */
 853                 "smulwb  %[amp_1], %[out], %[vvol_1]  \r\n"
 854                 : [out]"+r"(output),
 855                   [amp_0]"=&r"(amp_0), [amp_1]"=r"(amp_1)
 856                 : [vvol_0]"r"(voice->volume[0]),
 857                   [vvol_1]"r"(voice->volume[1]));
 858
 859                 raw_voice->outx = output >> (8+5); /* 'output' still 5 bits too big */
 860             #else /* ARM_ARCH < 6 */
 861                 /* Perform gaussian interpolation on four samples */
 862                 asm volatile (
 863                 "ldrsh   %[_0], [%[interp]]           \r\n"
 864                 "ldrsh   %[_2], [%[fwd]]              \r\n"
 865                 "ldrsh   %[_1], [%[interp], #2]       \r\n"
 866                 "ldrsh   %[_3], [%[fwd], #2]          \r\n"
 867                 "mul     %[out], %[_0], %[_2]         \r\n" /* out= fwd[0]*interp[0] */
 868                 "ldrsh   %[_0], [%[interp], #4]       \r\n"
 869                 "ldrsh   %[_2], [%[rev], #2]          \r\n"
 870                 "mla     %[out], %[_1], %[_3], %[out] \r\n" /* out+=fwd[1]*interp[1] */
 871                 "ldrsh   %[_1], [%[interp], #6]       \r\n"
 872                 "ldrsh   %[_3], [%[rev]]              \r\n"
 873                 "mla     %[out], %[_0], %[_2], %[out] \r\n" /* out+=rev[1]*interp[2] */
 874                 "mla     %[out], %[_1], %[_3], %[out] \r\n" /* out+=rev[0]*interp[3] */
 875                 : [out]"=&r"(output),
 876                   [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
 877                   [_2]"=&r"(_2), [_3]"=&r"(_3)
 878                 : [fwd]"r"(fwd), [rev]"r"(rev),
 879                   [interp]"r"(interp));
 880                 /* Apply voice envelope */
 881                 asm volatile (
 882                 "mov     %[_2], %[out], asr #11       \r\n"
 883                 "mul     %[out], %[_2], %[envx]       \r\n"
 884                 : [out]"+r"(output), [_2]"=&r"(_2)
 885                 : [envx]"r"((int)voice->envx));
 886                 /* Reduce and apply left and right volume */
 887                 asm volatile (
 888                 "mov    %[out], %[out], asr #11       \r\n"
 889                 "mul    %[amp_0], %[out], %[vvol_0]   \r\n"
 890                 "mul    %[amp_1], %[out], %[vvol_1]   \r\n"
 891                 : [out]"+r"(output),
 892                   [amp_0]"=&r"(amp_0), [amp_1]"=r"(amp_1)
 893                 : [vvol_0]"r"((int)voice->volume[0]),
 894                   [vvol_1]"r"((int)voice->volume[1]));
 895
 896                 raw_voice->outx = output >> 8;
 897             #endif /* ARM_ARCH */
 898             #else /* Unoptimized CPU */
 899                 int output = (((fwd [0] * interp [0] +
 900                          fwd [1] * interp [1] +
 901                          rev [1] * interp [2] +
 902                          rev [0] * interp [3]    ) >> 11) * voice->envx) >> 11;
 903
 904                 /* duplicated here to give compiler more to run in parallel */
 905                 amp_0 = voice->volume [0] * output;
 906                 amp_1 = voice->volume [1] * output;
 907
 908                 raw_voice->outx = output >> 8;
 909             #endif /* CPU_* */
 910             }
 911             else /* slow gaussian */
 912             {
 913             #if defined(CPU_ARM)
 914             #if ARM_ARCH >= 6
 915                 int output = *(int16_t*) &this->noise;
 916
 917                 if ( !(this->r.g.noise_enables & vbit) )
 918                 {
 919                     /* Interpolate */
 920                     int _2, _3;
 921                     asm volatile (
 922                     /* NOTE: often-unaligned accesses */
 923                     "ldr     %[_0], [%[interp]]            \r\n" /* _0=i0i1   */
 924                     "ldr     %[_2], [%[fwd]]               \r\n" /* _2=f0f1   */
 925                     "ldr     %[_1], [%[interp], #4]        \r\n" /* _1=i2i3   */
 926                     "ldr     %[_3], [%[rev]]               \r\n" /* _3=f2f3   */
 927                     "smulbb  %[out], %[_0], %[_2]          \r\n" /* out=f0*i0 */
 928                     "smultt  %[_0],  %[_0], %[_2]          \r\n" /* _0=f1*i1  */
 929                     "smulbt  %[_2],  %[_1], %[_3]          \r\n" /* _2=r1*i2  */
 930                     "smultb  %[_3],  %[_1], %[_3]          \r\n" /* _3=r0*i3  */
 931                     : [out]"=r"(output),
 932                       [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
 933                       [_2]"=&r"(_2), [_3]"=r"(_3)
 934                     : [fwd]"r"(fwd), [rev]"r"(rev),
 935                       [interp]"r"(interp));
 936                     asm volatile (
 937                     "mov     %[out], %[out], asr#12        \r\n"
 938                     "add     %[_0], %[out], %[_0], asr #12 \r\n"
 939                     "add     %[_2], %[_0], %[_2], asr #12  \r\n"
 940                     "pkhbt   %[_0], %[_2], %[_3], asl #4   \r\n" /* _3[31:16], _2[15:0] */
 941                     "sadd16  %[_0], %[_0], %[_0]           \r\n" /* _3[31:16]*2, _2[15:0]*2 */
 942                     "qsubaddx %[out], %[_0], %[_0]         \r\n" /* out[15:0]=
 943                                                                   * sat16(_3[31:16]+_2[15:0]) */
 944                     : [out]"+r"(output),
 945                       [_0]"+r"(amp_0), [_2]"+r"(_2), [_3]"+r"(_3));
 946                 }
 947                 /* Apply voice envelope */
 948                 asm volatile (
 949                 "smulbb  %[out], %[out], %[envx]       \r\n"
 950                 : [out]"+r"(output)
 951                 : [envx]"r"(voice->envx));
 952                 /* Reduce and apply left and right volume */
 953                 asm volatile (
 954                 "mov     %[out], %[out], asr #11       \r\n"
 955                 "bic     %[out], %[out], #0x1          \r\n"
 956                 "mul     %[amp_0], %[out], %[vvol_0]   \r\n"
 957                 "mul     %[amp_1], %[out], %[vvol_1]   \r\n"
 958                 : [out]"+r"(output),
 959                   [amp_0]"=&r"(amp_0), [amp_1]"=r"(amp_1)
 960                 : [vvol_0]"r"((int)voice->volume[0]),
 961                   [vvol_1]"r"((int)voice->volume[1]));
 962
 963                 prev_outx = output;
 964                 raw_voice->outx = output >> 8;
 965             #else /* ARM_ARCH < 6 */
 966                 int output = *(int16_t*) &this->noise;
 967
 968                 if ( !(this->r.g.noise_enables & vbit) )
 969                 {
 970                     /* Interpolate */
 971                     int _2, _3;
 972                     asm volatile (
 973                     "ldrsh   %[_0], [%[interp]]            \r\n"
 974                     "ldrsh   %[_2], [%[fwd]]               \r\n"
 975                     "ldrsh   %[_1], [%[interp], #2]        \r\n"
 976                     "ldrsh   %[_3], [%[fwd], #2]           \r\n"
 977                     "mul     %[out], %[_2], %[_0]          \r\n" /* fwd[0]*interp[0] */
 978                     "ldrsh   %[_2], [%[rev], #2]           \r\n"
 979                     "mul     %[_0], %[_3], %[_1]           \r\n" /* fwd[1]*interp[1] */
 980                     "ldrsh   %[_1], [%[interp], #4]        \r\n"
 981                     "mov     %[out], %[out], asr #12       \r\n"
 982                     "ldrsh   %[_3], [%[rev]]               \r\n"
 983                     "mul     %[_2], %[_1], %[_2]           \r\n" /* rev[1]*interp[2] */
 984                     "ldrsh   %[_1], [%[interp], #6]        \r\n"
 985                     "add     %[_0], %[out], %[_0], asr #12 \r\n"
 986                     "mul     %[_3], %[_1], %[_3]           \r\n" /* rev[0]*interp[3] */
 987                     "add     %[_2], %[_0], %[_2], asr #12  \r\n"
 988                     "mov     %[_2], %[_2], lsl #17         \r\n"
 989                     "mov     %[_3], %[_3], asr #12         \r\n"
 990                     "mov     %[_3], %[_3], asl #1          \r\n"
 991                     "add     %[out], %[_3], %[_2], asr #16 \r\n"
 992                     : [out]"=&r"(output),
 993                       [_0]"=&r"(amp_0), [_1]"=&r"(amp_1),
 994                       [_2]"=&r"(_2), [_3]"=&r"(_3)
 995                     : [fwd]"r"(fwd), [rev]"r"(rev),
 996                       [interp]"r"(interp));
 997
 998                     output = CLAMP16(output);
 999                 }
1000                 /* Apply voice envelope */
1001                 asm volatile (
1002                     "mul     %[_0], %[out], %[envx]        \r\n"
1003                     : [_0]"=r"(amp_0)
1004                     : [out]"r"(output), [envx]"r"((int)voice->envx));
1005                 /* Reduce and apply left and right volume */
1006                 asm volatile (
1007                     "mov     %[out], %[amp_0], asr #11     \r\n" /* amp_0 = _0 */
1008                     "bic     %[out], %[out], #0x1          \r\n"
1009                     "mul     %[amp_0], %[out], %[vvol_0]   \r\n"
1010                     "mul     %[amp_1], %[out], %[vvol_1]   \r\n"
1011                 : [out]"+r"(output),
1012                   [amp_0]"+r"(amp_0), [amp_1]"=r"(amp_1)
1013                 : [vvol_0]"r"((int)voice->volume[0]),
1014                   [vvol_1]"r"((int)voice->volume[1]));
1015
1016                 prev_outx = output;
1017                 raw_voice->outx = output >> 8;
1018             #endif /* ARM_ARCH >= 6 */
1019             #else /* Unoptimized CPU */
1020                 int output = *(int16_t*) &this->noise;
1021
1022                 if ( !(this->r.g.noise_enables & vbit) )
1023                 {
1024                     output = (fwd [0] * interp [0]) & ~0xFFF;
1025                     output = (output + fwd [1] * interp [1]) & ~0xFFF;
1026                     output = (output + rev [1] * interp [2]) >> 12;
1027                     output = (int16_t) (output * 2);
1028                     output += ((rev [0] * interp [3]) >> 12) * 2;
1029                     output = CLAMP16( output );
1030                 }
1031                 output = (output * voice->envx) >> 11 & ~1;
1032
1033                 /* duplicated here to give compiler more to run in parallel */
1034                 amp_0 = voice->volume [0] * output;
1035                 amp_1 = voice->volume [1] * output;
1036
1037                 prev_outx = output;
1038                 raw_voice->outx = output >> 8;
1039             #endif /* CPU_* */
1040             }
1041         #else /* SPCNOINTERP */
1042         /* two-point linear interpolation */
1043         #ifdef CPU_COLDFIRE
1044             int amp_0 = (int16_t)this->noise;
1045             int amp_1;
1046
1047             if ( (this->r.g.noise_enables & vbit) == 0 )
1048             {
1049                 uint32_t f = voice->position;
1050                 int32_t y0;
1051
1052                 /**
1053                  * Formula (fastest found so far of MANY):
1054                  * output = y0 + f*y1 - f*y0
1055                  */
1056                 asm volatile (
1057                 /* separate fractional and whole parts   */
1058                 "move.l     %[f], %[y1]               \r\n"
1059                 "and.l      #0xfff, %[f]              \r\n"
1060                 "lsr.l      %[sh], %[y1]              \r\n"
1061                 /* load samples y0 (upper) & y1 (lower)  */
1062                 "move.l     2(%[s], %[y1].l*2), %[y1] \r\n"
1063                 /* %acc0 = f*y1                          */
1064                 "mac.w      %[f]l, %[y1]l, %%acc0     \r\n"
1065                 /* %acc0 -= f*y0                         */
1066                 "msac.w     %[f]l, %[y1]u, %%acc0     \r\n"
1067                 /* separate out y0 and sign extend       */
1068                 "swap       %[y1]                     \r\n"
1069                 "movea.w    %[y1], %[y0]              \r\n"
1070                 /* fetch result, scale down and add y0   */
1071                 "movclr.l   %%acc0, %[y1]             \r\n"
1072                 /* output = y0 + (result >> 12)          */
1073                 "asr.l      %[sh], %[y1]              \r\n"
1074                 "add.l      %[y0], %[y1]              \r\n"
1075                 : [f]"+d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0)
1076                 : [s]"a"(voice->samples), [sh]"d"(12));
1077             }
1078
1079             /* apply voice envelope to output */
1080             asm volatile (
1081             "mac.w %[out]l, %[envx]l, %%acc0 \r\n"
1082             :
1083             : [out]"r"(amp_0), [envx]"r"(voice->envx));
1084
1085             /* advance voice position */
1086             voice->position += rate;
1087
1088             /* fetch output, scale and apply left and right
1089                voice volume */
1090             asm volatile (
1091             "movclr.l %%acc0,    %[out]         \r\n"
1092             "asr.l    %[sh],     %[out]         \r\n"
1093             "mac.l    %[vvol_0], %[out], %%acc0 \r\n"
1094             "mac.l    %[vvol_1], %[out], %%acc1 \r\n"
1095             : [out]"=&d"(amp_0)
1096             : [vvol_0]"r"((int)voice->volume[0]),
1097               [vvol_1]"r"((int)voice->volume[1]),
1098               [sh]"d"(11));
1099
1100             /* save this output into previous, scale and save in
1101                output register */
1102             prev_outx = amp_0;
1103             raw_voice->outx = amp_0 >> 8;
1104
1105             /* fetch final voice output */
1106             asm volatile (
1107             "movclr.l %%acc0, %[amp_0] \r\n"
1108             "movclr.l %%acc1, %[amp_1] \r\n"
1109             : [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1));
1110         #elif defined (CPU_ARM)
1111             int amp_0, amp_1;
1112
1113             if ( (this->r.g.noise_enables & vbit) != 0 )
1114             {
1115                 amp_0 = *(int16_t *)&this->noise;
1116             }
1117             else
1118             {
1119                 uint32_t f = voice->position;
1120                 amp_0 = (uint32_t)voice->samples;
1121
1122                 asm volatile(
1123                 "mov    %[y1], %[f], lsr #12        \r\n"
1124                 "eor    %[f], %[f], %[y1], lsl #12  \r\n"
1125                 "add    %[y1], %[y0], %[y1], lsl #1 \r\n"
1126                 "ldrsh  %[y0], [%[y1], #2]          \r\n"
1127                 "ldrsh  %[y1], [%[y1], #4]          \r\n"
1128                 "sub    %[y1], %[y1], %[y0]         \r\n"
1129                 "mul    %[f], %[y1], %[f]           \r\n"
1130                 "add    %[y0], %[y0], %[f], asr #12 \r\n"
1131                 : [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1));
1132             }
1133
1134             voice->position += rate;
1135
1136             asm volatile(
1137             "mul    %[amp_1], %[amp_0], %[envx] \r\n"
1138             "mov    %[amp_0], %[amp_1], asr #11 \r\n"
1139             "mov    %[amp_1], %[amp_0], asr #8  \r\n"
1140             : [amp_0]"+r"(amp_0), [amp_1]"=r"(amp_1)
1141             : [envx]"r"(voice->envx));
1142
1143             prev_outx = amp_0;
1144             raw_voice->outx = (int8_t)amp_1;
1145
1146             asm volatile(
1147             "mul    %[amp_1], %[amp_0], %[vol_1] \r\n"
1148             "mul    %[amp_0], %[vol_0], %[amp_0] \r\n"
1149             : [amp_0]"+r"(amp_0), [amp_1]"=&r"(amp_1)
1150             : [vol_0]"r"((int)voice->volume[0]),
1151               [vol_1]"r"((int)voice->volume[1]));
1152         #else /* Unoptimized CPU */
1153             int output;
1154
1155             if ( (this->r.g.noise_enables & vbit) == 0 )
1156             {
1157                 int const fraction = voice->position & 0xfff;
1158                 short const* const pos = (voice->samples + (voice->position >> 12)) + 1;
1159                 output = pos[0] + ((fraction * (pos[1] - pos[0])) >> 12);
1160             } else {
1161                 output = *(int16_t *)&this->noise;
1162             }
1163
1164             voice->position += rate;
1165
1166             output = (output * voice->envx) >> 11;
1167
1168             /* duplicated here to give compiler more to run in parallel */
1169             int amp_0 = voice->volume [0] * output;
1170             int amp_1 = voice->volume [1] * output;
1171
1172             prev_outx = output;
1173             raw_voice->outx = (int8_t) (output >> 8);
1174         #endif /* CPU_* */
1175         #endif /* SPCNOINTERP */
1176
1177         #if SPC_BRRCACHE
1178             if ( voice->position >= voice->wave_end )
1179             {
1180                 long loop_len = voice->wave_loop << 12;
1181                 voice->position -= loop_len;
1182                 this->r.g.wave_ended |= vbit;
1183                 if ( !loop_len )
1184                 {
1185                     this->keys_down ^= vbit;
1186                     raw_voice->envx = 0;
1187                     voice->envx = 0;
1188                 }
1189             }
1190         #endif
1191 #if 0
1192             EXIT_TIMER(dsp_gen);
1193
1194             ENTER_TIMER(dsp_mix);
1195 #endif
1196             chans_0 += amp_0;
1197             chans_1 += amp_1;
1198             #if !SPC_NOECHO
1199                 if ( this->r.g.echo_ons & vbit )
1200                 {
1201                     echo_0 += amp_0;
1202                     echo_1 += amp_1;
1203                 }
1204             #endif
1205 #if 0
1206             EXIT_TIMER(dsp_mix);
1207 #endif
1208         }
1209         /* end of voice loop */
1210
1211     #if !SPC_NOECHO
1212     #ifdef CPU_COLDFIRE
1213         /* Read feedback from echo buffer */
1214         int echo_pos = this->echo_pos;
1215         uint8_t* const echo_ptr = RAM + ((echo_start + echo_pos) & 0xFFFF);
1216         echo_pos += 4;
1217         if ( echo_pos >= echo_wrap )
1218             echo_pos = 0;
1219         this->echo_pos = echo_pos;
1220         int fb = swap_odd_even32(*(int32_t *)echo_ptr);
1221         int out_0, out_1;
1222
1223         /* Keep last 8 samples */
1224         *this->last_fir_ptr = fb;
1225         this->last_fir_ptr  = this->fir_ptr;
1226
1227         /* Apply echo FIR filter to output samples read from echo buffer -
1228            circular buffer is hardware incremented and masked; FIR
1229            coefficients and buffer history are loaded in parallel with
1230            multiply accumulate operations. Shift left by one here and once
1231            again when calculating feedback to have sample values justified
1232            to bit 31 in the output to ease endian swap, interleaving and
1233            clamping before placing result in the program's echo buffer. */
1234         int _0, _1, _2;
1235         asm volatile (
1236         "move.l                           (%[fir_c])  , %[_2]         \r\n"
1237         "mac.w      %[fb]u, %[_2]u, <<,   (%[fir_p])+&, %[_0], %%acc0 \r\n"
1238         "mac.w      %[fb]l, %[_2]u, <<,   (%[fir_p])& , %[_1], %%acc1 \r\n"
1239         "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
1240         "mac.w      %[_0]l, %[_2]l, <<,  4(%[fir_c])  , %[_2], %%acc1 \r\n"
1241         "mac.w      %[_1]u, %[_2]u, <<,  4(%[fir_p])& , %[_0], %%acc0 \r\n"
1242         "mac.w      %[_1]l, %[_2]u, <<,  8(%[fir_p])& , %[_1], %%acc1 \r\n"
1243         "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
1244         "mac.w      %[_0]l, %[_2]l, <<,  8(%[fir_c])  , %[_2], %%acc1 \r\n"
1245         "mac.w      %[_1]u, %[_2]u, <<, 12(%[fir_p])& , %[_0], %%acc0 \r\n"
1246         "mac.w      %[_1]l, %[_2]u, <<, 16(%[fir_p])& , %[_1], %%acc1 \r\n"
1247         "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
1248         "mac.w      %[_0]l, %[_2]l, <<, 12(%[fir_c])  , %[_2], %%acc1 \r\n"
1249         "mac.w      %[_1]u, %[_2]u, <<, 20(%[fir_p])& , %[_0], %%acc0 \r\n"
1250         "mac.w      %[_1]l, %[_2]u, <<                       , %%acc1 \r\n"
1251         "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
1252         "mac.w      %[_0]l, %[_2]l, <<                       , %%acc1 \r\n"
1253         : [_0]"=&r"(_0), [_1]"=&r"(_1), [_2]"=&r"(_2),
1254           [fir_p]"+a"(this->fir_ptr)
1255         : [fir_c]"a"(this->fir_coeff), [fb]"r"(fb)
1256         );
1257
1258         /* Generate output */
1259         asm volatile (
1260         /* fetch filter results _after_ gcc loads asm
1261            block parameters to eliminate emac stalls   */
1262         "movclr.l   %%acc0, %[out_0]                \r\n"
1263         "movclr.l   %%acc1, %[out_1]                \r\n"
1264         /* apply global volume                         */
1265         "mac.l      %[chans_0], %[gv_0]    , %%acc2 \r\n"
1266         "mac.l      %[chans_1], %[gv_1]    , %%acc3 \r\n"
1267         /* apply echo volume and add to final output   */
1268         "mac.l      %[ev_0],   %[out_0], >>, %%acc2 \r\n"
1269         "mac.l      %[ev_1],   %[out_1], >>, %%acc3 \r\n"
1270         : [out_0]"=&r"(out_0), [out_1]"=&r"(out_1)
1271         : [chans_0]"r"(chans_0), [gv_0]"r"(global_vol_0),
1272           [ev_0]"r"((int)this->r.g.echo_volume_0),
1273           [chans_1]"r"(chans_1), [gv_1]"r"(global_vol_1),
1274           [ev_1]"r"((int)this->r.g.echo_volume_1)
1275         );
1276
1277         /* Feedback into echo buffer */
1278         if ( !(this->r.g.flags & 0x20) )
1279         {
1280             asm volatile (
1281             /* scale echo voices; saturate if overflow */
1282             "mac.l      %[sh], %[e1]       , %%acc1 \r\n"
1283             "mac.l      %[sh], %[e0]       , %%acc0 \r\n"
1284             /* add scaled output from FIR filter       */
1285             "mac.l      %[out_1], %[ef], <<, %%acc1 \r\n"
1286             "mac.l      %[out_0], %[ef], <<, %%acc0 \r\n"
1287             /* swap and fetch feedback results - simply
1288                swap_odd_even32 mixed in between macs and
1289                movclrs to mitigate stall issues        */
1290             "move.l     #0x00ff00ff, %[sh]          \r\n"
1291             "movclr.l   %%acc1, %[e1]               \r\n"
1292             "swap       %[e1]                       \r\n"
1293             "movclr.l   %%acc0, %[e0]               \r\n"
1294             "move.w     %[e1], %[e0]                \r\n"
1295             "and.l      %[e0], %[sh]                \r\n"
1296             "eor.l      %[sh], %[e0]                \r\n"
1297             "lsl.l      #8, %[sh]                   \r\n"
1298             "lsr.l      #8, %[e0]                   \r\n"
1299             "or.l       %[sh], %[e0]                \r\n"
1300             /* save final feedback into echo buffer    */
1301             "move.l     %[e0], (%[echo_ptr])        \r\n"
1302             : [e0]"+d"(echo_0), [e1]"+d"(echo_1)
1303             : [out_0]"r"(out_0), [out_1]"r"(out_1),
1304               [ef]"r"((int)this->r.g.echo_feedback),
1305               [echo_ptr]"a"((int32_t *)echo_ptr),
1306               [sh]"d"(1 << 9)
1307             );
1308         }
1309
1310         /* Output final samples */
1311         asm volatile (
1312         /* fetch output saved in %acc2 and %acc3 */
1313         "movclr.l   %%acc2, %[out_0] \r\n"
1314         "movclr.l   %%acc3, %[out_1] \r\n"
1315         /* scale right by global_muting shift    */
1316         "asr.l      %[gm],  %[out_0] \r\n"
1317         "asr.l      %[gm],  %[out_1] \r\n"
1318         : [out_0]"=&d"(out_0), [out_1]"=&d"(out_1)
1319         : [gm]"d"(global_muting)
1320         );
1321
1322         out_buf [             0] = out_0;
1323         out_buf [WAV_CHUNK_SIZE] = out_1;
1324         out_buf ++;
1325     #elif defined (CPU_ARM)
1326         /* Read feedback from echo buffer */
1327         int echo_pos = this->echo_pos;
1328         uint8_t* const echo_ptr = RAM +
1329                 ((this->r.g.echo_page * 0x100 + echo_pos) & 0xFFFF);
1330         echo_pos += 4;
1331         if ( echo_pos >= (this->r.g.echo_delay & 15) * 0x800 )
1332             echo_pos = 0;
1333         this->echo_pos = echo_pos;
1334
1335     #if ARM_ARCH >= 6
1336         int32_t *fir_ptr, *fir_coeff;
1337         int fb_0, fb_1;
1338
1339         /* Apply FIR */
1340         fb_0 = *(uint32_t *)echo_ptr;
1341
1342         /* Keep last 8 samples */
1343         asm volatile (
1344         "add    %[fir_p], %[t_fir_p], #4      \r\n"
1345         "bic    %[t_fir_p], %[fir_p], %[mask] \r\n"
1346         "str    %[fb_0], [%[fir_p], #-4]      \r\n"
1347         /* duplicate at +8 eliminates wrap checking below */
1348         "str    %[fb_0], [%[fir_p], #28]      \r\n"
1349         : [fir_p]"=&r"(fir_ptr), [t_fir_p]"+r"(this->fir_ptr)
1350         : [fb_0]"r"(fb_0), [mask]"i"(~FIR_BUF_MASK));
1351
1352         fir_coeff = (int32_t *)this->fir_coeff;
1353
1354         /* Fugly, but the best version found. */
1355         int _0;
1356         asm volatile (                             /* L0R0 = acc0          */
1357         "ldmia   %[fir_p]!, { r2-r5 }        \r\n" /* L1R1-L4R4 = r2-r5    */
1358         "ldmia   %[fir_c]!, { r0-r1 }        \r\n" /* C0C1-C2C3 = r0-r1    */
1359         "pkhbt   %[_0], %[acc0], r2, asl #16 \r\n" /* L0R0,L1R1->L0L1,R0R1 */
1360         "pkhtb   r2, r2, %[acc0], asr #16    \r\n"
1361         "smuad   %[acc0], %[_0], r0          \r\n" /* acc0=L0*C0+L1*C1     */
1362         "smuad   %[acc1], r2, r0             \r\n" /* acc1=R0*C0+R1*C1     */
1363         "pkhbt   %[_0], r3, r4, asl #16      \r\n" /* L2R2,L3R3->L2L3,R2R3 */
1364         "pkhtb   r4, r4, r3, asr #16         \r\n"
1365         "smlad   %[acc0], %[_0], r1, %[acc0] \r\n" /* acc0+=L2*C2+L3*C3    */
1366         "smlad   %[acc1], r4, r1, %[acc1]    \r\n" /* acc1+=R2*C2+R3*C3    */
1367         "ldmia   %[fir_p], { r2-r4 }         \r\n" /* L5R5-L7R7 = r2-r4    */
1368         "ldmia   %[fir_c], { r0-r1 }         \r\n" /* C4C5-C6C7 = r0-r1    */
1369         "pkhbt   %[_0], r5, r2, asl #16      \r\n" /* L4R4,L5R5->L4L5,R4R5 */
1370         "pkhtb   r2, r2, r5, asr #16         \r\n"
1371         "smlad   %[acc0], %[_0], r0, %[acc0] \r\n" /* acc0+=L4*C4+L5*C5    */
1372         "smlad   %[acc1], r2, r0, %[acc1]    \r\n" /* acc1+=R4*C4+R5*C5    */
1373         "pkhbt   %[_0], r3, r4, asl #16      \r\n" /* L6R6,L7R7->L6L7,R6R7 */
1374         "pkhtb   r4, r4, r3, asr #16         \r\n"
1375         "smlad   %[acc0], %[_0], r1, %[acc0] \r\n" /* acc0+=L6*C6+L7*C7    */
1376         "smlad   %[acc1], r4, r1, %[acc1]    \r\n" /* acc1+=R6*C6+R7*C7    */
1377         : [acc0]"+r"(fb_0), [acc1]"=&r"(fb_1), [_0]"=&r"(_0),
1378           [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff)
1379         :
1380         : "r0", "r1", "r2", "r3", "r4", "r5");
1381
1382         /* Generate output */
1383         int amp_0, amp_1;
1384
1385         asm volatile (
1386         "mul     %[amp_0], %[gvol_0], %[chans_0] \r\n"
1387         "mul     %[amp_1], %[gvol_1], %[chans_1] \r\n"
1388         : [amp_0]"=&r"(amp_0), [amp_1]"=r"(amp_1)
1389         : [gvol_0]"r"(global_vol_0), [gvol_1]"r"(global_vol_1),
1390           [chans_0]"r"(chans_0), [chans_1]"r"(chans_1));
1391         asm volatile (
1392         "mla     %[amp_0], %[fb_0], %[ev_0], %[amp_0] \r\n"
1393         "mla     %[amp_1], %[fb_1], %[ev_1], %[amp_1] \r\n"
1394         : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1)
1395         : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1),
1396           [ev_0]"r"((int)this->r.g.echo_volume_0),
1397           [ev_1]"r"((int)this->r.g.echo_volume_1));
1398
1399         out_buf [             0] = amp_0 >> global_muting;
1400         out_buf [WAV_CHUNK_SIZE] = amp_1 >> global_muting;
1401         out_buf ++;
1402
1403         if ( !(this->r.g.flags & 0x20) )
1404         {
1405             /* Feedback into echo buffer */
1406             int e0, e1;
1407
1408             asm volatile (
1409             "mov     %[e0], %[echo_0], asl #7      \r\n"
1410             "mov     %[e1], %[echo_1], asl #7      \r\n"
1411             "mla     %[e0], %[fb_0], %[efb], %[e0] \r\n"
1412             "mla     %[e1], %[fb_1], %[efb], %[e1] \r\n"
1413             : [e0]"=&r"(e0), [e1]"=&r"(e1)
1414             : [echo_0]"r"(echo_0), [echo_1]"r"(echo_1),
1415               [fb_0]"r"(fb_0), [fb_1]"r"(fb_1),
1416               [efb]"r"((int)this->r.g.echo_feedback));
1417             asm volatile (
1418             "ssat    %[e0], #16, %[e0], asr #14    \r\n"
1419             "ssat    %[e1], #16, %[e1], asr #14    \r\n"
1420             "pkhbt   %[e0], %[e0], %[e1], lsl #16  \r\n"
1421             "str     %[e0], [%[echo_p]]            \r\n"
1422             : [e0]"+r"(e0), [e1]"+r"(e1)
1423             : [echo_p]"r"(echo_ptr));
1424         }
1425     #else /* ARM_ARCH < 6 */
1426         int fb_0 = GET_LE16SA( echo_ptr     );
1427         int fb_1 = GET_LE16SA( echo_ptr + 2 );
1428         int32_t *fir_ptr, *fir_coeff;
1429
1430         /* Keep last 8 samples */
1431
1432         /* Apply FIR */
1433         asm volatile (
1434         "add    %[fir_p], %[t_fir_p], #8      \r\n"
1435         "bic    %[t_fir_p], %[fir_p], %[mask] \r\n"
1436         "str    %[fb_0], [%[fir_p], #-8]      \r\n"
1437         "str    %[fb_1], [%[fir_p], #-4]      \r\n"
1438         /* duplicate at +8 eliminates wrap checking below */
1439         "str    %[fb_0], [%[fir_p], #56]      \r\n"
1440         "str    %[fb_1], [%[fir_p], #60]      \r\n"
1441         : [fir_p]"=&r"(fir_ptr), [t_fir_p]"+r"(this->fir_ptr)
1442         : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1), [mask]"i"(~FIR_BUF_MASK));
1443
1444         fir_coeff = this->fir_coeff;
1445
1446         asm volatile (
1447         "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
1448         "ldmia  %[fir_p]!, { r4-r5 }     \r\n"
1449         "mul    %[fb_0],     r0, %[fb_0] \r\n"
1450         "mul    %[fb_1],     r0, %[fb_1] \r\n"
1451         "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
1452         "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
1453         "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
1454         "ldmia  %[fir_p]!, { r2-r5 }     \r\n"
1455         "mla    %[fb_0], r2, r0, %[fb_0] \r\n"
1456         "mla    %[fb_1], r3, r0, %[fb_1] \r\n"
1457         "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
1458         "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
1459         "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
1460         "ldmia  %[fir_p]!, { r2-r5 }     \r\n"
1461         "mla    %[fb_0], r2, r0, %[fb_0] \r\n"
1462         "mla    %[fb_1], r3, r0, %[fb_1] \r\n"
1463         "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
1464         "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
1465         "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
1466         "ldmia  %[fir_p]!, { r2-r5 }     \r\n"
1467         "mla    %[fb_0], r2, r0, %[fb_0] \r\n"
1468         "mla    %[fb_1], r3, r0, %[fb_1] \r\n"
1469         "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
1470         "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
1471         : [fb_0]"+r"(fb_0), [fb_1]"+r"(fb_1),
1472           [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff)
1473         :
1474         : "r0", "r1", "r2", "r3", "r4", "r5");
1475
1476         /* Generate output */
1477         int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0)
1478                     >> global_muting;
1479         int amp_1 = (chans_1 * global_vol_1 + fb_1 * this->r.g.echo_volume_1)
1480                     >> global_muting;
1481
1482         out_buf [             0] = amp_0;
1483         out_buf [WAV_CHUNK_SIZE] = amp_1;
1484         out_buf ++;
1485
1486         if ( !(this->r.g.flags & 0x20) )
1487         {
1488             /* Feedback into echo buffer */
1489             int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
1490             int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
1491             e0 = CLAMP16( e0 );
1492             SET_LE16A( echo_ptr    , e0 );
1493             e1 = CLAMP16( e1 );
1494             SET_LE16A( echo_ptr + 2, e1 );
1495         }
1496     #endif /* ARM_ARCH */
1497     #else /* Unoptimized CPU */
1498         /* Read feedback from echo buffer */
1499         int echo_pos = this->echo_pos;
1500         uint8_t* const echo_ptr = RAM +
1501                 ((this->r.g.echo_page * 0x100 + echo_pos) & 0xFFFF);
1502         echo_pos += 4;
1503         if ( echo_pos >= (this->r.g.echo_delay & 15) * 0x800 )
1504             echo_pos = 0;
1505         this->echo_pos = echo_pos;
1506         int fb_0 = GET_LE16SA( echo_ptr     );
1507         int fb_1 = GET_LE16SA( echo_ptr + 2 );
1508
1509         /* Keep last 8 samples */
1510         int (* const fir_ptr) [2] = this->fir_buf + this->fir_pos;
1511         this->fir_pos = (this->fir_pos + 1) & (FIR_BUF_HALF - 1);
1512         fir_ptr [           0] [0] = fb_0;
1513         fir_ptr [           0] [1] = fb_1;
1514         /* duplicate at +8 eliminates wrap checking below */
1515         fir_ptr [FIR_BUF_HALF] [0] = fb_0;
1516         fir_ptr [FIR_BUF_HALF] [1] = fb_1;
1517
1518         /* Apply FIR */
1519         fb_0 *= this->fir_coeff [0];
1520         fb_1 *= this->fir_coeff [0];
1521
1522         #define DO_PT( i )\
1523             fb_0 += fir_ptr [i] [0] * this->fir_coeff [i];\
1524             fb_1 += fir_ptr [i] [1] * this->fir_coeff [i];
1525
1526         DO_PT( 1 )
1527         DO_PT( 2 )
1528         DO_PT( 3 )
1529         DO_PT( 4 )
1530         DO_PT( 5 )
1531         DO_PT( 6 )
1532         DO_PT( 7 )
1533
1534         /* Generate output */
1535         int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0)
1536                     >> global_muting;
1537         int amp_1 = (chans_1 * global_vol_1 + fb_1 * this->r.g.echo_volume_1)
1538                     >> global_muting;
1539         out_buf [             0] = amp_0;
1540         out_buf [WAV_CHUNK_SIZE] = amp_1;
1541         out_buf ++;
1542
1543         if ( !(this->r.g.flags & 0x20) )
1544         {
1545             /* Feedback into echo buffer */
1546             int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
1547             int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
1548             e0 = CLAMP16( e0 );
1549             SET_LE16A( echo_ptr    , e0 );
1550             e1 = CLAMP16( e1 );
1551             SET_LE16A( echo_ptr + 2, e1 );
1552         }
1553     #endif /* CPU_* */
1554     #else /* SPCNOECHO == 1*/
1555         /* Generate output  */
1556         int amp_0 = (chans_0 * global_vol_0) >> global_muting;
1557         int amp_1 = (chans_1 * global_vol_1) >> global_muting;
1558         out_buf [             0] = amp_0;
1559         out_buf [WAV_CHUNK_SIZE] = amp_1;
1560         out_buf ++;
1561     #endif /* SPCNOECHO */
1562     }
1563     while ( --count );
1564 #if 0
1565     EXIT_TIMER(dsp);
1566     ENTER_TIMER(cpu);
1567 #endif
1568 }
1569
1570 void DSP_reset( struct Spc_Dsp* this )
1571 {
1572     this->keys_down   = 0;
1573     this->echo_pos    = 0;
1574     this->noise_count = 0;
1575     this->noise       = 2;
1576
1577     this->r.g.flags   = 0xE0; /* reset, mute, echo off */
1578     this->r.g.key_ons = 0;
1579
1580     ci->memset( this->voice_state, 0, sizeof this->voice_state );
1581
1582     int i;
1583     for ( i = VOICE_COUNT; --i >= 0; )
1584     {
1585         struct voice_t* v = this->voice_state + i;
1586         v->env_mode = state_release;
1587         v->addr     = ram.ram;
1588     }
1589
1590     #if SPC_BRRCACHE
1591         this->oldsize = 0;
1592         for ( i = 0; i < 256; i++ )
1593             this->wave_entry [i].start_addr = -1;
1594     #endif
1595
1596 #if defined(CPU_COLDFIRE)
1597     this->fir_ptr = fir_buf;
1598     this->last_fir_ptr = &fir_buf [7];
1599     ci->memset( fir_buf, 0, sizeof fir_buf );
1600 #elif defined (CPU_ARM)
1601     this->fir_ptr = fir_buf;
1602     ci->memset( fir_buf, 0, sizeof fir_buf );
1603 #else
1604     this->fir_pos = 0;
1605     ci->memset( this->fir_buf, 0, sizeof this->fir_buf );
1606 #endif
1607
1608     assert( offsetof (struct globals_t,unused9 [2]) == REGISTER_COUNT );
1609     assert( sizeof (this->r.voice) == REGISTER_COUNT );
1610 }