2 Copyright (C) 2000 Paul Davis
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 #define _ISOC9X_SOURCE 1
21 #define _ISOC99_SOURCE 1
23 #define __USE_ISOC9X 1
24 #define __USE_ISOC99 1
38 #if defined (__SSE2__) && !defined (__sun__)
39 #include <emmintrin.h>
41 #include <smmintrin.h>
45 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
49 /* Notes about these *_SCALING values.
51 the MAX_<N>BIT values are floating point. when multiplied by
52 a full-scale normalized floating point sample value (-1.0..+1.0)
53 they should give the maximum value representable with an integer
54 sample type of N bits. Note that this is asymmetric. Sample ranges
55 for signed integer, 2's complement values are -(2^(N-1) to +(2^(N-1)-1)
59 If we use +2^(N-1) for the scaling factors, we run into a problem:
61 if we start with a normalized float value of -1.0, scaling
62 to 24 bits would give -8388608 (-2^23), which is ideal.
63 But with +1.0, we get +8388608, which is technically out of range.
65 We never multiply a full range normalized value by this constant,
66 but we could multiply it by a positive value that is close enough to +1.0
67 to produce a value > +(2^(N-1)-1.
69 There is no way around this paradox without wasting CPU cycles to determine
70 which scaling factor to use (i.e. determine if its negative or not,
71 use the right factor).
73 So, for now (October 2008) we use 2^(N-1)-1 as the scaling factor.
76 #define SAMPLE_24BIT_SCALING 8388607.0f
77 #define SAMPLE_16BIT_SCALING 32767.0f
79 /* these are just values to use if the floating point value was out of range
81 advice from Fons Adriaensen: make the limits symmetrical
84 #define SAMPLE_24BIT_MAX 8388607
85 #define SAMPLE_24BIT_MIN -8388607
86 #define SAMPLE_24BIT_MAX_F 8388607.0f
87 #define SAMPLE_24BIT_MIN_F -8388607.0f
89 #define SAMPLE_16BIT_MAX 32767
90 #define SAMPLE_16BIT_MIN -32767
91 #define SAMPLE_16BIT_MAX_F 32767.0f
92 #define SAMPLE_16BIT_MIN_F -32767.0f
94 /* these mark the outer edges of the range considered "within" range
95 for a floating point sample value. values outside (and on the boundaries)
96 of this range will be clipped before conversion; values within this
97 range will be scaled to appropriate values for the target sample
101 #define NORMALIZED_FLOAT_MIN -1.0f
102 #define NORMALIZED_FLOAT_MAX 1.0f
104 /* define this in case we end up on a platform that is missing
105 the real lrintf functions
108 #define f_round(f) lrintf(f)
110 #define float_16(s, d)\
111 if ((s) <= NORMALIZED_FLOAT_MIN) {\
112 (d) = SAMPLE_16BIT_MIN;\
113 } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
114 (d) = SAMPLE_16BIT_MAX;\
116 (d) = f_round ((s) * SAMPLE_16BIT_SCALING);\
119 /* call this when "s" has already been scaled (e.g. when dithering)
122 #define float_16_scaled(s, d)\
123 if ((s) <= SAMPLE_16BIT_MIN_F) {\
124 (d) = SAMPLE_16BIT_MIN_F;\
125 } else if ((s) >= SAMPLE_16BIT_MAX_F) { \
126 (d) = SAMPLE_16BIT_MAX;\
128 (d) = f_round ((s));\
131 #define float_24u32(s, d) \
132 if ((s) <= NORMALIZED_FLOAT_MIN) {\
133 (d) = SAMPLE_24BIT_MIN << 8;\
134 } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
135 (d) = SAMPLE_24BIT_MAX << 8;\
137 (d) = f_round ((s) * SAMPLE_24BIT_SCALING) << 8;\
140 /* call this when "s" has already been scaled (e.g. when dithering)
143 #define float_24u32_scaled(s, d)\
144 if ((s) <= SAMPLE_24BIT_MIN_F) {\
145 (d) = SAMPLE_24BIT_MIN << 8;\
146 } else if ((s) >= SAMPLE_24BIT_MAX_F) { \
147 (d) = SAMPLE_24BIT_MAX << 8; \
149 (d) = f_round ((s)) << 8; \
152 #define float_24(s, d) \
153 if ((s) <= NORMALIZED_FLOAT_MIN) {\
154 (d) = SAMPLE_24BIT_MIN;\
155 } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
156 (d) = SAMPLE_24BIT_MAX;\
158 (d) = f_round ((s) * SAMPLE_24BIT_SCALING);\
161 /* call this when "s" has already been scaled (e.g. when dithering)
164 #define float_24_scaled(s, d)\
165 if ((s) <= SAMPLE_24BIT_MIN_F) {\
166 (d) = SAMPLE_24BIT_MIN;\
167 } else if ((s) >= SAMPLE_24BIT_MAX_F) { \
168 (d) = SAMPLE_24BIT_MAX; \
170 (d) = f_round ((s)); \
174 #if defined (__SSE2__) && !defined (__sun__)
176 /* generates same as _mm_set_ps(1.f, 1.f, 1f., 1f) but faster */
177 static inline __m128
gen_one(void)
179 volatile __m128i x
= { 0 }; /* shut up, GCC */
180 __m128i ones
= _mm_cmpeq_epi32(x
, x
);
181 return (__m128
)_mm_slli_epi32 (_mm_srli_epi32(ones
, 25), 23);
184 static inline __m128
clip(__m128 s
, __m128 min
, __m128 max
)
186 return _mm_min_ps(max
, _mm_max_ps(s
, min
));
189 static inline __m128i
float_24_sse(__m128 s
)
191 const __m128 upper_bound
= gen_one(); /* NORMALIZED_FLOAT_MAX */
192 const __m128 lower_bound
= _mm_sub_ps(_mm_setzero_ps(), upper_bound
);
194 __m128 clipped
= clip(s
, lower_bound
, upper_bound
);
195 __m128 scaled
= _mm_mul_ps(clipped
, _mm_set1_ps(SAMPLE_24BIT_SCALING
));
196 return _mm_cvtps_epi32(scaled
);
#if defined (__ARM_NEON__) || defined (__ARM_NEON)

/* element-wise clamp of s to [min, max] */
static inline float32x4_t clip(float32x4_t s, float32x4_t min, float32x4_t max)
{
	return vminq_f32(max, vmaxq_f32(s, min));
}

/* clip four normalized floats to [-1, +1], scale to 24-bit integer
   range and convert (truncating conversion) */
static inline int32x4_t float_24_neon(float32x4_t s)
{
	const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
	const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);

	float32x4_t clipped = clip(s, lower_bound, upper_bound);
	float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_24BIT_SCALING));
	return vcvtq_s32_f32(scaled);
}

/* clip four normalized floats to [-1, +1], scale to 16-bit integer
   range, convert and narrow to 16-bit lanes */
static inline int16x4_t float_16_neon(float32x4_t s)
{
	const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
	const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);

	float32x4_t clipped = clip(s, lower_bound, upper_bound);
	float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_16BIT_SCALING));
	return vmovn_s32(vcvtq_s32_f32(scaled));
}
#endif
/* Linear Congruential noise generator. From the music-dsp list
 * less random than rand(), but good enough and 10x faster */
static unsigned int seed = 22222;

/* advance the LCG state and return it; NOTE: the visible original text
   was missing the return statement (non-void function) — restored */
static inline unsigned int fast_rand() {
	seed = (seed * 196314165) + 907633515;
	return seed;
}
239 /* functions for native float sample data */
241 void sample_move_floatLE_sSs (jack_default_audio_sample_t
*dst
, char *src
, unsigned long nsamples
, unsigned long src_skip
) {
243 *dst
= *((float *) src
);
249 void sample_move_dS_floatLE (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
) {
251 *((float *) dst
) = *src
;
257 /* NOTES on function naming:
259 foo_bar_d<TYPE>_s<TYPE>
261 the "d<TYPE>" component defines the destination type for the operation
262 the "s<TYPE>" component defines the source type for the operation
266 S - sample is a jack_default_audio_sample_t, currently (October 2008) a 32 bit floating point value
267 Ss - like S but reverse endian from the host CPU
268 32u24 - sample is a signed 32 bit integer value, but data is in upper 24 bits only
269 32u24s - like 32u24 but reverse endian from the host CPU
270 24 - sample is a signed 24 bit integer value
271 24s - like 24 but reverse endian from the host CPU
272 16 - sample is a signed 16 bit integer value
273 16s - like 16 but reverse endian from the host CPU
275 For obvious reasons, the reverse endian versions only show as source types.
277 This covers all known sample formats at 16 bits or larger.
280 /* functions for native integer sample data */
282 void sample_move_d32u24_sSs (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
284 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
285 unsigned long unrolled
= nsamples
/ 4;
286 nsamples
= nsamples
& 3;
289 float32x4_t samples
= vld1q_f32(src
);
290 int32x4_t converted
= float_24_neon(samples
);
291 int32x4_t shifted
= vshlq_n_s32(converted
, 8);
292 shifted
= vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(shifted
)));
296 vst1q_s32((int32_t*)dst
, shifted
);
299 vst1q_lane_s32((int32_t*)(dst
), shifted
, 0);
300 vst1q_lane_s32((int32_t*)(dst
+dst_skip
), shifted
, 1);
301 vst1q_lane_s32((int32_t*)(dst
+2*dst_skip
), shifted
, 2);
302 vst1q_lane_s32((int32_t*)(dst
+3*dst_skip
), shifted
, 3);
314 float_24u32 (*src
, z
);
316 #if __BYTE_ORDER == __LITTLE_ENDIAN
317 dst
[0]=(char)(z
>>24);
318 dst
[1]=(char)(z
>>16);
321 #elif __BYTE_ORDER == __BIG_ENDIAN
324 dst
[2]=(char)(z
>>16);
325 dst
[3]=(char)(z
>>24);
332 void sample_move_d32u24_sS (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
334 #if defined (__SSE2__) && !defined (__sun__)
335 __m128 int_max
= _mm_set1_ps(SAMPLE_24BIT_MAX_F
);
336 __m128 int_min
= _mm_sub_ps(_mm_setzero_ps(), int_max
);
337 __m128 factor
= int_max
;
339 unsigned long unrolled
= nsamples
/ 4;
340 nsamples
= nsamples
& 3;
343 __m128 in
= _mm_load_ps(src
);
344 __m128 scaled
= _mm_mul_ps(in
, factor
);
345 __m128 clipped
= clip(scaled
, int_min
, int_max
);
347 __m128i y
= _mm_cvttps_epi32(clipped
);
348 __m128i shifted
= _mm_slli_epi32(y
, 8);
351 *(int32_t*)dst
= _mm_extract_epi32(shifted
, 0);
352 *(int32_t*)(dst
+dst_skip
) = _mm_extract_epi32(shifted
, 1);
353 *(int32_t*)(dst
+2*dst_skip
) = _mm_extract_epi32(shifted
, 2);
354 *(int32_t*)(dst
+3*dst_skip
) = _mm_extract_epi32(shifted
, 3);
356 __m128i shuffled1
= _mm_shuffle_epi32(shifted
, _MM_SHUFFLE(0, 3, 2, 1));
357 __m128i shuffled2
= _mm_shuffle_epi32(shifted
, _MM_SHUFFLE(1, 0, 3, 2));
358 __m128i shuffled3
= _mm_shuffle_epi32(shifted
, _MM_SHUFFLE(2, 1, 0, 3));
360 _mm_store_ss((float*)dst
, (__m128
)shifted
);
362 _mm_store_ss((float*)(dst
+dst_skip
), (__m128
)shuffled1
);
363 _mm_store_ss((float*)(dst
+2*dst_skip
), (__m128
)shuffled2
);
364 _mm_store_ss((float*)(dst
+3*dst_skip
), (__m128
)shuffled3
);
372 __m128 in
= _mm_load_ss(src
);
373 __m128 scaled
= _mm_mul_ss(in
, factor
);
374 __m128 clipped
= _mm_min_ss(int_max
, _mm_max_ss(scaled
, int_min
));
376 int y
= _mm_cvttss_si32(clipped
);
377 *((int *) dst
) = y
<<8;
383 #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
384 unsigned long unrolled
= nsamples
/ 4;
385 nsamples
= nsamples
& 3;
388 float32x4_t samples
= vld1q_f32(src
);
389 int32x4_t converted
= float_24_neon(samples
);
390 int32x4_t shifted
= vshlq_n_s32(converted
, 8);
394 vst1q_s32((int32_t*)dst
, shifted
);
397 vst1q_lane_s32((int32_t*)(dst
), shifted
, 0);
398 vst1q_lane_s32((int32_t*)(dst
+dst_skip
), shifted
, 1);
399 vst1q_lane_s32((int32_t*)(dst
+2*dst_skip
), shifted
, 2);
400 vst1q_lane_s32((int32_t*)(dst
+3*dst_skip
), shifted
, 3);
409 #if !defined (__SSE2__)
411 float_24u32 (*src
, *((int32_t*) dst
));
418 void sample_move_dS_s32u24s (jack_default_audio_sample_t
*dst
, char *src
, unsigned long nsamples
, unsigned long src_skip
)
420 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
421 float32x4_t factor
= vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING
);
422 unsigned long unrolled
= nsamples
/ 4;
428 src128
= vld1q_s32((int32_t*)src
);
431 src128
= vld2q_s32((int32_t*)src
).val
[0];
434 src128
= vld1q_lane_s32((int32_t*)src
, src128
, 0);
435 src128
= vld1q_lane_s32((int32_t*)(src
+src_skip
), src128
, 1);
436 src128
= vld1q_lane_s32((int32_t*)(src
+2*src_skip
), src128
, 2);
437 src128
= vld1q_lane_s32((int32_t*)(src
+3*src_skip
), src128
, 3);
440 src128
= vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(src128
)));
441 int32x4_t shifted
= vshrq_n_s32(src128
, 8);
442 float32x4_t as_float
= vcvtq_f32_s32(shifted
);
443 float32x4_t divided
= vmulq_f32(as_float
, factor
);
444 vst1q_f32(dst
, divided
);
449 nsamples
= nsamples
& 3;
452 /* ALERT: signed sign-extension portability !!! */
454 const jack_default_audio_sample_t scaling
= 1.0/SAMPLE_24BIT_SCALING
;
458 #if __BYTE_ORDER == __LITTLE_ENDIAN
459 x
= (unsigned char)(src
[0]);
461 x
|= (unsigned char)(src
[1]);
463 x
|= (unsigned char)(src
[2]);
465 x
|= (unsigned char)(src
[3]);
466 #elif __BYTE_ORDER == __BIG_ENDIAN
467 x
= (unsigned char)(src
[3]);
469 x
|= (unsigned char)(src
[2]);
471 x
|= (unsigned char)(src
[1]);
473 x
|= (unsigned char)(src
[0]);
475 *dst
= (x
>> 8) * scaling
;
481 void sample_move_dS_s32u24 (jack_default_audio_sample_t
*dst
, char *src
, unsigned long nsamples
, unsigned long src_skip
)
483 #if defined (__SSE2__) && !defined (__sun__)
484 unsigned long unrolled
= nsamples
/ 4;
485 static float inv_sample_max_24bit
= 1.0 / SAMPLE_24BIT_SCALING
;
486 __m128 factor
= _mm_set1_ps(inv_sample_max_24bit
);
489 int i1
= *((int *) src
);
491 int i2
= *((int *) src
);
493 int i3
= *((int *) src
);
495 int i4
= *((int *) src
);
498 __m128i src
= _mm_set_epi32(i4
, i3
, i2
, i1
);
499 __m128i shifted
= _mm_srai_epi32(src
, 8);
501 __m128 as_float
= _mm_cvtepi32_ps(shifted
);
502 __m128 divided
= _mm_mul_ps(as_float
, factor
);
504 _mm_storeu_ps(dst
, divided
);
508 nsamples
= nsamples
& 3;
509 #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
510 unsigned long unrolled
= nsamples
/ 4;
511 float32x4_t factor
= vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING
);
516 src128
= vld1q_s32((int32_t*)src
);
519 src128
= vld2q_s32((int32_t*)src
).val
[0];
522 src128
= vld1q_lane_s32((int32_t*)src
, src128
, 0);
523 src128
= vld1q_lane_s32((int32_t*)(src
+src_skip
), src128
, 1);
524 src128
= vld1q_lane_s32((int32_t*)(src
+2*src_skip
), src128
, 2);
525 src128
= vld1q_lane_s32((int32_t*)(src
+3*src_skip
), src128
, 3);
528 int32x4_t shifted
= vshrq_n_s32(src128
, 8);
529 float32x4_t as_float
= vcvtq_f32_s32(shifted
);
530 float32x4_t divided
= vmulq_f32(as_float
, factor
);
531 vst1q_f32(dst
, divided
);
536 nsamples
= nsamples
& 3;
539 /* ALERT: signed sign-extension portability !!! */
541 const jack_default_audio_sample_t scaling
= 1.0/SAMPLE_24BIT_SCALING
;
543 *dst
= (*((int *) src
) >> 8) * scaling
;
549 void sample_move_d24_sSs (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
551 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
552 unsigned long unrolled
= nsamples
/ 4;
556 float32x4_t samples
= vld1q_f32(src
);
557 int32x4_t converted
= float_24_neon(samples
);
558 converted
= vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted
)));
559 vst1q_s32(z
, converted
);
561 for (i
= 0; i
!= 4; ++i
) {
562 memcpy (dst
, ((char*)(z
+i
))+1, 3);
567 nsamples
= nsamples
& 3;
574 #if __BYTE_ORDER == __LITTLE_ENDIAN
575 dst
[0]=(char)(z
>>16);
578 #elif __BYTE_ORDER == __BIG_ENDIAN
581 dst
[2]=(char)(z
>>16);
588 void sample_move_d24_sS (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
590 #if defined (__SSE2__) && !defined (__sun__)
591 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST
);
592 while (nsamples
>= 4) {
595 __m128 samples
= _mm_loadu_ps(src
);
596 __m128i converted
= float_24_sse(samples
);
599 z
[0] = _mm_extract_epi32(converted
, 0);
600 z
[1] = _mm_extract_epi32(converted
, 1);
601 z
[2] = _mm_extract_epi32(converted
, 2);
602 z
[3] = _mm_extract_epi32(converted
, 3);
604 __m128i shuffled1
= _mm_shuffle_epi32(converted
, _MM_SHUFFLE(0, 3, 2, 1));
605 __m128i shuffled2
= _mm_shuffle_epi32(converted
, _MM_SHUFFLE(1, 0, 3, 2));
606 __m128i shuffled3
= _mm_shuffle_epi32(converted
, _MM_SHUFFLE(2, 1, 0, 3));
608 _mm_store_ss((float*)z
, (__m128
)converted
);
609 _mm_store_ss((float*)z
+1, (__m128
)shuffled1
);
610 _mm_store_ss((float*)z
+2, (__m128
)shuffled2
);
611 _mm_store_ss((float*)z
+3, (__m128
)shuffled3
);
614 for (i
= 0; i
!= 4; ++i
) {
615 memcpy (dst
, z
+i
, 3);
622 #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
623 unsigned long unrolled
= nsamples
/ 4;
627 float32x4_t samples
= vld1q_f32(src
);
628 int32x4_t converted
= float_24_neon(samples
);
629 vst1q_s32(z
, converted
);
631 for (i
= 0; i
!= 4; ++i
) {
632 memcpy (dst
, z
+i
, 3);
637 nsamples
= nsamples
& 3;
644 #if __BYTE_ORDER == __LITTLE_ENDIAN
646 #elif __BYTE_ORDER == __BIG_ENDIAN
647 memcpy (dst
, (char *)&z
+ 1, 3);
654 void sample_move_dS_s24s (jack_default_audio_sample_t
*dst
, char *src
, unsigned long nsamples
, unsigned long src_skip
)
656 const jack_default_audio_sample_t scaling
= 1.0/SAMPLE_24BIT_SCALING
;
658 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
659 // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
660 const float32x4_t vscaling
= vdupq_n_f32(scaling
/256.0);
662 memset(x
, 0, sizeof(x
));
663 unsigned long unrolled
= nsamples
/ 4;
665 #if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
666 // right aligned / inverse sequence below -> *256
667 memcpy(((char*)&x
[0])+1, src
, 3);
668 memcpy(((char*)&x
[1])+1, src
+src_skip
, 3);
669 memcpy(((char*)&x
[2])+1, src
+2*src_skip
, 3);
670 memcpy(((char*)&x
[3])+1, src
+3*src_skip
, 3);
672 memcpy(&x
[0], src
, 3);
673 memcpy(&x
[1], src
+src_skip
, 3);
674 memcpy(&x
[2], src
+2*src_skip
, 3);
675 memcpy(&x
[3], src
+3*src_skip
, 3);
679 int32x4_t source
= vld1q_s32(x
);
680 source
= vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(source
)));
681 float32x4_t converted
= vcvtq_f32_s32(source
);
682 float32x4_t scaled
= vmulq_f32(converted
, vscaling
);
683 vst1q_f32(dst
, scaled
);
686 nsamples
= nsamples
& 3;
689 /* ALERT: signed sign-extension portability !!! */
693 #if __BYTE_ORDER == __LITTLE_ENDIAN
694 x
= (unsigned char)(src
[0]);
696 x
|= (unsigned char)(src
[1]);
698 x
|= (unsigned char)(src
[2]);
699 /* correct sign bit and the rest of the top byte */
703 #elif __BYTE_ORDER == __BIG_ENDIAN
704 x
= (unsigned char)(src
[2]);
706 x
|= (unsigned char)(src
[1]);
708 x
|= (unsigned char)(src
[0]);
709 /* correct sign bit and the rest of the top byte */
720 void sample_move_dS_s24 (jack_default_audio_sample_t
*dst
, char *src
, unsigned long nsamples
, unsigned long src_skip
)
722 const jack_default_audio_sample_t scaling
= 1.f
/SAMPLE_24BIT_SCALING
;
724 #if defined (__SSE2__) && !defined (__sun__)
725 const __m128 scaling_block
= _mm_set_ps1(scaling
);
726 while (nsamples
>= 4) {
729 memcpy((char*)&x0
+ 1, src
, 3);
730 memcpy((char*)&x1
+ 1, src
+src_skip
, 3);
731 memcpy((char*)&x2
+ 1, src
+2*src_skip
, 3);
732 memcpy((char*)&x3
+ 1, src
+3*src_skip
, 3);
735 const __m128i block_i
= _mm_set_epi32(x3
, x2
, x1
, x0
);
736 const __m128i shifted
= _mm_srai_epi32(block_i
, 8);
737 const __m128 converted
= _mm_cvtepi32_ps (shifted
);
738 const __m128 scaled
= _mm_mul_ps(converted
, scaling_block
);
739 _mm_storeu_ps(dst
, scaled
);
743 #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
744 // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
745 const float32x4_t vscaling
= vdupq_n_f32(scaling
/256.0);
747 memset(x
, 0, sizeof(x
));
748 unsigned long unrolled
= nsamples
/ 4;
750 #if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
751 // left aligned -> *256
752 memcpy(&x
[0], src
, 3);
753 memcpy(&x
[1], src
+src_skip
, 3);
754 memcpy(&x
[2], src
+2*src_skip
, 3);
755 memcpy(&x
[3], src
+3*src_skip
, 3);
757 memcpy(((char*)&x
[0])+1, src
, 3);
758 memcpy(((char*)&x
[1])+1, src
+src_skip
, 3);
759 memcpy(((char*)&x
[2])+1, src
+2*src_skip
, 3);
760 memcpy(((char*)&x
[3])+1, src
+3*src_skip
, 3);
764 int32x4_t source
= vld1q_s32(x
);
765 float32x4_t converted
= vcvtq_f32_s32(source
);
766 float32x4_t scaled
= vmulq_f32(converted
, vscaling
);
767 vst1q_f32(dst
, scaled
);
770 nsamples
= nsamples
& 3;
775 #if __BYTE_ORDER == __LITTLE_ENDIAN
776 memcpy((char*)&x
+ 1, src
, 3);
777 #elif __BYTE_ORDER == __BIG_ENDIAN
788 void sample_move_d16_sSs (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
790 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
791 unsigned long unrolled
= nsamples
/ 4;
792 nsamples
= nsamples
& 3;
795 float32x4_t samples
= vld1q_f32(src
);
796 int16x4_t converted
= float_16_neon(samples
);
797 converted
= vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(converted
)));
801 vst1_s16((int16_t*)dst
, converted
);
804 vst1_lane_s16((int16_t*)(dst
), converted
, 0);
805 vst1_lane_s16((int16_t*)(dst
+dst_skip
), converted
, 1);
806 vst1_lane_s16((int16_t*)(dst
+2*dst_skip
), converted
, 2);
807 vst1_lane_s16((int16_t*)(dst
+3*dst_skip
), converted
, 3);
817 // float_16 (*src, tmp);
819 if (*src
<= NORMALIZED_FLOAT_MIN
) {
820 tmp
= SAMPLE_16BIT_MIN
;
821 } else if (*src
>= NORMALIZED_FLOAT_MAX
) {
822 tmp
= SAMPLE_16BIT_MAX
;
824 tmp
= (int16_t) f_round (*src
* SAMPLE_16BIT_SCALING
);
827 #if __BYTE_ORDER == __LITTLE_ENDIAN
828 dst
[0]=(char)(tmp
>>8);
830 #elif __BYTE_ORDER == __BIG_ENDIAN
832 dst
[1]=(char)(tmp
>>8);
839 void sample_move_d16_sS (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
841 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
842 unsigned long unrolled
= nsamples
/ 4;
843 nsamples
= nsamples
& 3;
846 float32x4_t samples
= vld1q_f32(src
);
847 int16x4_t converted
= float_16_neon(samples
);
851 vst1_s16((int16_t*)dst
, converted
);
854 vst1_lane_s16((int16_t*)(dst
), converted
, 0);
855 vst1_lane_s16((int16_t*)(dst
+dst_skip
), converted
, 1);
856 vst1_lane_s16((int16_t*)(dst
+2*dst_skip
), converted
, 2);
857 vst1_lane_s16((int16_t*)(dst
+3*dst_skip
), converted
, 3);
865 float_16 (*src
, *((int16_t*) dst
));
871 void sample_move_dither_rect_d16_sSs (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
873 jack_default_audio_sample_t val
;
877 val
= (*src
* SAMPLE_16BIT_SCALING
) + fast_rand() / (float) UINT_MAX
- 0.5f
;
878 float_16_scaled (val
, tmp
);
879 #if __BYTE_ORDER == __LITTLE_ENDIAN
880 dst
[0]=(char)(tmp
>>8);
882 #elif __BYTE_ORDER == __BIG_ENDIAN
884 dst
[1]=(char)(tmp
>>8);
891 void sample_move_dither_rect_d16_sS (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
893 jack_default_audio_sample_t val
;
896 val
= (*src
* SAMPLE_16BIT_SCALING
) + fast_rand() / (float)UINT_MAX
- 0.5f
;
897 float_16_scaled (val
, *((int16_t*) dst
));
903 void sample_move_dither_tri_d16_sSs (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
905 jack_default_audio_sample_t val
;
909 val
= (*src
* SAMPLE_16BIT_SCALING
) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX
- 1.0f
;
910 float_16_scaled (val
, tmp
);
912 #if __BYTE_ORDER == __LITTLE_ENDIAN
913 dst
[0]=(char)(tmp
>>8);
915 #elif __BYTE_ORDER == __BIG_ENDIAN
917 dst
[1]=(char)(tmp
>>8);
924 void sample_move_dither_tri_d16_sS (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
926 jack_default_audio_sample_t val
;
929 val
= (*src
* SAMPLE_16BIT_SCALING
) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX
- 1.0f
;
930 float_16_scaled (val
, *((int16_t*) dst
));
936 void sample_move_dither_shaped_d16_sSs (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
938 jack_default_audio_sample_t x
;
939 jack_default_audio_sample_t xe
; /* the innput sample - filtered error */
940 jack_default_audio_sample_t xp
; /* x' */
942 float rm1
= state
->rm1
;
943 unsigned int idx
= state
->idx
;
947 x
= *src
* SAMPLE_16BIT_SCALING
;
948 r
= ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX
- 1.0f
;
949 /* Filter the error with Lipshitz's minimally audible FIR:
950 [2.033 -2.165 1.959 -1.590 0.6149] */
952 - state
->e
[idx
] * 2.033f
953 + state
->e
[(idx
- 1) & DITHER_BUF_MASK
] * 2.165f
954 - state
->e
[(idx
- 2) & DITHER_BUF_MASK
] * 1.959f
955 + state
->e
[(idx
- 3) & DITHER_BUF_MASK
] * 1.590f
956 - state
->e
[(idx
- 4) & DITHER_BUF_MASK
] * 0.6149f
;
960 float_16_scaled (xp
, tmp
);
962 /* Intrinsic z^-1 delay */
963 idx
= (idx
+ 1) & DITHER_BUF_MASK
;
964 state
->e
[idx
] = xp
- xe
;
966 #if __BYTE_ORDER == __LITTLE_ENDIAN
967 dst
[0]=(char)(tmp
>>8);
969 #elif __BYTE_ORDER == __BIG_ENDIAN
971 dst
[1]=(char)(tmp
>>8);
980 void sample_move_dither_shaped_d16_sS (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
982 jack_default_audio_sample_t x
;
983 jack_default_audio_sample_t xe
; /* the innput sample - filtered error */
984 jack_default_audio_sample_t xp
; /* x' */
986 float rm1
= state
->rm1
;
987 unsigned int idx
= state
->idx
;
990 x
= *src
* SAMPLE_16BIT_SCALING
;
991 r
= ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX
- 1.0f
;
992 /* Filter the error with Lipshitz's minimally audible FIR:
993 [2.033 -2.165 1.959 -1.590 0.6149] */
995 - state
->e
[idx
] * 2.033f
996 + state
->e
[(idx
- 1) & DITHER_BUF_MASK
] * 2.165f
997 - state
->e
[(idx
- 2) & DITHER_BUF_MASK
] * 1.959f
998 + state
->e
[(idx
- 3) & DITHER_BUF_MASK
] * 1.590f
999 - state
->e
[(idx
- 4) & DITHER_BUF_MASK
] * 0.6149f
;
1003 float_16_scaled (xp
, *((int16_t*) dst
));
1005 /* Intrinsic z^-1 delay */
1006 idx
= (idx
+ 1) & DITHER_BUF_MASK
;
1007 state
->e
[idx
] = *((int16_t*) dst
) - xe
;
1016 void sample_move_dS_s16s (jack_default_audio_sample_t
*dst
, char *src
, unsigned long nsamples
, unsigned long src_skip
)
1019 const jack_default_audio_sample_t scaling
= 1.0/SAMPLE_16BIT_SCALING
;
1020 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
1021 const float32x4_t vscaling
= vdupq_n_f32(scaling
);
1022 unsigned long unrolled
= nsamples
/ 4;
1023 while (unrolled
--) {
1024 int16x4_t source16x4
;
1027 source16x4
= vld1_s16((int16_t*)src
);
1030 source16x4
= vld2_s16((int16_t*)src
).val
[0];
1033 source16x4
= vld1_lane_s16((int16_t*)src
, source16x4
, 0);
1034 source16x4
= vld1_lane_s16((int16_t*)(src
+src_skip
), source16x4
, 1);
1035 source16x4
= vld1_lane_s16((int16_t*)(src
+2*src_skip
), source16x4
, 2);
1036 source16x4
= vld1_lane_s16((int16_t*)(src
+3*src_skip
), source16x4
, 3);
1039 source16x4
= vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(source16x4
)));
1040 int32x4_t source32x4
= vmovl_s16(source16x4
);
1041 src
+= 4 * src_skip
;
1043 float32x4_t converted
= vcvtq_f32_s32(source32x4
);
1044 float32x4_t scaled
= vmulq_f32(converted
, vscaling
);
1045 vst1q_f32(dst
, scaled
);
1048 nsamples
= nsamples
& 3;
1051 /* ALERT: signed sign-extension portability !!! */
1052 while (nsamples
--) {
1053 #if __BYTE_ORDER == __LITTLE_ENDIAN
1054 z
= (unsigned char)(src
[0]);
1056 z
|= (unsigned char)(src
[1]);
1057 #elif __BYTE_ORDER == __BIG_ENDIAN
1058 z
= (unsigned char)(src
[1]);
1060 z
|= (unsigned char)(src
[0]);
1068 void sample_move_dS_s16 (jack_default_audio_sample_t
*dst
, char *src
, unsigned long nsamples
, unsigned long src_skip
)
1070 /* ALERT: signed sign-extension portability !!! */
1071 const jack_default_audio_sample_t scaling
= 1.0/SAMPLE_16BIT_SCALING
;
1072 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
1073 const float32x4_t vscaling
= vdupq_n_f32(scaling
);
1074 unsigned long unrolled
= nsamples
/ 4;
1075 while (unrolled
--) {
1076 int16x4_t source16x4
;
1079 source16x4
= vld1_s16((int16_t*)src
);
1082 source16x4
= vld2_s16((int16_t*)src
).val
[0];
1085 source16x4
= vld1_lane_s16((int16_t*)src
, source16x4
, 0);
1086 source16x4
= vld1_lane_s16((int16_t*)(src
+src_skip
), source16x4
, 1);
1087 source16x4
= vld1_lane_s16((int16_t*)(src
+2*src_skip
), source16x4
, 2);
1088 source16x4
= vld1_lane_s16((int16_t*)(src
+3*src_skip
), source16x4
, 3);
1091 int32x4_t source32x4
= vmovl_s16(source16x4
);
1092 src
+= 4 * src_skip
;
1094 float32x4_t converted
= vcvtq_f32_s32(source32x4
);
1095 float32x4_t scaled
= vmulq_f32(converted
, vscaling
);
1096 vst1q_f32(dst
, scaled
);
1099 nsamples
= nsamples
& 3;
1102 while (nsamples
--) {
1103 *dst
= (*((short *) src
)) * scaling
;
/* Fill an interleaved channel with "val": write "unit_bytes" at a time,
 * advancing by "skip_bytes" between units, for "bytes" total bytes.
 * NOTE: for the 2- and 4-byte cases only the low byte of each unit is
 * val after the integer cast — fine for the common val == 0 case. */
void memset_interleave (char *dst, char val, unsigned long bytes,
			unsigned long unit_bytes,
			unsigned long skip_bytes)
{
	switch (unit_bytes) {
	case 1:
		while (bytes--) {
			*dst = val;
			dst += skip_bytes;
		}
		break;
	case 2:
		while (bytes) {
			*((short *) dst) = (short) val;
			dst += skip_bytes;
			bytes -= 2;
		}
		break;
	case 4:
		while (bytes) {
			*((int *) dst) = (int) val;
			dst += skip_bytes;
			bytes -= 4;
		}
		break;
	default:
		while (bytes) {
			memset(dst, val, unit_bytes);
			dst += skip_bytes;
			bytes -= unit_bytes;
		}
		break;
	}
}
/* COPY FUNCTIONS: used to move data from an input channel to an
   output channel. Note that we assume that the skip distance
   is the same for both channels. This is completely fine
   unless the input and output were on different audio interfaces that
   were interleaved differently. We don't try to handle that.
*/

/* Plain copy; the trailing two arguments exist only to match the
   interleaved-copy signature and are ignored. */
void
memcpy_fake (char *dst, char *src, unsigned long src_bytes, unsigned long foo, unsigned long bar)
{
	memcpy (dst, src, src_bytes);
}
/* Copy 16-bit samples between interleaved buffers: one short per step,
   advancing each pointer by its own skip distance, for src_bytes total
   source bytes. */
void
memcpy_interleave_d16_s16 (char *dst, char *src, unsigned long src_bytes,
			   unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
	while (src_bytes) {
		*((short *) dst) = *((short *) src);
		dst += dst_skip_bytes;
		src += src_skip_bytes;
		src_bytes -= 2;
	}
}
/* Copy packed 24-bit samples between interleaved buffers: 3 bytes per
   step, advancing each pointer by its own skip distance, for src_bytes
   total source bytes. */
void
memcpy_interleave_d24_s24 (char *dst, char *src, unsigned long src_bytes,
			   unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
	while (src_bytes) {
		memcpy(dst, src, 3);
		dst += dst_skip_bytes;
		src += src_skip_bytes;
		src_bytes -= 3;
	}
}
/* Copy 32-bit samples between interleaved buffers: one int per step,
   advancing each pointer by its own skip distance, for src_bytes total
   source bytes. */
void
memcpy_interleave_d32_s32 (char *dst, char *src, unsigned long src_bytes,
			   unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
	while (src_bytes) {
		*((int *) dst) = *((int *) src);
		dst += dst_skip_bytes;
		src += src_skip_bytes;
		src_bytes -= 4;
	}
}