libs/faudio/src/FAudio_internal_simd.c

   1 /* FAudio - XAudio Reimplementation for FNA
   2  *
   3  * Copyright (c) 2011-2024 Ethan Lee, Luigi Auriemma, and the MonoGame Team
   4  *
   5  * This software is provided 'as-is', without any express or implied warranty.
   6  * In no event will the authors be held liable for any damages arising from
   7  * the use of this software.
   8  *
   9  * Permission is granted to anyone to use this software for any purpose,
  10  * including commercial applications, and to alter it and redistribute it
  11  * freely, subject to the following restrictions:
  12  *
  13  * 1. The origin of this software must not be misrepresented; you must not
  14  * claim that you wrote the original software. If you use this software in a
  15  * product, an acknowledgment in the product documentation would be
  16  * appreciated but is not required.
  17  *
  18  * 2. Altered source versions must be plainly marked as such, and must not be
  19  * misrepresented as being the original software.
  20  *
  21  * 3. This notice may not be removed or altered from any source distribution.
  22  *
  23  * Ethan "flibitijibibo" Lee <flibitijibibo@flibitijibibo.com>
  24  *
  25  */
  26
  27 #include "FAudio_internal.h"
  28
  29 /* SECTION 0: SSE/NEON Detection */
  30
  31 /* The SSE/NEON detection comes from MojoAL:
  32  * https://hg.icculus.org/icculus/mojoAL/file/default/mojoal.c
  33  */
  34
  35 #if defined(__x86_64__) || defined(_M_X64)
  36         /* Some platforms fail to define this... */
  37         #ifndef __SSE2__
  38         #define __SSE2__ 1
  39         #endif
  40
  41         /* x86_64 guarantees SSE2, so the scalar converters are compiled out. */
  42         #define NEED_SCALAR_CONVERTER_FALLBACKS 0
  43 #elif defined(__aarch64__) || defined(_M_ARM64)
  44         /* Some platforms fail to define this... */
  45         #ifndef __ARM_NEON__
  46         #define __ARM_NEON__ 1
  47         #endif
  48
  49         /* AArch64 guarantees NEON, so the scalar converters are compiled out. */
  50         #define NEED_SCALAR_CONVERTER_FALLBACKS 0
  51 #elif __MACOSX__ && !defined(__POWERPC__)
  52         /* Some build systems may need to specify this. */
  53         #if !defined(__SSE2__) && !defined(__ARM_NEON__)
  54         #error macOS does not have SSE2/NEON? Bad compiler?
  55         #endif
  56
  57         /* The check above ensures SSE2 or NEON is defined on non-PowerPC macOS. */
  58         #define NEED_SCALAR_CONVERTER_FALLBACKS 0
  59 #else
  60         /* Need plain C implementations to support all other hardware */
  61         #define NEED_SCALAR_CONVERTER_FALLBACKS 1
  62 #endif
  63
  64 /* Our NEON paths require AArch64, so key off the architecture macros
  65  * rather than __ARM_NEON__ when enabling the NEON intrinsics. */
  65 #if defined(__aarch64__) || defined(_M_ARM64)
  66 #include <arm_neon.h>
  67 #define HAVE_NEON_INTRINSICS 1
  68 #endif
  69
  70
  71 #ifdef __SSE2__ /* defined by the compiler or forced on above for x86_64 */
  72 #include <emmintrin.h>
  73 #define HAVE_SSE2_INTRINSICS 1
  74 #endif
  75
  76 /* SECTION 1: Type Converters */
  77
  78 /* The SSE/NEON converters are based on SDL_audiotypecvt:
  79  * https://hg.libsdl.org/SDL/file/default/src/audio/SDL_audiotypecvt.c
  80  */
  81
  82 #define DIVBY128 0.0078125f /* 1 / 128, scale for 8-bit samples */
  83 #define DIVBY32768 0.000030517578125f /* 1 / 32768, scale for 16-bit samples */
  84 #define DIVBY8388607 0.00000011920930376163766f /* 1 / 8388607, scale for 32-bit samples after >> 8 */
  85
  86 #if NEED_SCALAR_CONVERTER_FALLBACKS
  87 void FAudio_INTERNAL_Convert_U8_To_F32_Scalar(
  88         const uint8_t *restrict src,
  89         float *restrict dst,
  90         uint32_t len
  91 ) {
  92         uint32_t i;
  93         for (i = 0; i < len; i += 1)
  94         {
  95                 *dst++ = (*src++ * DIVBY128) - 1.0f;
  96         }
  97 }
  98
  99 void FAudio_INTERNAL_Convert_S16_To_F32_Scalar(
 100         const int16_t *restrict src,
 101         float *restrict dst,
 102         uint32_t len
 103 ) {
 104         uint32_t i;
 105         for (i = 0; i < len; i += 1)
 106         {
 107                 *dst++ = *src++ * DIVBY32768;
 108         }
 109 }
 110
 111 void FAudio_INTERNAL_Convert_S32_To_F32_Scalar(
 112         const int32_t *restrict src,
 113         float *restrict dst,
 114         uint32_t len
 115 ) {
 116         uint32_t i;
 117         for (i = 0; i < len; i += 1)
 118         {
 119                 *dst++ = (*src++ >> 8) * DIVBY8388607;
 120         }
 121 }
 122 #endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
 123
 124 #if HAVE_SSE2_INTRINSICS
 125 void FAudio_INTERNAL_Convert_U8_To_F32_SSE2(
 126         const uint8_t *restrict src,
 127         float *restrict dst,
 128         uint32_t len
 129 ) {
 130     int i;
 131     src += len - 1;
 132     dst += len - 1;
 133
 134     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
 135     for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
 136         *dst = (((float) *src) * DIVBY128) - 1.0f;
 137     }
 138
 139     src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
 140     FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
 141
 142     /* Make sure src is aligned too. */
 143     if ((((size_t) src) & 15) == 0) {
 144         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
 145         const __m128i *mmsrc = (const __m128i *) src;
 146         const __m128i zero = _mm_setzero_si128();
 147         const __m128 divby128 = _mm_set1_ps(DIVBY128);
 148         const __m128 minus1 = _mm_set1_ps(-1.0f);
 149         while (i >= 16) {   /* 16 * 8-bit */
 150             const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
 151             /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
 152             const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
 153             /* right-shift-zero-extend gets us uint16 with the other set of values. */
 154             const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
 155             /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
 156             /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
 157             const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
 158             const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
 159             const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
 160             const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
 161             /* Interleave back into correct order, store. */
 162             _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
 163             _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
 164             _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
 165             _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
 166             i -= 16; mmsrc--; dst -= 16;
 167         }
 168
 169         src = (const uint8_t *) mmsrc;
 170     }
 171
 172     src += 15; dst += 15;  /* adjust for any scalar finishing. */
 173
 174     /* Finish off any leftovers with scalar operations. */
 175     while (i) {
 176         *dst = (((float) *src) * DIVBY128) - 1.0f;
 177         i--; src--; dst--;
 178     }
 179 }
 180
 181 void FAudio_INTERNAL_Convert_S16_To_F32_SSE2(
 182         const int16_t *restrict src,
 183         float *restrict dst,
 184         uint32_t len
 185 ) {
 186     int i;
 187     src += len - 1;
 188     dst += len - 1;
 189
 190     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
 191     for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
 192         *dst = ((float) *src) * DIVBY32768;
 193     }
 194
 195     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
 196     FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
 197
 198     /* Make sure src is aligned too. */
 199     if ((((size_t) src) & 15) == 0) {
 200         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
 201         const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
 202         while (i >= 8) {   /* 8 * 16-bit */
 203             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
 204             /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
 205             const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
 206             /* right-shift-sign-extend gets us sint32 with the other set of values. */
 207             const __m128i b = _mm_srai_epi32(ints, 16);
 208             /* Interleave these back into the right order, convert to float, multiply, store. */
 209             _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
 210             _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
 211             i -= 8; src -= 8; dst -= 8;
 212         }
 213     }
 214
 215     src += 7; dst += 7;  /* adjust for any scalar finishing. */
 216
 217     /* Finish off any leftovers with scalar operations. */
 218     while (i) {
 219         *dst = ((float) *src) * DIVBY32768;
 220         i--; src--; dst--;
 221     }
 222 }
 223
 224 void FAudio_INTERNAL_Convert_S32_To_F32_SSE2(
 225         const int32_t *restrict src,
 226         float *restrict dst,
 227         uint32_t len
 228 ) {
 229     int i;
 230
 231     /* Get dst aligned to 16 bytes */
 232     for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
 233         *dst = ((float) (*src>>8)) * DIVBY8388607;
 234     }
 235
 236     FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
 237
 238     /* Make sure src is aligned too. */
 239     if ((((size_t) src) & 15) == 0) {
 240         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
 241         const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
 242         const __m128i *mmsrc = (const __m128i *) src;
 243         while (i >= 4) {   /* 4 * sint32 */
 244             /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
 245             _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
 246             i -= 4; mmsrc++; dst += 4;
 247         }
 248         src = (const int32_t *) mmsrc;
 249     }
 250
 251     /* Finish off any leftovers with scalar operations. */
 252     while (i) {
 253         *dst = ((float) (*src>>8)) * DIVBY8388607;
 254         i--; src++; dst++;
 255     }
 256 }
 257 #endif /* HAVE_SSE2_INTRINSICS */
 258
 259 #if HAVE_NEON_INTRINSICS
 260 void FAudio_INTERNAL_Convert_U8_To_F32_NEON(
 261         const uint8_t *restrict src,
 262         float *restrict dst,
 263         uint32_t len
 264 ) {
 265     int i;
 266     src += len - 1;
 267     dst += len - 1;
 268
 269     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
 270     for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
 271         *dst = (((float) *src) * DIVBY128) - 1.0f;
 272     }
 273
 274     src -= 15; dst -= 15;  /* adjust to read NEON blocks from the start. */
 275     FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
 276
 277     /* Make sure src is aligned too. */
 278     if ((((size_t) src) & 15) == 0) {
 279         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
 280         const uint8_t *mmsrc = (const uint8_t *) src;
 281         const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
 282         const float32x4_t negone = vdupq_n_f32(-1.0f);
 283         while (i >= 16) {   /* 16 * 8-bit */
 284             const uint8x16_t bytes = vld1q_u8(mmsrc);  /* get 16 uint8 into a NEON register. */
 285             const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes));  /* convert top 8 bytes to 8 uint16 */
 286             const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes));   /* convert bottom 8 bytes to 8 uint16 */
 287             /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
 288             vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
 289             vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
 290             vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
 291             vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
 292             i -= 16; mmsrc -= 16; dst -= 16;
 293         }
 294
 295         src = (const uint8_t *) mmsrc;
 296     }
 297
 298     src += 15; dst += 15;  /* adjust for any scalar finishing. */
 299
 300     /* Finish off any leftovers with scalar operations. */
 301     while (i) {
 302         *dst = (((float) *src) * DIVBY128) - 1.0f;
 303         i--; src--; dst--;
 304     }
 305 }
 306
 307 void FAudio_INTERNAL_Convert_S16_To_F32_NEON(
 308         const int16_t *restrict src,
 309         float *restrict dst,
 310         uint32_t len
 311 ) {
 312     int i;
 313     src += len - 1;
 314     dst += len - 1;
 315
 316     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
 317     for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
 318         *dst = ((float) *src) * DIVBY32768;
 319     }
 320
 321     src -= 7; dst -= 7;  /* adjust to read NEON blocks from the start. */
 322     FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
 323
 324     /* Make sure src is aligned too. */
 325     if ((((size_t) src) & 15) == 0) {
 326         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
 327         const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
 328         while (i >= 8) {   /* 8 * 16-bit */
 329             const int16x8_t ints = vld1q_s16((int16_t const *) src);  /* get 8 sint16 into a NEON register. */
 330             /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
 331             vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
 332             vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
 333             i -= 8; src -= 8; dst -= 8;
 334         }
 335     }
 336
 337     src += 7; dst += 7;  /* adjust for any scalar finishing. */
 338
 339     /* Finish off any leftovers with scalar operations. */
 340     while (i) {
 341         *dst = ((float) *src) * DIVBY32768;
 342         i--; src--; dst--;
 343     }
 344 }
 345
 346 void FAudio_INTERNAL_Convert_S32_To_F32_NEON(
 347         const int32_t *restrict src,
 348         float *restrict dst,
 349         uint32_t len
 350 ) {
 351     int i;
 352
 353     /* Get dst aligned to 16 bytes */
 354     for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
 355         *dst = ((float) (*src>>8)) * DIVBY8388607;
 356     }
 357
 358     FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
 359
 360     /* Make sure src is aligned too. */
 361     if ((((size_t) src) & 15) == 0) {
 362         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
 363         const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
 364         const int32_t *mmsrc = (const int32_t *) src;
 365         while (i >= 4) {   /* 4 * sint32 */
 366             /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
 367             vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
 368             i -= 4; mmsrc += 4; dst += 4;
 369         }
 370         src = (const int32_t *) mmsrc;
 371     }
 372
 373     /* Finish off any leftovers with scalar operations. */
 374     while (i) {
 375         *dst = ((float) (*src>>8)) * DIVBY8388607;
 376         i--; src++; dst++;
 377     }
 378 }
 379 #endif /* HAVE_NEON_INTRINSICS */
 380
 381 /* SECTION 2: Linear Resamplers */
 382
 383 void FAudio_INTERNAL_ResampleGeneric(
 384         float *restrict dCache,
 385         float *restrict resampleCache,
 386         uint64_t *resampleOffset,
 387         uint64_t resampleStep,
 388         uint64_t toResample,
 389         uint8_t channels
 390 ) {
 391         uint32_t i, j;
 392         uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
 393         for (i = 0; i < toResample; i += 1)
 394         {
 395                 for (j = 0; j < channels; j += 1)
 396                 {
 397                         /* lerp, then convert to float value */
 398                         *resampleCache++ = (float) (
 399                                 dCache[j] +
 400                                 (dCache[j + channels] - dCache[j]) *
 401                                 FIXED_TO_DOUBLE(cur)
 402                         );
 403                 }
 404
 405                 /* Increment fraction offset by the stepping value */
 406                 *resampleOffset += resampleStep;
 407                 cur += resampleStep;
 408
 409                 /* Only increment the sample offset by integer values.
 410                  * Sometimes this will be 0 until cur accumulates
 411                  * enough steps, especially for "slow" rates.
 412                  */
 413                 dCache += (cur >> FIXED_PRECISION) * channels;
 414
 415                 /* Now that any integer has been added, drop it.
 416                  * The offset pointer will preserve the total.
 417                  */
 418                 cur &= FIXED_FRACTION_MASK;
 419         }
 420 }
 421
 422 #if NEED_SCALAR_CONVERTER_FALLBACKS
 423 void FAudio_INTERNAL_ResampleMono_Scalar(
 424         float *restrict dCache,
 425         float *restrict resampleCache,
 426         uint64_t *resampleOffset,
 427         uint64_t resampleStep,
 428         uint64_t toResample,
 429         uint8_t UNUSED
 430 ) {
 431         uint32_t i;
 432         uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
 433         for (i = 0; i < toResample; i += 1)
 434         {
 435                 /* lerp, then convert to float value */
 436                 *resampleCache++ = (float) (
 437                         dCache[0] +
 438                         (dCache[1] - dCache[0]) *
 439                         FIXED_TO_DOUBLE(cur)
 440                 );
 441
 442                 /* Increment fraction offset by the stepping value */
 443                 *resampleOffset += resampleStep;
 444                 cur += resampleStep;
 445
 446                 /* Only increment the sample offset by integer values.
 447                  * Sometimes this will be 0 until cur accumulates
 448                  * enough steps, especially for "slow" rates.
 449                  */
 450                 dCache += (cur >> FIXED_PRECISION);
 451
 452                 /* Now that any integer has been added, drop it.
 453                  * The offset pointer will preserve the total.
 454                  */
 455                 cur &= FIXED_FRACTION_MASK;
 456         }
 457 }
 458
 459 void FAudio_INTERNAL_ResampleStereo_Scalar(
 460         float *restrict dCache,
 461         float *restrict resampleCache,
 462         uint64_t *resampleOffset,
 463         uint64_t resampleStep,
 464         uint64_t toResample,
 465         uint8_t UNUSED
 466 ) {
 467         uint32_t i;
 468         uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
 469         for (i = 0; i < toResample; i += 1)
 470         {
 471                 /* lerp, then convert to float value */
 472                 *resampleCache++ = (float) (
 473                         dCache[0] +
 474                         (dCache[2] - dCache[0]) *
 475                         FIXED_TO_DOUBLE(cur)
 476                 );
 477                 *resampleCache++ = (float) (
 478                         dCache[1] +
 479                         (dCache[3] - dCache[1]) *
 480                         FIXED_TO_DOUBLE(cur)
 481                 );
 482
 483                 /* Increment fraction offset by the stepping value */
 484                 *resampleOffset += resampleStep;
 485                 cur += resampleStep;
 486
 487                 /* Only increment the sample offset by integer values.
 488                  * Sometimes this will be 0 until cur accumulates
 489                  * enough steps, especially for "slow" rates.
 490                  */
 491                 dCache += (cur >> FIXED_PRECISION) * 2;
 492
 493                 /* Now that any integer has been added, drop it.
 494                  * The offset pointer will preserve the total.
 495                  */
 496                 cur &= FIXED_FRACTION_MASK;
 497         }
 498 }
 499 #endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
 500
 501 /* The SSE2 versions of the resamplers come from @8thMage! */
 502
 503 #if HAVE_SSE2_INTRINSICS
 504 void FAudio_INTERNAL_ResampleMono_SSE2(
 505         float *restrict dCache,
 506         float *restrict resampleCache,
 507         uint64_t *resampleOffset,
 508         uint64_t resampleStep,
 509         uint64_t toResample,
 510         uint8_t UNUSED
 511 ) {
 512         uint32_t i, header, tail;
 513         uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
 514         float *dCache_1, *dCache_2, *dCache_3;
 515         uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
 516         __m128 one_over_fixed_one, half, current_next_0_1, current_next_2_3,
 517                 current, next, sub, cur_fixed, mul, res;
 518         __m128i cur_frac, adder_frac, adder_frac_loop;
 519
 520         /* This is the header, the Dest needs to be aligned to 16B */
 521         header = (16 - ((size_t) resampleCache) % 16) / 4;
 522         if (header == 4)
 523         {
 524                 header = 0;
 525         }
 526         for (i = 0; i < header; i += 1)
 527         {
 528                 /* lerp, then convert to float value */
 529                 *resampleCache++ = (float) (
 530                         dCache[0] +
 531                         (dCache[1] - dCache[0]) *
 532                         FIXED_TO_FLOAT(cur_scalar)
 533                 );
 534
 535                 /* Increment fraction offset by the stepping value */
 536                 *resampleOffset += resampleStep;
 537                 cur_scalar += resampleStep;
 538
 539                 /* Only increment the sample offset by integer values.
 540                  * Sometimes this will be 0 until cur accumulates
 541                  * enough steps, especially for "slow" rates.
 542                  */
 543                 dCache += (cur_scalar >> FIXED_PRECISION);
 544
 545                 /* Now that any integer has been added, drop it.
 546                  * The offset pointer will preserve the total.
 547                  */
 548                 cur_scalar &= FIXED_FRACTION_MASK;
 549         }
 550
 551         toResample -= header;
 552
 553         /* initialising the varius cur
 554          * cur_frac is the fractional part of cur with 4 samples. as the
 555          * fractional part is 32 bit unsigned value, it can be just added
 556          * and the modulu operation for keeping the fractional part will be implicit.
 557          * the 0.5 is for converting signed values to float (no unsigned convert),
 558          * the 0.5 is added later.
 559          */
 560         cur_frac = _mm_set1_epi32(
 561                 (uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
 562         );
 563         adder_frac = _mm_setr_epi32(
 564                 0,
 565                 (uint32_t) (resampleStep & FIXED_FRACTION_MASK),
 566                 (uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
 567                 (uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
 568         );
 569         cur_frac = _mm_add_epi32(cur_frac, adder_frac);
 570
 571         /* The various cur_scalar is for the different samples
 572          * (1, 2, 3 compared to original cur_scalar = 0)
 573          */
 574         cur_scalar_1 = cur_scalar + resampleStep;
 575         cur_scalar_2 = cur_scalar + resampleStep * 2;
 576         cur_scalar_3 = cur_scalar + resampleStep * 3;
 577         dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
 578         dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
 579         dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
 580         cur_scalar &= FIXED_FRACTION_MASK;
 581         cur_scalar_1 &= FIXED_FRACTION_MASK;
 582         cur_scalar_2 &= FIXED_FRACTION_MASK;
 583         cur_scalar_3 &= FIXED_FRACTION_MASK;
 584
 585         /* FIXME: These should be _mm_undefined_ps! */
 586         current_next_0_1 = _mm_setzero_ps();
 587         current_next_2_3 = _mm_setzero_ps();
 588
 589         /* Constants */
 590         one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
 591         half = _mm_set1_ps(0.5f);
 592         adder_frac_loop = _mm_set1_epi32(
 593                 (uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
 594         );
 595
 596         tail = toResample % 4;
 597         for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
 598         {
 599                 /* current next holds 2 pairs of the sample and the sample + 1
 600                  * after that need to seperate them.
 601                  */
 602
 603                 current_next_0_1 = _mm_loadl_pi(current_next_0_1, (__m64*) dCache);
 604                 current_next_0_1 = _mm_loadh_pi(current_next_0_1, (__m64*) dCache_1);
 605                 current_next_2_3 = _mm_loadl_pi(current_next_2_3, (__m64*) dCache_2);
 606                 current_next_2_3 = _mm_loadh_pi(current_next_2_3, (__m64*) dCache_3);
 607
 608                 /* Unpack them to have seperate current and next in 2 vectors. */
 609                 current = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0x88); /* 0b1000 */
 610                 next = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0xdd); /* 0b1101 */
 611
 612                 sub = _mm_sub_ps(next, current);
 613
 614                 /* Convert the fractional part to float and then mul to get the fractions out.
 615                  * then add back the 0.5 we subtracted before.
 616                  */
 617                 cur_fixed = _mm_add_ps(
 618                         _mm_mul_ps(
 619                                 _mm_cvtepi32_ps(cur_frac),
 620                                 one_over_fixed_one
 621                         ),
 622                         half
 623                 );
 624                 mul = _mm_mul_ps(sub, cur_fixed);
 625                 res = _mm_add_ps(current, mul);
 626
 627                 /* Store back */
 628                 _mm_store_ps(resampleCache, res);
 629
 630                 /* Update dCaches for next iteration */
 631                 cur_scalar += resampleStep * 4;
 632                 cur_scalar_1 += resampleStep * 4;
 633                 cur_scalar_2 += resampleStep * 4;
 634                 cur_scalar_3 += resampleStep * 4;
 635                 dCache = dCache + (cur_scalar >> FIXED_PRECISION);
 636                 dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
 637                 dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
 638                 dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
 639                 cur_scalar &= FIXED_FRACTION_MASK;
 640                 cur_scalar_1 &= FIXED_FRACTION_MASK;
 641                 cur_scalar_2 &= FIXED_FRACTION_MASK;
 642                 cur_scalar_3 &= FIXED_FRACTION_MASK;
 643
 644                 cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
 645         }
 646         *resampleOffset += resampleStep * (toResample - tail);
 647
 648         /* This is the tail. */
 649         for (i = 0; i < tail; i += 1)
 650         {
 651                 /* lerp, then convert to float value */
 652                 *resampleCache++ = (float) (
 653                         dCache[0] +
 654                         (dCache[1] - dCache[0]) *
 655                         FIXED_TO_FLOAT(cur_scalar)
 656                 );
 657
 658                 /* Increment fraction offset by the stepping value */
 659                 *resampleOffset += resampleStep;
 660                 cur_scalar += resampleStep;
 661
 662                 /* Only increment the sample offset by integer values.
 663                  * Sometimes this will be 0 until cur accumulates
 664                  * enough steps, especially for "slow" rates.
 665                  */
 666                 dCache += (cur_scalar >> FIXED_PRECISION);
 667
 668                 /* Now that any integer has been added, drop it.
 669                  * The offset pointer will preserve the total.
 670                  */
 671                 cur_scalar &= FIXED_FRACTION_MASK;
 672         }
 673 }
 674
 675 void FAudio_INTERNAL_ResampleStereo_SSE2(
 676         float *restrict dCache,
 677         float *restrict resampleCache,
 678         uint64_t *resampleOffset,
 679         uint64_t resampleStep,
 680         uint64_t toResample,
 681         uint8_t UNUSED
 682 ) {
 683         uint32_t i, header, tail;
 684         uint64_t cur_scalar, cur_scalar_1;
 685         float *dCache_1;
 686         __m128 one_over_fixed_one, half, current_next_1, current_next_2,
 687                 current, next, sub, cur_fixed, mul, res;
 688         __m128i cur_frac, adder_frac, adder_frac_loop;
 689
 690         /* This is the header, the Dest needs to be aligned to 16B */
 691         header = (16 - ((size_t) resampleCache) % 16) / 8;
 692         if (header == 2)
 693         {
 694                 header = 0;
 695         }
 696         cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
 697         for (i = 0; i < header; i += 2)
 698         {
 699                 /* lerp, then convert to float value */
 700                 *resampleCache++ = (float) (
 701                         dCache[0] +
 702                         (dCache[2] - dCache[0]) *
 703                         FIXED_TO_FLOAT(cur_scalar)
 704                 );
 705                 *resampleCache++ = (float) (
 706                         dCache[1] +
 707                         (dCache[3] - dCache[1]) *
 708                         FIXED_TO_FLOAT(cur_scalar)
 709                 );
 710
 711                 /* Increment fraction offset by the stepping value */
 712                 *resampleOffset += resampleStep;
 713                 cur_scalar += resampleStep;
 714
 715                 /* Only increment the sample offset by integer values.
 716                  * Sometimes this will be 0 until cur accumulates
 717                  * enough steps, especially for "slow" rates.
 718                  */
 719                 dCache += (cur_scalar >> FIXED_PRECISION) * 2;
 720
 721                 /* Now that any integer has been added, drop it.
 722                  * The offset pointer will preserve the total.
 723                  */
 724                 cur_scalar &= FIXED_FRACTION_MASK;
 725         }
 726
 727         toResample -= header;
 728
 729         /* initialising the varius cur.
 730          * cur_frac holds the fractional part of cur.
 731          * to avoid duplication please see the mono part for a thorough
 732          * explanation.
 733          */
 734         cur_frac = _mm_set1_epi32(
 735                 (uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
 736         );
 737         adder_frac = _mm_setr_epi32(
 738                 0,
 739                 0,
 740                 (uint32_t) (resampleStep & FIXED_FRACTION_MASK),
 741                 (uint32_t) (resampleStep & FIXED_FRACTION_MASK)
 742         );
 743         cur_frac = _mm_add_epi32(cur_frac, adder_frac);
 744
 745         /* dCache_1 is the pointer for dcache in the next resample pos. */
 746         cur_scalar_1 = cur_scalar + resampleStep;
 747         dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
 748         cur_scalar_1 &= FIXED_FRACTION_MASK;
 749
 750         one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
 751         half = _mm_set1_ps(0.5f);
 752         adder_frac_loop = _mm_set1_epi32(
 753                 (uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
 754         );
 755
 756         tail = toResample % 2;
 757         for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
 758         {
 759                 /* Current_next_1 and current_next_2 each holds 4 src
 760                  * sample points for getting 4 dest resample point at the end.
 761                  * current_next_1 holds:
 762                  * (current_ch_1, current_ch_2, next_ch_1, next_ch_2)
 763                  * for the first resample position, while current_next_2 holds
 764                  * the same for the 2nd resample position
 765                  */
 766                 current_next_1 = _mm_loadu_ps(dCache); /* A1B1A2B2 */
 767                 current_next_2 = _mm_loadu_ps(dCache_1); /* A3B3A4B4 */
 768
 769                 /* Unpack them to get the current and the next in seperate vectors. */
 770                 current = _mm_castpd_ps(
 771                         _mm_unpacklo_pd(
 772                                 _mm_castps_pd(current_next_1),
 773                                 _mm_castps_pd(current_next_2)
 774                         )
 775                 );
 776                 next = _mm_castpd_ps(
 777                         _mm_unpackhi_pd(
 778                                 _mm_castps_pd(current_next_1),
 779                                 _mm_castps_pd(current_next_2)
 780                         )
 781                 );
 782
 783                 sub = _mm_sub_ps(next, current);
 784
 785                 /* Adding the 0.5 back.
 786                  * See mono explanation for more elaborate explanation.
 787                  */
 788                 cur_fixed = _mm_add_ps(
 789                         _mm_mul_ps(
 790                                 _mm_cvtepi32_ps(cur_frac),
 791                                 one_over_fixed_one
 792                         ),
 793                         half
 794                 );
 795                 mul = _mm_mul_ps(sub, cur_fixed);
 796                 res = _mm_add_ps(current, mul);
 797
 798                 /* Store the results */
 799                 _mm_store_ps(resampleCache, res);
 800
 801                 /* Update dCaches for next iteration */
 802                 cur_scalar += resampleStep * 2;
 803                 cur_scalar_1 += resampleStep * 2;
 804                 dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
 805                 dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
 806                 cur_scalar &= FIXED_FRACTION_MASK;
 807                 cur_scalar_1 &= FIXED_FRACTION_MASK;
 808
 809                 cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
 810         }
 811         *resampleOffset += resampleStep * (toResample - tail);
 812
 813         /* This is the tail. */
 814         for (i = 0; i < tail; i += 1)
 815         {
 816                 /* lerp, then convert to float value */
 817                 *resampleCache++ = (float) (
 818                         dCache[0] +
 819                         (dCache[2] - dCache[0]) *
 820                         FIXED_TO_FLOAT(cur_scalar)
 821                 );
 822                 *resampleCache++ = (float) (
 823                         dCache[1] +
 824                         (dCache[3] - dCache[1]) *
 825                         FIXED_TO_FLOAT(cur_scalar)
 826                 );
 827
 828                 /* Increment fraction offset by the stepping value */
 829                 *resampleOffset += resampleStep;
 830                 cur_scalar += resampleStep;
 831
 832                 /* Only increment the sample offset by integer values.
 833                  * Sometimes this will be 0 until cur accumulates
 834                  * enough steps, especially for "slow" rates.
 835                  */
 836                 dCache += (cur_scalar >> FIXED_PRECISION) * 2;
 837
 838                 /* Now that any integer has been added, drop it.
 839                  * The offset pointer will preserve the total.
 840                  */
 841                 cur_scalar &= FIXED_FRACTION_MASK;
 842         }
 843 }
 844 #endif /* HAVE_SSE2_INTRINSICS */
 845
 846 #if HAVE_NEON_INTRINSICS
 847 void FAudio_INTERNAL_ResampleMono_NEON(
 848         float *restrict dCache,
 849         float *restrict resampleCache,
 850         uint64_t *resampleOffset,
 851         uint64_t resampleStep,
 852         uint64_t toResample,
 853         uint8_t UNUSED
 854 ) {
 855         uint32_t i, header, tail;
 856         uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
 857         float *dCache_1, *dCache_2, *dCache_3;
 858         uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
 859         float32x4_t one_over_fixed_one, half, current_next_0_1, current_next_2_3,
 860                 current, next, sub, cur_fixed, mul, res;
 861         int32x4_t cur_frac, adder_frac, adder_frac_loop;
 862
 863         /* This is the header, the Dest needs to be aligned to 16B */
 864         header = (16 - ((size_t) resampleCache) % 16) / 4;
 865         if (header == 4)
 866         {
 867                 header = 0;
 868         }
 869         for (i = 0; i < header; i += 1)
 870         {
 871                 /* lerp, then convert to float value */
 872                 *resampleCache++ = (float) (
 873                         dCache[0] +
 874                         (dCache[1] - dCache[0]) *
 875                         FIXED_TO_FLOAT(cur_scalar)
 876                 );
 877
 878                 /* Increment fraction offset by the stepping value */
 879                 *resampleOffset += resampleStep;
 880                 cur_scalar += resampleStep;
 881
 882                 /* Only increment the sample offset by integer values.
 883                  * Sometimes this will be 0 until cur accumulates
 884                  * enough steps, especially for "slow" rates.
 885                  */
 886                 dCache += (cur_scalar >> FIXED_PRECISION);
 887
 888                 /* Now that any integer has been added, drop it.
 889                  * The offset pointer will preserve the total.
 890                  */
 891                 cur_scalar &= FIXED_FRACTION_MASK;
 892         }
 893
 894         toResample -= header;
 895
 896         /* initialising the varius cur
 897          * cur_frac is the fractional part of cur with 4 samples. as the
 898          * fractional part is 32 bit unsigned value, it can be just added
 899          * and the modulu operation for keeping the fractional part will be implicit.
 900          * the 0.5 is for converting signed values to float (no unsigned convert),
 901          * the 0.5 is added later.
 902          */
 903         cur_frac = vdupq_n_s32(
 904                 (uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
 905         );
 906         int32_t __attribute__((aligned(16))) data[4] =
 907         {
 908                 0,
 909                 (uint32_t) (resampleStep & FIXED_FRACTION_MASK),
 910                 (uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
 911                 (uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
 912         };
 913         adder_frac = vld1q_s32(data);
 914         cur_frac = vaddq_s32(cur_frac, adder_frac);
 915
 916         /* The various cur_scalar is for the different samples
 917          * (1, 2, 3 compared to original cur_scalar = 0)
 918          */
 919         cur_scalar_1 = cur_scalar + resampleStep;
 920         cur_scalar_2 = cur_scalar + resampleStep * 2;
 921         cur_scalar_3 = cur_scalar + resampleStep * 3;
 922         dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
 923         dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
 924         dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
 925         cur_scalar &= FIXED_FRACTION_MASK;
 926         cur_scalar_1 &= FIXED_FRACTION_MASK;
 927         cur_scalar_2 &= FIXED_FRACTION_MASK;
 928         cur_scalar_3 &= FIXED_FRACTION_MASK;
 929
 930         /* Constants */
 931         one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
 932         half = vdupq_n_f32(0.5f);
 933         adder_frac_loop = vdupq_n_s32(
 934                 (uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
 935         );
 936
 937         tail = toResample % 4;
 938         for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
 939         {
 940                 /* current next holds 2 pairs of the sample and the sample + 1
 941                  * after that need to separate them.
 942                  */
 943                 current_next_0_1 = vcombine_f32(
 944                         vld1_f32(dCache),
 945                         vld1_f32(dCache_1)
 946                 );
 947                 current_next_2_3 = vcombine_f32(
 948                         vld1_f32(dCache_2),
 949                         vld1_f32(dCache_3)
 950                 );
 951
 952                 /* Unpack them to have seperate current and next in 2 vectors. */
 953                 current = vuzp1q_f32(current_next_0_1, current_next_2_3);
 954                 next = vuzp2q_f32(current_next_0_1, current_next_2_3);
 955
 956                 sub = vsubq_f32(next, current);
 957
 958                 /* Convert the fractional part to float and then mul to get the fractions out.
 959                  * then add back the 0.5 we subtracted before.
 960                  */
 961                 cur_fixed = vaddq_f32(
 962                         vmulq_f32(
 963                                 vcvtq_f32_s32(cur_frac),
 964                                 one_over_fixed_one
 965                         ),
 966                         half
 967                 );
 968                 mul = vmulq_f32(sub, cur_fixed);
 969                 res = vaddq_f32(current, mul);
 970
 971                 /* Store back */
 972                 vst1q_f32(resampleCache, res);
 973
 974                 /* Update dCaches for next iteration */
 975                 cur_scalar += resampleStep * 4;
 976                 cur_scalar_1 += resampleStep * 4;
 977                 cur_scalar_2 += resampleStep * 4;
 978                 cur_scalar_3 += resampleStep * 4;
 979                 dCache = dCache + (cur_scalar >> FIXED_PRECISION);
 980                 dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
 981                 dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
 982                 dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
 983                 cur_scalar &= FIXED_FRACTION_MASK;
 984                 cur_scalar_1 &= FIXED_FRACTION_MASK;
 985                 cur_scalar_2 &= FIXED_FRACTION_MASK;
 986                 cur_scalar_3 &= FIXED_FRACTION_MASK;
 987
 988                 cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
 989         }
 990         *resampleOffset += resampleStep * (toResample - tail);
 991
 992         /* This is the tail. */
 993         for (i = 0; i < tail; i += 1)
 994         {
 995                 /* lerp, then convert to float value */
 996                 *resampleCache++ = (float) (
 997                         dCache[0] +
 998                         (dCache[1] - dCache[0]) *
 999                         FIXED_TO_FLOAT(cur_scalar)
1000                 );
1001
1002                 /* Increment fraction offset by the stepping value */
1003                 *resampleOffset += resampleStep;
1004                 cur_scalar += resampleStep;
1005
1006                 /* Only increment the sample offset by integer values.
1007                  * Sometimes this will be 0 until cur accumulates
1008                  * enough steps, especially for "slow" rates.
1009                  */
1010                 dCache += (cur_scalar >> FIXED_PRECISION);
1011
1012                 /* Now that any integer has been added, drop it.
1013                  * The offset pointer will preserve the total.
1014                  */
1015                 cur_scalar &= FIXED_FRACTION_MASK;
1016         }
1017 }
1018
/* Linear-interpolation resampler for interleaved stereo float data, NEON
 * flavor.
 *
 * Output is written in three phases: a scalar header of at most one frame
 * that tries to bring resampleCache to a 16-byte boundary, a vector body that
 * produces two frames (four floats) per iteration, and a scalar tail for a
 * leftover frame.
 *
 * dCache:         interleaved source frames; every output frame reads the
 *                 source frame at its integer position and the one after it
 * resampleCache:  destination, receives toResample interleaved frames
 * resampleOffset: in/out fixed-point position; only its fractional bits are
 *                 read here, and it advances by resampleStep per frame written
 * resampleStep:   fixed-point source increment per output frame
 * toResample:     number of output frames to write
 * channels:       not read by this function
 */
void FAudio_INTERNAL_ResampleStereo_NEON(
        float *restrict dCache,
        float *restrict resampleCache,
        uint64_t *resampleOffset,
        uint64_t resampleStep,
        uint64_t toResample,
        uint8_t channels
) {
        uint32_t i, header, tail;
        uint64_t cur_scalar, cur_scalar_1;
        float *dCache_1;
        float32x4_t one_over_fixed_one, half, current, next, sub, cur_fixed, mul, res;
        int32x4_t cur_frac, adder_frac, adder_frac_loop;

        /* This is the header, the Dest needs to be aligned to 16B.
         * A frame is 8 bytes, so the header is either 0 or 1 frames.
         */
        header = (16 - ((size_t) resampleCache) % 16) / 8;
        if (header == 2)
        {
                header = 0;
        }

        /* Never let the header outrun the request. Without this clamp an empty
         * request on a misaligned destination still wrote one frame and made
         * the "toResample -= header" below wrap around.
         */
        if (header > toResample)
        {
                header = (uint32_t) toResample;
        }

        cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
        for (i = 0; i < header; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[2] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );
                *resampleCache++ = (float) (
                        dCache[1] +
                        (dCache[3] - dCache[1]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION) * 2;

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }

        toResample -= header;

        /* Initialising the various cur values.
         * cur_frac holds the fractional part of cur, biased by -0.5 so it can
         * go through the signed int-to-float conversion; the 0.5 is added
         * back inside the loop. To avoid duplication please see the mono part
         * for a thorough explanation.
         */
        cur_frac = vdupq_n_s32(
                (uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
        );
        /* Lanes 0-1 are both channels of the first frame, lanes 2-3 are both
         * channels of the second frame, which sits one step ahead.
         */
        int32_t __attribute__((aligned(16))) data[4] =
        {
                0,
                0,
                (uint32_t) (resampleStep & FIXED_FRACTION_MASK),
                (uint32_t) (resampleStep & FIXED_FRACTION_MASK)
        };
        adder_frac = vld1q_s32(data);
        cur_frac = vaddq_s32(cur_frac, adder_frac);

        /* dCache_1 is the pointer for dCache in the next resample pos. */
        cur_scalar_1 = cur_scalar + resampleStep;
        dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
        cur_scalar_1 &= FIXED_FRACTION_MASK;

        one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
        half = vdupq_n_f32(0.5f);
        adder_frac_loop = vdupq_n_s32(
                (uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
        );

        tail = toResample % 2;
        for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
        {
                /* current holds the source frame (both channels) for the first
                 * and the second resample position; next holds the frame that
                 * follows each of them. Two 64-bit loads per vector already
                 * give that layout, so no unpacking is needed.
                 */
                current = vcombine_f32(
                        vld1_f32(dCache), /* A1B1 */
                        vld1_f32(dCache_1) /* A3B3 */
                );
                next = vcombine_f32(
                        vld1_f32(dCache + 2), /* A2B2 */
                        vld1_f32(dCache_1 + 2) /* A4B4 */
                );

                sub = vsubq_f32(next, current);

                /* Adding the 0.5 back.
                 * See mono explanation for more elaborate explanation.
                 */
                cur_fixed = vaddq_f32(
                        vmulq_f32(
                                vcvtq_f32_s32(cur_frac),
                                one_over_fixed_one
                        ),
                        half
                );
                mul = vmulq_f32(sub, cur_fixed);
                res = vaddq_f32(current, mul);

                /* Store the results */
                vst1q_f32(resampleCache, res);

                /* Update dCaches for next iteration */
                cur_scalar += resampleStep * 2;
                cur_scalar_1 += resampleStep * 2;
                dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
                dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
                cur_scalar &= FIXED_FRACTION_MASK;
                cur_scalar_1 &= FIXED_FRACTION_MASK;

                cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
        }
        /* The vector loop does not touch the caller's offset, settle it here */
        *resampleOffset += resampleStep * (toResample - tail);

        /* This is the tail. */
        for (i = 0; i < tail; i += 1)
        {
                /* lerp, then convert to float value */
                *resampleCache++ = (float) (
                        dCache[0] +
                        (dCache[2] - dCache[0]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );
                *resampleCache++ = (float) (
                        dCache[1] +
                        (dCache[3] - dCache[1]) *
                        FIXED_TO_FLOAT(cur_scalar)
                );

                /* Increment fraction offset by the stepping value */
                *resampleOffset += resampleStep;
                cur_scalar += resampleStep;

                /* Only increment the sample offset by integer values.
                 * Sometimes this will be 0 until cur accumulates
                 * enough steps, especially for "slow" rates.
                 */
                dCache += (cur_scalar >> FIXED_PRECISION) * 2;

                /* Now that any integer has been added, drop it.
                 * The offset pointer will preserve the total.
                 */
                cur_scalar &= FIXED_FRACTION_MASK;
        }
}
1181 #endif /* HAVE_NEON_INTRINSICS */
1182
1183 /* SECTION 3: Amplifiers */
1184
1185 #if NEED_SCALAR_CONVERTER_FALLBACKS
/* Scales totalSamples floats in place by volume, one sample at a time. */
void FAudio_INTERNAL_Amplify_Scalar(
        float* output,
        uint32_t totalSamples,
        float volume
) {
        float *cursor = output;
        uint32_t remaining;
        for (remaining = totalSamples; remaining > 0; remaining -= 1)
        {
                *cursor *= volume;
                cursor += 1;
        }
}
1197 #endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
1198
1199 /* The SSE2 version of the amplifier comes from @8thMage! */
1200
1201 #if HAVE_SSE2_INTRINSICS
/* Scales totalSamples floats in place by volume using SSE2.
 *
 * The buffer is processed in three phases: a scalar header that runs until
 * output + i sits on a 16-byte boundary (the vector phase uses aligned
 * loads/stores), a vector body handling four samples per iteration, and a
 * scalar tail for the remaining 0-3 samples.
 *
 * output:       samples to scale, modified in place
 * totalSamples: number of floats in output
 * volume:       multiplier applied to every sample
 */
void FAudio_INTERNAL_Amplify_SSE2(
        float* output,
        uint32_t totalSamples,
        float volume
) {
        uint32_t i, header, tail;
        __m128 volumeVec, outVec;

        /* Samples needed to reach the next 16-byte boundary (0 if aligned) */
        header = (16 - (((size_t) output) % 16)) / 4;
        if (header == 4)
        {
                header = 0;
        }

        /* A buffer shorter than the header is handled by the header alone.
         * Without this clamp the header loop ran past totalSamples and the
         * unsigned "totalSamples - tail" bound below wrapped around.
         */
        if (header > totalSamples)
        {
                header = totalSamples;
        }

        /* Whatever does not fill a whole vector after the header */
        tail = (totalSamples - header) % 4;

        for (i = 0; i < header; i += 1)
        {
                output[i] *= volume;
        }

        volumeVec = _mm_set1_ps(volume);
        for (i = header; i < totalSamples - tail; i += 4)
        {
                outVec = _mm_load_ps(output + i);
                outVec = _mm_mul_ps(outVec, volumeVec);
                _mm_store_ps(output + i, outVec);
        }

        for (i = totalSamples - tail; i < totalSamples; i += 1)
        {
                output[i] *= volume;
        }
}
1238 #endif /* HAVE_SSE2_INTRINSICS */
1239
1240 #if HAVE_NEON_INTRINSICS
/* Scales totalSamples floats in place by volume using NEON.
 *
 * The buffer is processed in three phases: a scalar header that runs until
 * output + i sits on a 16-byte boundary, a vector body handling four samples
 * per iteration, and a scalar tail for the remaining 0-3 samples.
 *
 * output:       samples to scale, modified in place
 * totalSamples: number of floats in output
 * volume:       multiplier applied to every sample
 */
void FAudio_INTERNAL_Amplify_NEON(
        float* output,
        uint32_t totalSamples,
        float volume
) {
        uint32_t i, header, tail;
        float32x4_t volumeVec, outVec;

        /* Samples needed to reach the next 16-byte boundary (0 if aligned) */
        header = (16 - (((size_t) output) % 16)) / 4;
        if (header == 4)
        {
                header = 0;
        }

        /* A buffer shorter than the header is handled by the header alone.
         * Without this clamp the header loop ran past totalSamples and the
         * unsigned "totalSamples - tail" bound below wrapped around.
         */
        if (header > totalSamples)
        {
                header = totalSamples;
        }

        /* Whatever does not fill a whole vector after the header */
        tail = (totalSamples - header) % 4;

        for (i = 0; i < header; i += 1)
        {
                output[i] *= volume;
        }

        volumeVec = vdupq_n_f32(volume);
        for (i = header; i < totalSamples - tail; i += 4)
        {
                outVec = vld1q_f32(output + i);
                outVec = vmulq_f32(outVec, volumeVec);
                vst1q_f32(output + i, outVec);
        }

        for (i = totalSamples - tail; i < totalSamples; i += 1)
        {
                output[i] *= volume;
        }
}
1277 #endif /* HAVE_NEON_INTRINSICS */
1278
1279 /* SECTION 4: Mixer Functions */
1280
/* Generic matrix mixer: for each of the toMix frames, every destination
 * channel accumulates the source channels weighted by its row of the
 * dstChans x srcChans coefficient matrix (row-major). dst is added to,
 * not overwritten.
 */
void FAudio_INTERNAL_Mix_Generic_Scalar(
        uint32_t toMix,
        uint32_t srcChans,
        uint32_t dstChans,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t frame, out, in, rowStart;
        for (frame = 0; frame < toMix; frame += 1)
        {
                for (out = 0; out < dstChans; out += 1)
                {
                        /* First coefficient of this output channel's row */
                        rowStart = out * srcChans;
                        for (in = 0; in < srcChans; in += 1)
                        {
                                dst[out] += src[in] * coefficients[rowStart + in];
                        }
                }

                /* Step both streams to the next interleaved frame */
                src += srcChans;
                dst += dstChans;
        }
}
1302
1303 #if HAVE_SSE2_INTRINSICS
/* SSE horizontal add by Peter Cordes, CC-BY-SA.
 * From https://stackoverflow.com/a/35270026
 *
 * Returns the sum of the four float lanes of v.
 */
static inline float FAudio_simd_hadd(__m128 v)
{
        /* Swap neighbours inside each 64-bit half, then add: every lane of
         * pairSums now holds the sum of its own half.
         */
        const __m128 swapped = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));
        const __m128 pairSums = _mm_add_ps(v, swapped);

        /* Bring the upper half's sum down to lane 0 and finish the addition */
        const __m128 upperSum = _mm_movehl_ps(swapped, pairSums);
        return _mm_cvtss_f32(_mm_add_ss(pairSums, upperSum));
}
1314
/* Generic matrix mixer, SSE2 flavor. Same contract as the scalar generic
 * mixer: each destination channel accumulates the source channels weighted by
 * its row of the dstChans x srcChans coefficient matrix. Source channels are
 * consumed four at a time where possible, the remainder one by one.
 */
void FAudio_INTERNAL_Mix_Generic_SSE2(
        uint32_t toMix,
        uint32_t srcChans,
        uint32_t dstChans,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t frame, out, in, rowStart;
        __m128 gains, samples;
        for (frame = 0; frame < toMix; frame += 1)
        {
                for (out = 0; out < dstChans; out += 1)
                {
                        /* First coefficient of this output channel's row */
                        rowStart = out * srcChans;

                        /* Vector part: unaligned loads, since neither the
                         * frame nor the row start is known to be aligned.
                         */
                        in = 0;
                        while (srcChans - in >= 4)
                        {
                                gains = _mm_loadu_ps(&coefficients[rowStart + in]);
                                samples = _mm_loadu_ps(&src[in]);
                                dst[out] += FAudio_simd_hadd(_mm_mul_ps(samples, gains));
                                in += 4;
                        }

                        /* Scalar part: the last 0-3 source channels */
                        while (in < srcChans)
                        {
                                dst[out] += src[in] * coefficients[rowStart + in];
                                in += 1;
                        }
                }

                /* Step both streams to the next interleaved frame */
                src += srcChans;
                dst += dstChans;
        }
}
1345 #endif /* HAVE_SSE2_INTRINSICS */
1346
/* Mono -> mono mixer: adds each source sample, scaled by coefficients[0],
 * onto the matching destination sample. The two channel-count arguments are
 * ignored.
 */
void FAudio_INTERNAL_Mix_1in_1out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t remaining = toMix;
        while (remaining > 0)
        {
                *dst += *src * coefficients[0];
                src += 1;
                dst += 1;
                remaining -= 1;
        }
}
1362
/* Mono -> stereo mixer: each source sample is added to both channels of the
 * matching destination frame, scaled by coefficients[0] and coefficients[1]
 * respectively. The two channel-count arguments are ignored.
 */
void FAudio_INTERNAL_Mix_1in_2out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t remaining = toMix;
        while (remaining > 0)
        {
                dst[0] += src[0] * coefficients[0];
                dst[1] += src[0] * coefficients[1];

                /* One sample in, one stereo frame out */
                src += 1;
                dst += 2;
                remaining -= 1;
        }
}
1378
/* Mono -> 6 channel mixer: each source sample is added to all six channels
 * of the matching destination frame, channel n scaled by coefficients[n].
 * The two channel-count arguments are ignored.
 */
void FAudio_INTERNAL_Mix_1in_6out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t frame, out;
        for (frame = 0; frame < toMix; frame += 1)
        {
                for (out = 0; out < 6; out += 1)
                {
                        dst[out] += src[0] * coefficients[out];
                }
                src += 1;
                dst += 6;
        }
}
1398
/* Mono -> 8 channel mixer: each source sample is added to all eight channels
 * of the matching destination frame, channel n scaled by coefficients[n].
 * The two channel-count arguments are ignored.
 */
void FAudio_INTERNAL_Mix_1in_8out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t frame, out;
        for (frame = 0; frame < toMix; frame += 1)
        {
                for (out = 0; out < 8; out += 1)
                {
                        dst[out] += src[0] * coefficients[out];
                }
                src += 1;
                dst += 8;
        }
}
1420
/* Stereo -> mono mixer: both channels of each source frame are weighted by
 * coefficients[0] and coefficients[1], summed, and added onto the matching
 * destination sample. The two channel-count arguments are ignored.
 */
void FAudio_INTERNAL_Mix_2in_1out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t frame;
        float mixed;
        for (frame = 0; frame < toMix; frame += 1)
        {
                /* Weighted sum of the frame first, then accumulate */
                mixed = (src[0] * coefficients[0]) + (src[1] * coefficients[1]);
                dst[0] += mixed;

                src += 2;
                dst += 1;
        }
}
1439
/* Stereo -> stereo mixer: destination channel n of each frame accumulates the
 * source frame weighted by coefficients[2n] (left input) and
 * coefficients[2n + 1] (right input). The two channel-count arguments are
 * ignored.
 */
void FAudio_INTERNAL_Mix_2in_2out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t remaining = toMix;
        while (remaining > 0)
        {
                dst[0] += (src[0] * coefficients[0]) + (src[1] * coefficients[1]);
                dst[1] += (src[0] * coefficients[2]) + (src[1] * coefficients[3]);

                src += 2;
                dst += 2;
                remaining -= 1;
        }
}
1461
/* Stereo -> 6 channel mixer: destination channel n of each frame accumulates
 * the source frame weighted by coefficients[2n] (left input) and
 * coefficients[2n + 1] (right input). The two channel-count arguments are
 * ignored.
 */
void FAudio_INTERNAL_Mix_2in_6out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t frame, out;
        for (frame = 0; frame < toMix; frame += 1)
        {
                for (out = 0; out < 6; out += 1)
                {
                        dst[out] += (
                                (src[0] * coefficients[out * 2]) +
                                (src[1] * coefficients[out * 2 + 1])
                        );
                }
                src += 2;
                dst += 6;
        }
}
1499
/* Stereo -> 8 channel mixer: destination channel n of each frame accumulates
 * the source frame weighted by coefficients[2n] (left input) and
 * coefficients[2n + 1] (right input). The two channel-count arguments are
 * ignored.
 */
void FAudio_INTERNAL_Mix_2in_8out_Scalar(
        uint32_t toMix,
        uint32_t UNUSED1,
        uint32_t UNUSED2,
        float *restrict src,
        float *restrict dst,
        float *restrict coefficients
) {
        uint32_t frame, out;
        for (frame = 0; frame < toMix; frame += 1)
        {
                for (out = 0; out < 8; out += 1)
                {
                        dst[out] += (
                                (src[0] * coefficients[out * 2]) +
                                (src[1] * coefficients[out * 2 + 1])
                        );
                }
                src += 2;
                dst += 8;
        }
}
1545
1546 /* SECTION 5: InitSIMDFunctions. Assigns based on SSE2/NEON support. */
1547
/* The pointers declared from here down to FAudio_INTERNAL_Mix_Generic are
 * dispatch hooks. None has an initializer; FAudio_INTERNAL_InitSIMDFunctions
 * assigns each one an SSE2, NEON or scalar implementation.
 *
 * Converter hook: reads uint8_t values from src and writes floats to dst.
 * NOTE(review): len is presumably a sample count - confirm against the
 * implementations.
 */
void (*FAudio_INTERNAL_Convert_U8_To_F32)(
        const uint8_t *restrict src,
        float *restrict dst,
        uint32_t len
);
/* Converter hook: reads int16_t values from src and writes floats to dst. */
void (*FAudio_INTERNAL_Convert_S16_To_F32)(
        const int16_t *restrict src,
        float *restrict dst,
        uint32_t len
);
/* Converter hook: reads int32_t values from src and writes floats to dst. */
void (*FAudio_INTERNAL_Convert_S32_To_F32)(
        const int32_t *restrict src,
        float *restrict dst,
        uint32_t len
);

/* Resampler hooks, one for mono and one for stereo (going by the names);
 * the FAudioResampleCallback signature is declared outside this section.
 */
FAudioResampleCallback FAudio_INTERNAL_ResampleMono;
FAudioResampleCallback FAudio_INTERNAL_ResampleStereo;

/* Amplify hook taking a float buffer, a sample total and a volume.
 * NOTE(review): presumably scales output in place by volume - confirm
 * against the implementations.
 */
void (*FAudio_INTERNAL_Amplify)(
        float *output,
        uint32_t totalSamples,
        float volume
);

/* Generic mixer hook; the FAudioMixCallback signature is declared outside
 * this section.
 */
FAudioMixCallback FAudio_INTERNAL_Mix_Generic;
1574
/* Assigns every SIMD dispatch pointer of this section in one go.
 *
 * hasSSE2: nonzero selects the _SSE2 routines; only tested when the file is
 *          built with HAVE_SSE2_INTRINSICS.
 * hasNEON: nonzero selects the _NEON routines; only tested when the file is
 *          built with HAVE_NEON_INTRINSICS.
 *
 * SSE2 is tested before NEON, and an accepted path returns immediately.
 * When neither path is taken, the _Scalar routines are installed if
 * NEED_SCALAR_CONVERTER_FALLBACKS is nonzero; otherwise the function only
 * asserts and assigns nothing.
 */
void FAudio_INTERNAL_InitSIMDFunctions(uint8_t hasSSE2, uint8_t hasNEON)
{
#if HAVE_SSE2_INTRINSICS
        if (hasSSE2 != 0)
        {
                /* Mixing and gain */
                FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_SSE2;
                FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_SSE2;

                /* Resamplers */
                FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_SSE2;
                FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_SSE2;

                /* Format converters */
                FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_SSE2;
                FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_SSE2;
                FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_SSE2;
                return;
        }
#endif
#if HAVE_NEON_INTRINSICS
        if (hasNEON != 0)
        {
                /* Mixing and gain; the generic mixer stays on the scalar
                 * routine in this path
                 */
                FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_Scalar;
                FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_NEON;

                /* Resamplers */
                FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_NEON;
                FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_NEON;

                /* Format converters */
                FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_NEON;
                FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_NEON;
                FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_NEON;
                return;
        }
#endif
#if NEED_SCALAR_CONVERTER_FALLBACKS
        /* No SIMD path was taken: install the scalar routines */
        FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_Scalar;
        FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_Scalar;
        FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_Scalar;
        FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_Scalar;
        FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_Scalar;
        FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_Scalar;
        FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_Scalar;
#else
        FAudio_assert(0 && "Need converter functions!");
#endif
}
1615
1616 /* vim: set noexpandtab shiftwidth=8 tabstop=8: */