faudio: Import upstream release 24.02.
[wine.git] / libs / faudio / src / FAudio_internal_simd.c
1 /* FAudio - XAudio Reimplementation for FNA
3 * Copyright (c) 2011-2024 Ethan Lee, Luigi Auriemma, and the MonoGame Team
5 * This software is provided 'as-is', without any express or implied warranty.
6 * In no event will the authors be held liable for any damages arising from
7 * the use of this software.
9 * Permission is granted to anyone to use this software for any purpose,
10 * including commercial applications, and to alter it and redistribute it
11 * freely, subject to the following restrictions:
13 * 1. The origin of this software must not be misrepresented; you must not
14 * claim that you wrote the original software. If you use this software in a
15 * product, an acknowledgment in the product documentation would be
16 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
23 * Ethan "flibitijibibo" Lee <flibitijibibo@flibitijibibo.com>
27 #include "FAudio_internal.h"
/* SECTION 0: SSE/NEON Detection */

/* The SSE/NEON detection comes from MojoAL:
 * https://hg.icculus.org/icculus/mojoAL/file/default/mojoal.c
 *
 * Result of this section:
 * - NEED_SCALAR_CONVERTER_FALLBACKS is 0 when the target is guaranteed to
 *   have a SIMD unit, 1 when plain C implementations are required.
 * - HAVE_NEON_INTRINSICS / HAVE_SSE2_INTRINSICS are defined to 1 once the
 *   matching intrinsics header has been included.
 *
 * NOTE(review): NEED_SCALAR_CONVERTER_FALLBACKS is not read in the part of
 * the file this section was reviewed against; confirm the guards that
 * consume it.
 */

#if defined(__x86_64__) || defined(_M_X64)
	/* Some platforms fail to define this... */
	#ifndef __SSE2__
	#define __SSE2__ 1
	#endif

	/* x86_64 guarantees SSE2. */
	#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif defined(__aarch64__) || defined(_M_ARM64)
	/* Some platforms fail to define this... */
	#ifndef __ARM_NEON__
	#define __ARM_NEON__ 1
	#endif

	/* AArch64 guarantees NEON. */
	#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif __MACOSX__ && !defined(__POWERPC__)
	/* Some build systems may need to specify this. */
	#if !defined(__SSE2__) && !defined(__ARM_NEON__)
	#error macOS does not have SSE2/NEON? Bad compiler?
	#endif

	/* Mac OS X/Intel guarantees SSE2. */
	#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#else
	/* Need plain C implementations to support all other hardware */
	#define NEED_SCALAR_CONVERTER_FALLBACKS 1
#endif

/* Our NEON paths require AArch64, don't check __ARM_NEON__ here */
#if defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>
#define HAVE_NEON_INTRINSICS 1
#endif

#ifdef __SSE2__
#include <emmintrin.h>
#define HAVE_SSE2_INTRINSICS 1
#endif
/* SECTION 1: Type Converters */

/* The SSE/NEON converters are based on SDL_audiotypecvt:
 * https://hg.libsdl.org/SDL/file/default/src/audio/SDL_audiotypecvt.c
 */

/* Reciprocal scale factors, so that the converters below can normalise
 * integer PCM with a multiplication instead of a division per sample.
 */
#define DIVBY128 0.0078125f /* 1 / 128 (8-bit samples) */
#define DIVBY32768 0.000030517578125f /* 1 / 32768 (16-bit samples) */
#define DIVBY8388607 0.00000011920930376163766f /* 1 / 8388607 (32-bit samples after >> 8) */
87 void FAudio_INTERNAL_Convert_U8_To_F32_Scalar(
88 const uint8_t *restrict src,
89 float *restrict dst,
90 uint32_t len
91 ) {
92 uint32_t i;
93 for (i = 0; i < len; i += 1)
95 *dst++ = (*src++ * DIVBY128) - 1.0f;
99 void FAudio_INTERNAL_Convert_S16_To_F32_Scalar(
100 const int16_t *restrict src,
101 float *restrict dst,
102 uint32_t len
104 uint32_t i;
105 for (i = 0; i < len; i += 1)
107 *dst++ = *src++ * DIVBY32768;
111 void FAudio_INTERNAL_Convert_S32_To_F32_Scalar(
112 const int32_t *restrict src,
113 float *restrict dst,
114 uint32_t len
116 uint32_t i;
117 for (i = 0; i < len; i += 1)
119 *dst++ = (*src++ >> 8) * DIVBY8388607;
125 void FAudio_INTERNAL_Convert_U8_To_F32_SSE2(
126 const uint8_t *restrict src,
127 float *restrict dst,
128 uint32_t len
130 int i;
131 src += len - 1;
132 dst += len - 1;
134 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
135 for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
136 *dst = (((float) *src) * DIVBY128) - 1.0f;
139 src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */
140 FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
142 /* Make sure src is aligned too. */
143 if ((((size_t) src) & 15) == 0) {
144 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
145 const __m128i *mmsrc = (const __m128i *) src;
146 const __m128i zero = _mm_setzero_si128();
147 const __m128 divby128 = _mm_set1_ps(DIVBY128);
148 const __m128 minus1 = _mm_set1_ps(-1.0f);
149 while (i >= 16) { /* 16 * 8-bit */
150 const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 uint8 into an XMM register. */
151 /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
152 const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
153 /* right-shift-zero-extend gets us uint16 with the other set of values. */
154 const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
155 /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
156 /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
157 const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
158 const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
159 const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
160 const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
161 /* Interleave back into correct order, store. */
162 _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
163 _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
164 _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
165 _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
166 i -= 16; mmsrc--; dst -= 16;
169 src = (const uint8_t *) mmsrc;
172 src += 15; dst += 15; /* adjust for any scalar finishing. */
174 /* Finish off any leftovers with scalar operations. */
175 while (i) {
176 *dst = (((float) *src) * DIVBY128) - 1.0f;
177 i--; src--; dst--;
181 void FAudio_INTERNAL_Convert_S16_To_F32_SSE2(
182 const int16_t *restrict src,
183 float *restrict dst,
184 uint32_t len
186 int i;
187 src += len - 1;
188 dst += len - 1;
190 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
191 for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
192 *dst = ((float) *src) * DIVBY32768;
195 src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */
196 FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
198 /* Make sure src is aligned too. */
199 if ((((size_t) src) & 15) == 0) {
200 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
201 const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
202 while (i >= 8) { /* 8 * 16-bit */
203 const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */
204 /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
205 const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
206 /* right-shift-sign-extend gets us sint32 with the other set of values. */
207 const __m128i b = _mm_srai_epi32(ints, 16);
208 /* Interleave these back into the right order, convert to float, multiply, store. */
209 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
210 _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
211 i -= 8; src -= 8; dst -= 8;
215 src += 7; dst += 7; /* adjust for any scalar finishing. */
217 /* Finish off any leftovers with scalar operations. */
218 while (i) {
219 *dst = ((float) *src) * DIVBY32768;
220 i--; src--; dst--;
224 void FAudio_INTERNAL_Convert_S32_To_F32_SSE2(
225 const int32_t *restrict src,
226 float *restrict dst,
227 uint32_t len
229 int i;
231 /* Get dst aligned to 16 bytes */
232 for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
233 *dst = ((float) (*src>>8)) * DIVBY8388607;
236 FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
238 /* Make sure src is aligned too. */
239 if ((((size_t) src) & 15) == 0) {
240 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
241 const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
242 const __m128i *mmsrc = (const __m128i *) src;
243 while (i >= 4) { /* 4 * sint32 */
244 /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
245 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
246 i -= 4; mmsrc++; dst += 4;
248 src = (const int32_t *) mmsrc;
251 /* Finish off any leftovers with scalar operations. */
252 while (i) {
253 *dst = ((float) (*src>>8)) * DIVBY8388607;
254 i--; src++; dst++;
257 #endif /* HAVE_SSE2_INTRINSICS */
/* NEON conversion of unsigned 8-bit samples to float.
 *
 * src: len unsigned 8-bit input samples
 * dst: receives len floats
 * len: number of samples to convert
 *
 * The buffers are processed from the last sample towards the first:
 * scalar peeling until the next 16-float destination block is 16-byte
 * aligned, NEON blocks of 16 samples when src is aligned too, then a
 * scalar loop for the remainder.
 * Every output is (sample * DIVBY128) - 1.0f.
 */
void FAudio_INTERNAL_Convert_U8_To_F32_NEON(
	const uint8_t *restrict src,
	float *restrict dst,
	uint32_t len
) {
	/* Unsigned like len, so counts above INT_MAX cannot turn negative. */
	uint32_t i;

	/* Nothing to do, and `len - 1` below would wrap to UINT32_MAX. */
	if (len == 0)
	{
		return;
	}

	/* Point at the last sample of each buffer. */
	src += len - 1;
	dst += len - 1;

	/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
	for (i = len; i && (((size_t) (dst - 15)) & 15); --i, --src, --dst)
	{
		*dst = (((float) *src) * DIVBY128) - 1.0f;
	}

	src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. */
	FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

	/* Make sure src is aligned too. */
	if ((((size_t) src) & 15) == 0)
	{
		/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
		const uint8_t *mmsrc = (const uint8_t *) src;
		const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
		const float32x4_t negone = vdupq_n_f32(-1.0f);
		while (i >= 16) /* 16 * 8-bit */
		{
			/* get 16 uint8 into a NEON register. */
			const uint8x16_t bytes = vld1q_u8(mmsrc);
			/* convert top 8 bytes to 8 uint16 */
			const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes));
			/* convert bottom 8 bytes to 8 uint16 */
			const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes));
			/* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
			vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
			vst1q_f32(dst + 4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
			vst1q_f32(dst + 8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
			vst1q_f32(dst + 12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
			i -= 16; mmsrc -= 16; dst -= 16;
		}

		src = (const uint8_t *) mmsrc;
	}

	src += 15; dst += 15; /* adjust for any scalar finishing. */

	/* Finish off any leftovers with scalar operations. */
	while (i)
	{
		*dst = (((float) *src) * DIVBY128) - 1.0f;
		i--; src--; dst--;
	}
}
/* NEON conversion of signed 16-bit samples to float.
 *
 * src: len signed 16-bit input samples
 * dst: receives len floats
 * len: number of samples to convert
 *
 * The buffers are processed from the last sample towards the first:
 * scalar peeling until the next 8-float destination block is 16-byte
 * aligned, NEON blocks of 8 samples when src is aligned too, then a
 * scalar loop for the remainder.
 * Every output is sample * DIVBY32768.
 */
void FAudio_INTERNAL_Convert_S16_To_F32_NEON(
	const int16_t *restrict src,
	float *restrict dst,
	uint32_t len
) {
	/* Unsigned like len, so counts above INT_MAX cannot turn negative. */
	uint32_t i;

	/* Nothing to do, and `len - 1` below would wrap to UINT32_MAX. */
	if (len == 0)
	{
		return;
	}

	/* Point at the last sample of each buffer. */
	src += len - 1;
	dst += len - 1;

	/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
	for (i = len; i && (((size_t) (dst - 7)) & 15); --i, --src, --dst)
	{
		*dst = ((float) *src) * DIVBY32768;
	}

	src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */
	FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

	/* Make sure src is aligned too. */
	if ((((size_t) src) & 15) == 0)
	{
		/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
		const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
		while (i >= 8) /* 8 * 16-bit */
		{
			/* get 8 sint16 into a NEON register. */
			const int16x8_t ints = vld1q_s16((int16_t const *) src);
			/* split int16 to two int32, then convert to float, then multiply to normalize, store. */
			vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
			vst1q_f32(dst + 4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
			i -= 8; src -= 8; dst -= 8;
		}
	}

	src += 7; dst += 7; /* adjust for any scalar finishing. */

	/* Finish off any leftovers with scalar operations. */
	while (i)
	{
		*dst = ((float) *src) * DIVBY32768;
		i--; src--; dst--;
	}
}
/* NEON conversion of signed 32-bit samples to float.
 *
 * src: len signed 32-bit input samples
 * dst: receives len floats
 * len: number of samples to convert
 *
 * Runs front to back: scalar peeling until dst is 16-byte aligned, NEON
 * blocks of 4 samples when src is aligned too, then a scalar loop for
 * the remainder.
 * Every output is (sample >> 8) * DIVBY8388607.
 */
void FAudio_INTERNAL_Convert_S32_To_F32_NEON(
	const int32_t *restrict src,
	float *restrict dst,
	uint32_t len
) {
	/* Unsigned like len, so counts above INT_MAX cannot turn negative. */
	uint32_t i;

	/* Get dst aligned to 16 bytes */
	for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst)
	{
		*dst = ((float) (*src >> 8)) * DIVBY8388607;
	}

	FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

	/* Make sure src is aligned too. */
	if ((((size_t) src) & 15) == 0)
	{
		/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
		const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
		const int32_t *mmsrc = (const int32_t *) src;
		while (i >= 4) /* 4 * sint32 */
		{
			/* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
			vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
			i -= 4; mmsrc += 4; dst += 4;
		}

		src = (const int32_t *) mmsrc;
	}

	/* Finish off any leftovers with scalar operations. */
	while (i)
	{
		*dst = ((float) (*src >> 8)) * DIVBY8388607;
		i--; src++; dst++;
	}
}
379 #endif /* HAVE_NEON_INTRINSICS */
/* SECTION 2: Linear Resamplers */
383 void FAudio_INTERNAL_ResampleGeneric(
384 float *restrict dCache,
385 float *restrict resampleCache,
386 uint64_t *resampleOffset,
387 uint64_t resampleStep,
388 uint64_t toResample,
389 uint8_t channels
391 uint32_t i, j;
392 uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
393 for (i = 0; i < toResample; i += 1)
395 for (j = 0; j < channels; j += 1)
397 /* lerp, then convert to float value */
398 *resampleCache++ = (float) (
399 dCache[j] +
400 (dCache[j + channels] - dCache[j]) *
405 /* Increment fraction offset by the stepping value */
406 *resampleOffset += resampleStep;
407 cur += resampleStep;
409 /* Only increment the sample offset by integer values.
410 * Sometimes this will be 0 until cur accumulates
411 * enough steps, especially for "slow" rates.
413 dCache += (cur >> FIXED_PRECISION) * channels;
415 /* Now that any integer has been added, drop it.
416 * The offset pointer will preserve the total.
423 void FAudio_INTERNAL_ResampleMono_Scalar(
424 float *restrict dCache,
425 float *restrict resampleCache,
426 uint64_t *resampleOffset,
427 uint64_t resampleStep,
428 uint64_t toResample,
429 uint8_t UNUSED
431 uint32_t i;
432 uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
433 for (i = 0; i < toResample; i += 1)
435 /* lerp, then convert to float value */
436 *resampleCache++ = (float) (
437 dCache[0] +
438 (dCache[1] - dCache[0]) *
442 /* Increment fraction offset by the stepping value */
443 *resampleOffset += resampleStep;
444 cur += resampleStep;
446 /* Only increment the sample offset by integer values.
447 * Sometimes this will be 0 until cur accumulates
448 * enough steps, especially for "slow" rates.
450 dCache += (cur >> FIXED_PRECISION);
452 /* Now that any integer has been added, drop it.
453 * The offset pointer will preserve the total.
459 void FAudio_INTERNAL_ResampleStereo_Scalar(
460 float *restrict dCache,
461 float *restrict resampleCache,
462 uint64_t *resampleOffset,
463 uint64_t resampleStep,
464 uint64_t toResample,
465 uint8_t UNUSED
467 uint32_t i;
468 uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
469 for (i = 0; i < toResample; i += 1)
471 /* lerp, then convert to float value */
472 *resampleCache++ = (float) (
473 dCache[0] +
474 (dCache[2] - dCache[0]) *
477 *resampleCache++ = (float) (
478 dCache[1] +
479 (dCache[3] - dCache[1]) *
483 /* Increment fraction offset by the stepping value */
484 *resampleOffset += resampleStep;
485 cur += resampleStep;
487 /* Only increment the sample offset by integer values.
488 * Sometimes this will be 0 until cur accumulates
489 * enough steps, especially for "slow" rates.
491 dCache += (cur >> FIXED_PRECISION) * 2;
493 /* Now that any integer has been added, drop it.
494 * The offset pointer will preserve the total.
/* The SSE2 versions of the resamplers come from @8thMage! */
504 void FAudio_INTERNAL_ResampleMono_SSE2(
505 float *restrict dCache,
506 float *restrict resampleCache,
507 uint64_t *resampleOffset,
508 uint64_t resampleStep,
509 uint64_t toResample,
510 uint8_t UNUSED
512 uint32_t i, header, tail;
513 uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
514 float *dCache_1, *dCache_2, *dCache_3;
515 uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
516 __m128 one_over_fixed_one, half, current_next_0_1, current_next_2_3,
517 current, next, sub, cur_fixed, mul, res;
518 __m128i cur_frac, adder_frac, adder_frac_loop;
520 /* This is the header, the Dest needs to be aligned to 16B */
521 header = (16 - ((size_t) resampleCache) % 16) / 4;
522 if (header == 4)
524 header = 0;
526 for (i = 0; i < header; i += 1)
528 /* lerp, then convert to float value */
529 *resampleCache++ = (float) (
530 dCache[0] +
531 (dCache[1] - dCache[0]) *
532 FIXED_TO_FLOAT(cur_scalar)
535 /* Increment fraction offset by the stepping value */
536 *resampleOffset += resampleStep;
537 cur_scalar += resampleStep;
539 /* Only increment the sample offset by integer values.
540 * Sometimes this will be 0 until cur accumulates
541 * enough steps, especially for "slow" rates.
543 dCache += (cur_scalar >> FIXED_PRECISION);
545 /* Now that any integer has been added, drop it.
546 * The offset pointer will preserve the total.
548 cur_scalar &= FIXED_FRACTION_MASK;
551 toResample -= header;
553 /* initialising the varius cur
554 * cur_frac is the fractional part of cur with 4 samples. as the
555 * fractional part is 32 bit unsigned value, it can be just added
556 * and the modulu operation for keeping the fractional part will be implicit.
557 * the 0.5 is for converting signed values to float (no unsigned convert),
558 * the 0.5 is added later.
560 cur_frac = _mm_set1_epi32(
561 (uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
563 adder_frac = _mm_setr_epi32(
565 (uint32_t) (resampleStep & FIXED_FRACTION_MASK),
566 (uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
567 (uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
569 cur_frac = _mm_add_epi32(cur_frac, adder_frac);
571 /* The various cur_scalar is for the different samples
572 * (1, 2, 3 compared to original cur_scalar = 0)
574 cur_scalar_1 = cur_scalar + resampleStep;
575 cur_scalar_2 = cur_scalar + resampleStep * 2;
576 cur_scalar_3 = cur_scalar + resampleStep * 3;
577 dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
578 dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
579 dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
580 cur_scalar &= FIXED_FRACTION_MASK;
581 cur_scalar_1 &= FIXED_FRACTION_MASK;
582 cur_scalar_2 &= FIXED_FRACTION_MASK;
583 cur_scalar_3 &= FIXED_FRACTION_MASK;
585 /* FIXME: These should be _mm_undefined_ps! */
586 current_next_0_1 = _mm_setzero_ps();
587 current_next_2_3 = _mm_setzero_ps();
589 /* Constants */
590 one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
591 half = _mm_set1_ps(0.5f);
592 adder_frac_loop = _mm_set1_epi32(
593 (uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
596 tail = toResample % 4;
597 for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
599 /* current next holds 2 pairs of the sample and the sample + 1
600 * after that need to seperate them.
603 current_next_0_1 = _mm_loadl_pi(current_next_0_1, (__m64*) dCache);
604 current_next_0_1 = _mm_loadh_pi(current_next_0_1, (__m64*) dCache_1);
605 current_next_2_3 = _mm_loadl_pi(current_next_2_3, (__m64*) dCache_2);
606 current_next_2_3 = _mm_loadh_pi(current_next_2_3, (__m64*) dCache_3);
608 /* Unpack them to have seperate current and next in 2 vectors. */
609 current = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0x88); /* 0b1000 */
610 next = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0xdd); /* 0b1101 */
612 sub = _mm_sub_ps(next, current);
614 /* Convert the fractional part to float and then mul to get the fractions out.
615 * then add back the 0.5 we subtracted before.
617 cur_fixed = _mm_add_ps(
618 _mm_mul_ps(
619 _mm_cvtepi32_ps(cur_frac),
620 one_over_fixed_one
622 half
624 mul = _mm_mul_ps(sub, cur_fixed);
625 res = _mm_add_ps(current, mul);
627 /* Store back */
628 _mm_store_ps(resampleCache, res);
630 /* Update dCaches for next iteration */
631 cur_scalar += resampleStep * 4;
632 cur_scalar_1 += resampleStep * 4;
633 cur_scalar_2 += resampleStep * 4;
634 cur_scalar_3 += resampleStep * 4;
635 dCache = dCache + (cur_scalar >> FIXED_PRECISION);
636 dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
637 dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
638 dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
639 cur_scalar &= FIXED_FRACTION_MASK;
640 cur_scalar_1 &= FIXED_FRACTION_MASK;
641 cur_scalar_2 &= FIXED_FRACTION_MASK;
642 cur_scalar_3 &= FIXED_FRACTION_MASK;
644 cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
646 *resampleOffset += resampleStep * (toResample - tail);
648 /* This is the tail. */
649 for (i = 0; i < tail; i += 1)
651 /* lerp, then convert to float value */
652 *resampleCache++ = (float) (
653 dCache[0] +
654 (dCache[1] - dCache[0]) *
655 FIXED_TO_FLOAT(cur_scalar)
658 /* Increment fraction offset by the stepping value */
659 *resampleOffset += resampleStep;
660 cur_scalar += resampleStep;
662 /* Only increment the sample offset by integer values.
663 * Sometimes this will be 0 until cur accumulates
664 * enough steps, especially for "slow" rates.
666 dCache += (cur_scalar >> FIXED_PRECISION);
668 /* Now that any integer has been added, drop it.
669 * The offset pointer will preserve the total.
671 cur_scalar &= FIXED_FRACTION_MASK;
675 void FAudio_INTERNAL_ResampleStereo_SSE2(
676 float *restrict dCache,
677 float *restrict resampleCache,
678 uint64_t *resampleOffset,
679 uint64_t resampleStep,
680 uint64_t toResample,
681 uint8_t UNUSED
683 uint32_t i, header, tail;
684 uint64_t cur_scalar, cur_scalar_1;
685 float *dCache_1;
686 __m128 one_over_fixed_one, half, current_next_1, current_next_2,
687 current, next, sub, cur_fixed, mul, res;
688 __m128i cur_frac, adder_frac, adder_frac_loop;
690 /* This is the header, the Dest needs to be aligned to 16B */
691 header = (16 - ((size_t) resampleCache) % 16) / 8;
692 if (header == 2)
694 header = 0;
696 cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
697 for (i = 0; i < header; i += 2)
699 /* lerp, then convert to float value */
700 *resampleCache++ = (float) (
701 dCache[0] +
702 (dCache[2] - dCache[0]) *
703 FIXED_TO_FLOAT(cur_scalar)
705 *resampleCache++ = (float) (
706 dCache[1] +
707 (dCache[3] - dCache[1]) *
708 FIXED_TO_FLOAT(cur_scalar)
711 /* Increment fraction offset by the stepping value */
712 *resampleOffset += resampleStep;
713 cur_scalar += resampleStep;
715 /* Only increment the sample offset by integer values.
716 * Sometimes this will be 0 until cur accumulates
717 * enough steps, especially for "slow" rates.
719 dCache += (cur_scalar >> FIXED_PRECISION) * 2;
721 /* Now that any integer has been added, drop it.
722 * The offset pointer will preserve the total.
724 cur_scalar &= FIXED_FRACTION_MASK;
727 toResample -= header;
729 /* initialising the varius cur.
730 * cur_frac holds the fractional part of cur.
731 * to avoid duplication please see the mono part for a thorough
732 * explanation.
734 cur_frac = _mm_set1_epi32(
735 (uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
737 adder_frac = _mm_setr_epi32(
740 (uint32_t) (resampleStep & FIXED_FRACTION_MASK),
741 (uint32_t) (resampleStep & FIXED_FRACTION_MASK)
743 cur_frac = _mm_add_epi32(cur_frac, adder_frac);
745 /* dCache_1 is the pointer for dcache in the next resample pos. */
746 cur_scalar_1 = cur_scalar + resampleStep;
747 dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
748 cur_scalar_1 &= FIXED_FRACTION_MASK;
750 one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
751 half = _mm_set1_ps(0.5f);
752 adder_frac_loop = _mm_set1_epi32(
753 (uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
756 tail = toResample % 2;
757 for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
759 /* Current_next_1 and current_next_2 each holds 4 src
760 * sample points for getting 4 dest resample point at the end.
761 * current_next_1 holds:
762 * (current_ch_1, current_ch_2, next_ch_1, next_ch_2)
763 * for the first resample position, while current_next_2 holds
764 * the same for the 2nd resample position
766 current_next_1 = _mm_loadu_ps(dCache); /* A1B1A2B2 */
767 current_next_2 = _mm_loadu_ps(dCache_1); /* A3B3A4B4 */
769 /* Unpack them to get the current and the next in seperate vectors. */
770 current = _mm_castpd_ps(
771 _mm_unpacklo_pd(
772 _mm_castps_pd(current_next_1),
773 _mm_castps_pd(current_next_2)
776 next = _mm_castpd_ps(
777 _mm_unpackhi_pd(
778 _mm_castps_pd(current_next_1),
779 _mm_castps_pd(current_next_2)
783 sub = _mm_sub_ps(next, current);
785 /* Adding the 0.5 back.
786 * See mono explanation for more elaborate explanation.
788 cur_fixed = _mm_add_ps(
789 _mm_mul_ps(
790 _mm_cvtepi32_ps(cur_frac),
791 one_over_fixed_one
793 half
795 mul = _mm_mul_ps(sub, cur_fixed);
796 res = _mm_add_ps(current, mul);
798 /* Store the results */
799 _mm_store_ps(resampleCache, res);
801 /* Update dCaches for next iteration */
802 cur_scalar += resampleStep * 2;
803 cur_scalar_1 += resampleStep * 2;
804 dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
805 dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
806 cur_scalar &= FIXED_FRACTION_MASK;
807 cur_scalar_1 &= FIXED_FRACTION_MASK;
809 cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
811 *resampleOffset += resampleStep * (toResample - tail);
813 /* This is the tail. */
814 for (i = 0; i < tail; i += 1)
816 /* lerp, then convert to float value */
817 *resampleCache++ = (float) (
818 dCache[0] +
819 (dCache[2] - dCache[0]) *
820 FIXED_TO_FLOAT(cur_scalar)
822 *resampleCache++ = (float) (
823 dCache[1] +
824 (dCache[3] - dCache[1]) *
825 FIXED_TO_FLOAT(cur_scalar)
828 /* Increment fraction offset by the stepping value */
829 *resampleOffset += resampleStep;
830 cur_scalar += resampleStep;
832 /* Only increment the sample offset by integer values.
833 * Sometimes this will be 0 until cur accumulates
834 * enough steps, especially for "slow" rates.
836 dCache += (cur_scalar >> FIXED_PRECISION) * 2;
838 /* Now that any integer has been added, drop it.
839 * The offset pointer will preserve the total.
841 cur_scalar &= FIXED_FRACTION_MASK;
844 #endif /* HAVE_SSE2_INTRINSICS */
/* NEON linear resampler for mono audio.
 *
 * dCache:         input samples; sample N+1 is read next to sample N
 *                 for the interpolation
 * resampleCache:  receives toResample output samples
 * resampleOffset: fixed-point read position; advanced by resampleStep
 *                 for every output sample
 * resampleStep:   fixed-point input advance per output sample
 * toResample:     number of output samples to produce
 * UNUSED:         ignored; keeps the signature in line with the generic
 *                 resampler
 *
 * Three phases: a scalar header until resampleCache is 16-byte aligned,
 * a SIMD loop producing 4 samples per iteration, and a scalar tail for
 * the last (toResample % 4) samples.
 */
void FAudio_INTERNAL_ResampleMono_NEON(
	float *restrict dCache,
	float *restrict resampleCache,
	uint64_t *resampleOffset,
	uint64_t resampleStep,
	uint64_t toResample,
	uint8_t UNUSED
) {
	/* 64-bit like toResample, so the counters cannot wrap early */
	uint64_t i, header, tail;
	uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
	float *dCache_1, *dCache_2, *dCache_3;
	uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
	float32x4_t one_over_fixed_one, half, current_next_0_1, current_next_2_3,
		current, next, sub, cur_fixed, mul, res;
	int32x4_t cur_frac, adder_frac, adder_frac_loop;

	/* This is the header, the Dest needs to be aligned to 16B */
	header = (16 - ((size_t) resampleCache) % 16) / 4;
	if (header == 4)
	{
		header = 0;
	}

	/* Never produce more than was asked for: without this clamp a short
	 * request would overrun the output and make `toResample -= header`
	 * wrap around.
	 */
	if (header > toResample)
	{
		header = toResample;
	}
	for (i = 0; i < header; i += 1)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[1] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION);

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}

	toResample -= header;

	/* Initialising the various cur.
	 * cur_frac is the fractional part of cur for 4 samples. As the
	 * fractional part is a 32-bit unsigned value, it can just be added
	 * and the modulo operation for keeping the fractional part is implicit.
	 * The 0.5 is subtracted so the value can be converted as a signed
	 * integer (there is no unsigned convert); it is added back later.
	 */
	cur_frac = vdupq_n_s32(
		(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
	);
	int32_t __attribute__((aligned(16))) data[4] =
	{
		0,
		(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
		(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
		(uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
	};
	adder_frac = vld1q_s32(data);
	cur_frac = vaddq_s32(cur_frac, adder_frac);

	/* The various cur_scalar are for the different samples
	 * (1, 2, 3 compared to original cur_scalar = 0)
	 */
	cur_scalar_1 = cur_scalar + resampleStep;
	cur_scalar_2 = cur_scalar + resampleStep * 2;
	cur_scalar_3 = cur_scalar + resampleStep * 3;
	dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
	dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
	dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
	cur_scalar &= FIXED_FRACTION_MASK;
	cur_scalar_1 &= FIXED_FRACTION_MASK;
	cur_scalar_2 &= FIXED_FRACTION_MASK;
	cur_scalar_3 &= FIXED_FRACTION_MASK;

	/* Constants */
	one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
	half = vdupq_n_f32(0.5f);
	adder_frac_loop = vdupq_n_s32(
		(uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
	);

	tail = toResample % 4;
	for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
	{
		/* current_next holds 2 pairs of the sample and the sample + 1;
		 * they need to be separated afterwards.
		 */
		current_next_0_1 = vcombine_f32(
			vld1_f32(dCache),
			vld1_f32(dCache_1)
		);
		current_next_2_3 = vcombine_f32(
			vld1_f32(dCache_2),
			vld1_f32(dCache_3)
		);

		/* Unpack them to have separate current and next in 2 vectors. */
		current = vuzp1q_f32(current_next_0_1, current_next_2_3);
		next = vuzp2q_f32(current_next_0_1, current_next_2_3);

		sub = vsubq_f32(next, current);

		/* Convert the fractional part to float and then mul to get the fractions out,
		 * then add back the 0.5 we subtracted before.
		 */
		cur_fixed = vaddq_f32(
			vmulq_f32(
				vcvtq_f32_s32(cur_frac),
				one_over_fixed_one
			),
			half
		);
		mul = vmulq_f32(sub, cur_fixed);
		res = vaddq_f32(current, mul);

		/* Store back */
		vst1q_f32(resampleCache, res);

		/* Update dCaches for next iteration */
		cur_scalar += resampleStep * 4;
		cur_scalar_1 += resampleStep * 4;
		cur_scalar_2 += resampleStep * 4;
		cur_scalar_3 += resampleStep * 4;
		dCache = dCache + (cur_scalar >> FIXED_PRECISION);
		dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
		dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
		dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
		cur_scalar &= FIXED_FRACTION_MASK;
		cur_scalar_1 &= FIXED_FRACTION_MASK;
		cur_scalar_2 &= FIXED_FRACTION_MASK;
		cur_scalar_3 &= FIXED_FRACTION_MASK;

		cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
	}
	*resampleOffset += resampleStep * (toResample - tail);

	/* This is the tail. */
	for (i = 0; i < tail; i += 1)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[1] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION);

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}
}
/* Linear-interpolating resampler for interleaved stereo data, NEON version.
 *
 * dCache:         interleaved source frames; each output frame reads the
 *                 current frame (dCache[0], dCache[1]) and the following
 *                 one (dCache[2], dCache[3])
 * resampleCache:  destination, receives toResample interleaved frames
 * resampleOffset: fixed-point read position; only its fractional part is
 *                 used here, and it is advanced by resampleStep for every
 *                 frame that gets written
 * resampleStep:   fixed-point increment per output frame
 * toResample:     number of output frames to produce
 * channels:       not used by this implementation
 */
void FAudio_INTERNAL_ResampleStereo_NEON(
	float *restrict dCache,
	float *restrict resampleCache,
	uint64_t *resampleOffset,
	uint64_t resampleStep,
	uint64_t toResample,
	uint8_t channels
) {
	uint32_t i, header, tail;
	uint64_t cur_scalar, cur_scalar_1;
	float *dCache_1;
	float32x4_t one_over_fixed_one, half, current, next, sub, cur_fixed, mul, res;
	int32x4_t cur_frac, adder_frac, adder_frac_loop;

	/* Nothing to do. Bailing out here keeps the alignment header below
	 * from writing a frame nobody asked for, and keeps
	 * "toResample -= header" from wrapping around.
	 */
	if (toResample == 0)
	{
		return;
	}

	/* This is the header, the Dest needs to be aligned to 16B.
	 * One stereo frame is 8 bytes, so at most one frame is needed.
	 */
	header = (16 - ((size_t) resampleCache) % 16) / 8;
	if (header == 2)
	{
		header = 0;
	}
	cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
	for (i = 0; i < header; i += 2)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[2] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);
		*resampleCache++ = (float) (
			dCache[1] +
			(dCache[3] - dCache[1]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION) * 2;

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}

	toResample -= header;

	/* Initialising the various cur values.
	 * cur_frac holds the fractional part of cur, biased by -0.5 so it
	 * fits a signed lane; the 0.5 is added back after the float
	 * conversion inside the loop.
	 */
	cur_frac = vdupq_n_s32(
		(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
	);
	/* Lanes 0-1 belong to the first output frame and need no extra
	 * offset; lanes 2-3 belong to the second one, one step further.
	 */
	int32_t __attribute__((aligned(16))) data[4] =
	{
		0,
		0,
		(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
		(uint32_t) (resampleStep & FIXED_FRACTION_MASK)
	};
	adder_frac = vld1q_s32(data);
	cur_frac = vaddq_s32(cur_frac, adder_frac);

	/* dCache_1 is the pointer for dcache in the next resample pos. */
	cur_scalar_1 = cur_scalar + resampleStep;
	dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
	cur_scalar_1 &= FIXED_FRACTION_MASK;

	one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
	half = vdupq_n_f32(0.5f);
	adder_frac_loop = vdupq_n_s32(
		(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
	);

	/* Two output frames (four floats) per iteration. */
	tail = toResample % 2;
	for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
	{
		/* current holds (current_ch_1, current_ch_2) for the first
		 * resample position followed by the same pair for the second
		 * position; next holds the matching following source frames.
		 */
		current = vcombine_f32(
			vld1_f32(dCache), /* A1B1 */
			vld1_f32(dCache_1) /* A3B3 */
		);
		next = vcombine_f32(
			vld1_f32(dCache + 2), /* A2B2 */
			vld1_f32(dCache_1 + 2) /* A4B4 */
		);

		sub = vsubq_f32(next, current);

		/* Convert the biased fraction to float and add the 0.5 back. */
		cur_fixed = vaddq_f32(
			vmulq_f32(
				vcvtq_f32_s32(cur_frac),
				one_over_fixed_one
			),
			half
		);
		mul = vmulq_f32(sub, cur_fixed);
		res = vaddq_f32(current, mul);

		/* Store the results */
		vst1q_f32(resampleCache, res);

		/* Update dCaches for next iteration */
		cur_scalar += resampleStep * 2;
		cur_scalar_1 += resampleStep * 2;
		dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
		dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
		cur_scalar &= FIXED_FRACTION_MASK;
		cur_scalar_1 &= FIXED_FRACTION_MASK;

		cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
	}
	*resampleOffset += resampleStep * (toResample - tail);

	/* This is the tail. */
	for (i = 0; i < tail; i += 1)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[2] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);
		*resampleCache++ = (float) (
			dCache[1] +
			(dCache[3] - dCache[1]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION) * 2;

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}
}
1181 #endif /* HAVE_NEON_INTRINSICS */
1183 /* SECTION 3: Amplifiers */
/* Plain C amplifier: scales the first totalSamples floats of output by
 * volume, in place.
 */
void FAudio_INTERNAL_Amplify_Scalar(
	float* output,
	uint32_t totalSamples,
	float volume
) {
	float *sample = output;
	uint32_t remaining = totalSamples;

	while (remaining > 0)
	{
		*sample *= volume;
		sample += 1;
		remaining -= 1;
	}
}
1199 /* The SSE2 version of the amplifier comes from @8thMage! */
/* SSE2 amplifier: scales the first totalSamples floats of output by volume,
 * in place.
 *
 * The work is split in three parts:
 *  - a scalar header that runs until output + i reaches a 16-byte boundary,
 *  - an aligned 4-wide SIMD body,
 *  - a scalar tail for the remaining (fewer than 4) samples.
 *
 * The header is clamped to totalSamples so that short, misaligned buffers
 * are never written past their end and the unsigned loop bounds below can
 * not wrap around.
 */
void FAudio_INTERNAL_Amplify_SSE2(
	float* output,
	uint32_t totalSamples,
	float volume
) {
	uint32_t i;
	uint32_t header = (16 - (((size_t) output) % 16)) / 4;
	uint32_t tail;
	__m128 volumeVec, outVec;

	/* Already aligned: no scalar header needed. */
	if (header == 4)
	{
		header = 0;
	}

	/* Fewer samples than it takes to reach alignment: everything is
	 * handled by the header loop.
	 */
	if (header > totalSamples)
	{
		header = totalSamples;
	}

	/* header <= totalSamples here, so this can not underflow. */
	tail = (totalSamples - header) % 4;

	for (i = 0; i < header; i += 1)
	{
		output[i] *= volume;
	}

	/* output + header is 16-byte aligned, as _mm_load_ps/_mm_store_ps
	 * require, and the span up to totalSamples - tail is a multiple of 4.
	 */
	volumeVec = _mm_set1_ps(volume);
	for (i = header; i < totalSamples - tail; i += 4)
	{
		outVec = _mm_load_ps(output + i);
		outVec = _mm_mul_ps(outVec, volumeVec);
		_mm_store_ps(output + i, outVec);
	}

	for (i = totalSamples - tail; i < totalSamples; i += 1)
	{
		output[i] *= volume;
	}
}
1238 #endif /* HAVE_SSE2_INTRINSICS */
/* NEON amplifier: scales the first totalSamples floats of output by volume,
 * in place.
 *
 * The work is split in three parts:
 *  - a scalar header that runs until output + i reaches a 16-byte boundary,
 *  - a 4-wide SIMD body,
 *  - a scalar tail for the remaining (fewer than 4) samples.
 *
 * The header is clamped to totalSamples so that short, misaligned buffers
 * are never written past their end and the unsigned loop bounds below can
 * not wrap around.
 */
void FAudio_INTERNAL_Amplify_NEON(
	float* output,
	uint32_t totalSamples,
	float volume
) {
	uint32_t i;
	uint32_t header = (16 - (((size_t) output) % 16)) / 4;
	uint32_t tail;
	float32x4_t volumeVec, outVec;

	/* Already aligned: no scalar header needed. */
	if (header == 4)
	{
		header = 0;
	}

	/* Fewer samples than it takes to reach alignment: everything is
	 * handled by the header loop.
	 */
	if (header > totalSamples)
	{
		header = totalSamples;
	}

	/* header <= totalSamples here, so this can not underflow. */
	tail = (totalSamples - header) % 4;

	for (i = 0; i < header; i += 1)
	{
		output[i] *= volume;
	}

	/* The span from header up to totalSamples - tail is a multiple of 4. */
	volumeVec = vdupq_n_f32(volume);
	for (i = header; i < totalSamples - tail; i += 4)
	{
		outVec = vld1q_f32(output + i);
		outVec = vmulq_f32(outVec, volumeVec);
		vst1q_f32(output + i, outVec);
	}

	for (i = totalSamples - tail; i < totalSamples; i += 1)
	{
		output[i] *= volume;
	}
}
1277 #endif /* HAVE_NEON_INTRINSICS */
1279 /* SECTION 4: Mixer Functions */
/* Generic matrix mixer, plain C.
 *
 * For each of the toMix frames, every destination channel accumulates the
 * sum of all source channels weighted by its row of the coefficient matrix
 * (dstChans rows of srcChans entries, row-major). Results are added onto
 * whatever dst already contains.
 */
void FAudio_INTERNAL_Mix_Generic_Scalar(
	uint32_t toMix,
	uint32_t srcChans,
	uint32_t dstChans,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t frame, outCh, inCh;

	for (frame = 0; frame < toMix; frame += 1)
	{
		/* Coefficient row belonging to the current output channel */
		const float *row = coefficients;

		for (outCh = 0; outCh < dstChans; outCh += 1, row += srcChans)
		{
			for (inCh = 0; inCh < srcChans; inCh += 1)
			{
				dst[outCh] += src[inCh] * row[inCh];
			}
		}

		src += srcChans;
		dst += dstChans;
	}
}
/* SSE horizontal add by Peter Cordes, CC-BY-SA.
 * From https://stackoverflow.com/a/35270026
 *
 * Returns (v0 + v1) + (v2 + v3) for the four lanes of v.
 */
static inline float FAudio_simd_hadd(__m128 v)
{
	/* Swap the lanes inside each pair: [v1, v0, v3, v2] */
	const __m128 swapped = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));

	/* Lane 0 now holds v0 + v1, lane 2 holds v2 + v3 */
	const __m128 pairSums = _mm_add_ps(v, swapped);

	/* Move the upper pair's sum down into lane 0 */
	const __m128 upperSum = _mm_movehl_ps(swapped, pairSums);

	return _mm_cvtss_f32(_mm_add_ss(pairSums, upperSum));
}
/* Generic matrix mixer, SSE2 version.
 *
 * Same contract as the scalar generic mixer: for each of the toMix frames,
 * every destination channel accumulates the source channels weighted by its
 * row of the row-major coefficient matrix. Source channels are consumed four
 * at a time with unaligned loads; whatever is left over (fewer than four) is
 * handled one by one.
 */
void FAudio_INTERNAL_Mix_Generic_SSE2(
	uint32_t toMix,
	uint32_t srcChans,
	uint32_t dstChans,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	/* Number of leading source channels covered by whole 4-wide groups */
	const uint32_t simdChans = srcChans - (srcChans % 4);
	uint32_t frame, outCh, inCh;

	for (frame = 0; frame < toMix; frame += 1)
	{
		/* Coefficient row belonging to the current output channel */
		const float *row = coefficients;

		for (outCh = 0; outCh < dstChans; outCh += 1, row += srcChans)
		{
			/* do SIMD */
			for (inCh = 0; inCh < simdChans; inCh += 4)
			{
				const __m128 gains = _mm_loadu_ps(row + inCh);
				const __m128 samples = _mm_loadu_ps(src + inCh);
				dst[outCh] += FAudio_simd_hadd(_mm_mul_ps(samples, gains));
			}

			/* do scalar */
			for (; inCh < srcChans; inCh += 1)
			{
				dst[outCh] += src[inCh] * row[inCh];
			}
		}

		src += srcChans;
		dst += dstChans;
	}
}
1345 #endif /* HAVE_SSE2_INTRINSICS */
/* Mono -> mono mixer: adds each source sample, scaled by coefficients[0],
 * onto the matching destination sample. The two UNUSED parameters only
 * exist to match the generic mixer signature.
 */
void FAudio_INTERNAL_Mix_1in_1out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t frame;

	for (frame = 0; frame < toMix; frame += 1)
	{
		dst[frame] += src[frame] * coefficients[0];
	}
}
/* Mono -> stereo mixer: every source sample is added to both channels of
 * the matching destination frame, scaled by coefficients[0] and
 * coefficients[1] respectively. The two UNUSED parameters only exist to
 * match the generic mixer signature.
 */
void FAudio_INTERNAL_Mix_1in_2out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t frame;

	for (frame = 0; frame < toMix; frame += 1)
	{
		float *out = dst + frame * 2;

		out[0] += src[frame] * coefficients[0];
		out[1] += src[frame] * coefficients[1];
	}
}
/* Mono -> 6 channel mixer: every source sample is added to all six channels
 * of the matching destination frame, scaled by coefficients[0..5]. The two
 * UNUSED parameters only exist to match the generic mixer signature.
 */
void FAudio_INTERNAL_Mix_1in_6out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t frame, ch;

	for (frame = 0; frame < toMix; frame += 1)
	{
		for (ch = 0; ch < 6; ch += 1)
		{
			dst[ch] += src[0] * coefficients[ch];
		}

		src += 1;
		dst += 6;
	}
}
/* Mono -> 8 channel mixer: every source sample is added to all eight
 * channels of the matching destination frame, scaled by coefficients[0..7].
 * The two UNUSED parameters only exist to match the generic mixer signature.
 */
void FAudio_INTERNAL_Mix_1in_8out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t frame, ch;

	for (frame = 0; frame < toMix; frame += 1)
	{
		for (ch = 0; ch < 8; ch += 1)
		{
			dst[ch] += src[0] * coefficients[ch];
		}

		src += 1;
		dst += 8;
	}
}
/* Stereo -> mono mixer: each destination sample accumulates the two source
 * channels of the matching frame, weighted by coefficients[0] and
 * coefficients[1]. The two UNUSED parameters only exist to match the generic
 * mixer signature.
 */
void FAudio_INTERNAL_Mix_2in_1out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t frame;

	for (frame = 0; frame < toMix; frame += 1)
	{
		const float *in = src + frame * 2;

		dst[frame] += (
			(in[0] * coefficients[0]) +
			(in[1] * coefficients[1])
		);
	}
}
/* Stereo -> stereo mixer: each destination channel accumulates both source
 * channels of the matching frame, weighted by its own pair of coefficients
 * (coefficients[0..1] for channel 0, coefficients[2..3] for channel 1). The
 * two UNUSED parameters only exist to match the generic mixer signature.
 */
void FAudio_INTERNAL_Mix_2in_2out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t frame, ch;

	for (frame = 0; frame < toMix; frame += 1)
	{
		for (ch = 0; ch < 2; ch += 1)
		{
			dst[ch] += (
				(src[0] * coefficients[ch * 2]) +
				(src[1] * coefficients[ch * 2 + 1])
			);
		}

		src += 2;
		dst += 2;
	}
}
/* Stereo -> 6 channel mixer: destination channel n of each frame accumulates
 * both source channels of the matching frame, weighted by
 * coefficients[2n] and coefficients[2n + 1]. The two UNUSED parameters only
 * exist to match the generic mixer signature.
 */
void FAudio_INTERNAL_Mix_2in_6out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t frame, ch;

	for (frame = 0; frame < toMix; frame += 1)
	{
		for (ch = 0; ch < 6; ch += 1)
		{
			dst[ch] += (
				(src[0] * coefficients[ch * 2]) +
				(src[1] * coefficients[ch * 2 + 1])
			);
		}

		src += 2;
		dst += 6;
	}
}
/* Stereo -> 8 channel mixer: destination channel n of each frame accumulates
 * both source channels of the matching frame, weighted by
 * coefficients[2n] and coefficients[2n + 1]. The two UNUSED parameters only
 * exist to match the generic mixer signature.
 */
void FAudio_INTERNAL_Mix_2in_8out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t frame, ch;

	for (frame = 0; frame < toMix; frame += 1)
	{
		for (ch = 0; ch < 8; ch += 1)
		{
			dst[ch] += (
				(src[0] * coefficients[ch * 2]) +
				(src[1] * coefficients[ch * 2 + 1])
			);
		}

		src += 2;
		dst += 8;
	}
}
/* SECTION 5: InitSIMDFunctions. Assigns based on SSE2/NEON support. */

/* The pointers below are the entry points for the routines in this file.
 * They are filled in by FAudio_INTERNAL_InitSIMDFunctions with the SSE2,
 * NEON or scalar implementation.
 */

/* Sample format converters: src -> float dst, len elements */
void (*FAudio_INTERNAL_Convert_U8_To_F32)(
	const uint8_t *restrict src,
	float *restrict dst,
	uint32_t len
);
void (*FAudio_INTERNAL_Convert_S16_To_F32)(
	const int16_t *restrict src,
	float *restrict dst,
	uint32_t len
);
void (*FAudio_INTERNAL_Convert_S32_To_F32)(
	const int32_t *restrict src,
	float *restrict dst,
	uint32_t len
);

/* Mono and stereo resamplers */
FAudioResampleCallback FAudio_INTERNAL_ResampleMono;
FAudioResampleCallback FAudio_INTERNAL_ResampleStereo;

/* In-place amplifier: scales totalSamples floats of output by volume */
void (*FAudio_INTERNAL_Amplify)(
	float *output,
	uint32_t totalSamples,
	float volume
);

/* Mixer for arbitrary source/destination channel counts */
FAudioMixCallback FAudio_INTERNAL_Mix_Generic;
/* Points the converter, resampler, amplifier and generic mixer entry points
 * at the best implementation compiled into this build.
 *
 * hasSSE2/hasNEON: non-zero when the caller detected the matching
 * instruction set at runtime. SSE2 is preferred, then NEON, then the plain
 * C versions. There is no NEON generic mixer, so the NEON path uses the
 * scalar one.
 */
void FAudio_INTERNAL_InitSIMDFunctions(uint8_t hasSSE2, uint8_t hasNEON)
{
#if HAVE_SSE2_INTRINSICS
	if (hasSSE2)
	{
		/* Mixing and amplification */
		FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_SSE2;
		FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_SSE2;

		/* Resampling */
		FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_SSE2;
		FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_SSE2;

		/* Format conversion */
		FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_SSE2;
		FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_SSE2;
		FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_SSE2;
		return;
	}
#endif
#if HAVE_NEON_INTRINSICS
	if (hasNEON)
	{
		/* Mixing and amplification */
		FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_Scalar;
		FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_NEON;

		/* Resampling */
		FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_NEON;
		FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_NEON;

		/* Format conversion */
		FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_NEON;
		FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_NEON;
		FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_NEON;
		return;
	}
#endif
	/* NOTE(review): guard name taken from upstream FAudio; confirm it
	 * matches the macro defined in SECTION 0 of this file.
	 */
#if NEED_SCALAR_CONVERTER_FALLBACKS
	/* Mixing and amplification */
	FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_Scalar;
	FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_Scalar;

	/* Resampling */
	FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_Scalar;
	FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_Scalar;

	/* Format conversion */
	FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_Scalar;
	FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_Scalar;
	FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_Scalar;
#else
	FAudio_assert(0 && "Need converter functions!");
#endif
}
1616 /* vim: set noexpandtab shiftwidth=8 tabstop=8: */