1 /* FAudio - XAudio Reimplementation for FNA
3 * Copyright (c) 2011-2024 Ethan Lee, Luigi Auriemma, and the MonoGame Team
5 * This software is provided 'as-is', without any express or implied warranty.
6 * In no event will the authors be held liable for any damages arising from
7 * the use of this software.
9 * Permission is granted to anyone to use this software for any purpose,
10 * including commercial applications, and to alter it and redistribute it
11 * freely, subject to the following restrictions:
13 * 1. The origin of this software must not be misrepresented; you must not
14 * claim that you wrote the original software. If you use this software in a
15 * product, an acknowledgment in the product documentation would be
16 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
23 * Ethan "flibitijibibo" Lee <flibitijibibo@flibitijibibo.com>
27 #include "FAudio_internal.h"
29 /* SECTION 0: SSE/NEON Detection */
31 /* The SSE/NEON detection comes from MojoAL:
32 * https://hg.icculus.org/icculus/mojoAL/file/default/mojoal.c
35 #if defined(__x86_64__) || defined(_M_X64)
36 /* Some platforms fail to define this... */
41 /* x86_64 guarantees SSE2. */
42 #define NEED_SCALAR_CONVERTER_FALLBACKS 0
43 #elif defined(__aarch64__) || defined(_M_ARM64)
44 /* Some platforms fail to define this... */
46 #define __ARM_NEON__ 1
49 /* AArch64 guarantees NEON. */
50 #define NEED_SCALAR_CONVERTER_FALLBACKS 0
51 #elif __MACOSX__ && !defined(__POWERPC__)
52 /* Some build systems may need to specify this. */
53 #if !defined(__SSE2__) && !defined(__ARM_NEON__)
54 #error macOS does not have SSE2/NEON? Bad compiler?
57 /* Mac OS X/Intel guarantees SSE2. */
58 #define NEED_SCALAR_CONVERTER_FALLBACKS 0
60 /* Need plain C implementations to support all other hardware */
61 #define NEED_SCALAR_CONVERTER_FALLBACKS 1
64 /* Our NEON paths require AArch64, don't check __ARM_NEON__ here */
65 #if defined(__aarch64__) || defined(_M_ARM64)
67 #define HAVE_NEON_INTRINSICS 1
72 #include <emmintrin.h>
73 #define HAVE_SSE2_INTRINSICS 1
76 /* SECTION 1: Type Converters */
78 /* The SSE/NEON converters are based on SDL_audiotypecvt:
79 * https://hg.libsdl.org/SDL/file/default/src/audio/SDL_audiotypecvt.c
/* Reciprocal scale factors used by the integer-to-float converters:
 * multiplying by these replaces a per-sample division.
 */
#define DIVBY128     0.0078125f               /* 1 / 128 */
#define DIVBY32768   0.000030517578125f       /* 1 / 32768 */
#define DIVBY8388607 0.00000011920930376163766f /* 1 / 8388607 (2^23 - 1) */
86 #if NEED_SCALAR_CONVERTER_FALLBACKS
87 void FAudio_INTERNAL_Convert_U8_To_F32_Scalar(
88 const uint8_t *restrict src
,
93 for (i
= 0; i
< len
; i
+= 1)
95 *dst
++ = (*src
++ * DIVBY128
) - 1.0f
;
99 void FAudio_INTERNAL_Convert_S16_To_F32_Scalar(
100 const int16_t *restrict src
,
105 for (i
= 0; i
< len
; i
+= 1)
107 *dst
++ = *src
++ * DIVBY32768
;
111 void FAudio_INTERNAL_Convert_S32_To_F32_Scalar(
112 const int32_t *restrict src
,
117 for (i
= 0; i
< len
; i
+= 1)
119 *dst
++ = (*src
++ >> 8) * DIVBY8388607
;
122 #endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
124 #if HAVE_SSE2_INTRINSICS
125 void FAudio_INTERNAL_Convert_U8_To_F32_SSE2(
126 const uint8_t *restrict src
,
134 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
135 for (i
= len
; i
&& (((size_t) (dst
-15)) & 15); --i
, --src
, --dst
) {
136 *dst
= (((float) *src
) * DIVBY128
) - 1.0f
;
139 src
-= 15; dst
-= 15; /* adjust to read SSE blocks from the start. */
140 FAudio_assert(!i
|| ((((size_t) dst
) & 15) == 0));
142 /* Make sure src is aligned too. */
143 if ((((size_t) src
) & 15) == 0) {
144 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
145 const __m128i
*mmsrc
= (const __m128i
*) src
;
146 const __m128i zero
= _mm_setzero_si128();
147 const __m128 divby128
= _mm_set1_ps(DIVBY128
);
148 const __m128 minus1
= _mm_set1_ps(-1.0f
);
149 while (i
>= 16) { /* 16 * 8-bit */
150 const __m128i bytes
= _mm_load_si128(mmsrc
); /* get 16 uint8 into an XMM register. */
151 /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
152 const __m128i shorts1
= _mm_srli_epi16(_mm_slli_epi16(bytes
, 8), 8);
153 /* right-shift-zero-extend gets us uint16 with the other set of values. */
154 const __m128i shorts2
= _mm_srli_epi16(bytes
, 8);
155 /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
156 /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
157 const __m128 floats1
= _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1
, zero
)), divby128
), minus1
);
158 const __m128 floats2
= _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2
, zero
)), divby128
), minus1
);
159 const __m128 floats3
= _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1
, zero
)), divby128
), minus1
);
160 const __m128 floats4
= _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2
, zero
)), divby128
), minus1
);
161 /* Interleave back into correct order, store. */
162 _mm_store_ps(dst
, _mm_unpacklo_ps(floats1
, floats2
));
163 _mm_store_ps(dst
+4, _mm_unpackhi_ps(floats1
, floats2
));
164 _mm_store_ps(dst
+8, _mm_unpacklo_ps(floats3
, floats4
));
165 _mm_store_ps(dst
+12, _mm_unpackhi_ps(floats3
, floats4
));
166 i
-= 16; mmsrc
--; dst
-= 16;
169 src
= (const uint8_t *) mmsrc
;
172 src
+= 15; dst
+= 15; /* adjust for any scalar finishing. */
174 /* Finish off any leftovers with scalar operations. */
176 *dst
= (((float) *src
) * DIVBY128
) - 1.0f
;
181 void FAudio_INTERNAL_Convert_S16_To_F32_SSE2(
182 const int16_t *restrict src
,
190 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
191 for (i
= len
; i
&& (((size_t) (dst
-7)) & 15); --i
, --src
, --dst
) {
192 *dst
= ((float) *src
) * DIVBY32768
;
195 src
-= 7; dst
-= 7; /* adjust to read SSE blocks from the start. */
196 FAudio_assert(!i
|| ((((size_t) dst
) & 15) == 0));
198 /* Make sure src is aligned too. */
199 if ((((size_t) src
) & 15) == 0) {
200 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
201 const __m128 divby32768
= _mm_set1_ps(DIVBY32768
);
202 while (i
>= 8) { /* 8 * 16-bit */
203 const __m128i ints
= _mm_load_si128((__m128i
const *) src
); /* get 8 sint16 into an XMM register. */
204 /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
205 const __m128i a
= _mm_srai_epi32(_mm_slli_epi32(ints
, 16), 16);
206 /* right-shift-sign-extend gets us sint32 with the other set of values. */
207 const __m128i b
= _mm_srai_epi32(ints
, 16);
208 /* Interleave these back into the right order, convert to float, multiply, store. */
209 _mm_store_ps(dst
, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a
, b
)), divby32768
));
210 _mm_store_ps(dst
+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a
, b
)), divby32768
));
211 i
-= 8; src
-= 8; dst
-= 8;
215 src
+= 7; dst
+= 7; /* adjust for any scalar finishing. */
217 /* Finish off any leftovers with scalar operations. */
219 *dst
= ((float) *src
) * DIVBY32768
;
224 void FAudio_INTERNAL_Convert_S32_To_F32_SSE2(
225 const int32_t *restrict src
,
231 /* Get dst aligned to 16 bytes */
232 for (i
= len
; i
&& (((size_t) dst
) & 15); --i
, ++src
, ++dst
) {
233 *dst
= ((float) (*src
>>8)) * DIVBY8388607
;
236 FAudio_assert(!i
|| ((((size_t) dst
) & 15) == 0));
238 /* Make sure src is aligned too. */
239 if ((((size_t) src
) & 15) == 0) {
240 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
241 const __m128 divby8388607
= _mm_set1_ps(DIVBY8388607
);
242 const __m128i
*mmsrc
= (const __m128i
*) src
;
243 while (i
>= 4) { /* 4 * sint32 */
244 /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
245 _mm_store_ps(dst
, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc
), 8)), divby8388607
));
246 i
-= 4; mmsrc
++; dst
+= 4;
248 src
= (const int32_t *) mmsrc
;
251 /* Finish off any leftovers with scalar operations. */
253 *dst
= ((float) (*src
>>8)) * DIVBY8388607
;
257 #endif /* HAVE_SSE2_INTRINSICS */
259 #if HAVE_NEON_INTRINSICS
/* NEON conversion of unsigned 8-bit samples to float.
 *
 * src: len input samples
 * dst: receives len floats
 * len: number of samples to convert
 *
 * Walks both buffers from the last sample back to the first: scalar
 * code until a 16-float block ending at dst is 16-byte aligned, then
 * (only if src is 16-byte aligned as well) blocks of 16 samples, then
 * scalar code for the remainder. Each byte b becomes
 * (b * DIVBY128) - 1.0f.
 */
void FAudio_INTERNAL_Convert_U8_To_F32_NEON(
	const uint8_t *restrict src,
	float *restrict dst,
	uint32_t len
) {
	int i;

	/* Start at the final element of both buffers. */
	src += len - 1;
	dst += len - 1;

	/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
	for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
		*dst = (((float) *src) * DIVBY128) - 1.0f;
	}

	src -= 15; dst -= 15;  /* adjust to read NEON blocks from the start. */
	FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

	/* Make sure src is aligned too. */
	if ((((size_t) src) & 15) == 0) {
		/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
		const uint8_t *mmsrc = (const uint8_t *) src;
		const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
		const float32x4_t negone = vdupq_n_f32(-1.0f);
		while (i >= 16) {   /* 16 * 8-bit */
			const uint8x16_t bytes = vld1q_u8(mmsrc);  /* get 16 uint8 into a NEON register. */
			const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes));  /* convert top 8 bytes to 8 uint16 */
			const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes));   /* convert bottom 8 bytes to 8 uint16 */
			/* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
			vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
			vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
			vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
			vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
			i -= 16; mmsrc -= 16; dst -= 16;
		}

		src = (const uint8_t *) mmsrc;
	}

	src += 15; dst += 15;  /* adjust for any scalar finishing. */

	/* Finish off any leftovers with scalar operations. */
	while (i) {
		*dst = (((float) *src) * DIVBY128) - 1.0f;
		i--; src--; dst--;
	}
}
/* NEON conversion of signed 16-bit samples to float.
 *
 * src: len input samples
 * dst: receives len floats
 * len: number of samples to convert
 *
 * Walks both buffers from the last sample back to the first: scalar
 * code until an 8-float block ending at dst is 16-byte aligned, then
 * (only if src is 16-byte aligned as well) blocks of 8 samples, then
 * scalar code for the remainder. Samples are scaled by DIVBY32768.
 */
void FAudio_INTERNAL_Convert_S16_To_F32_NEON(
	const int16_t *restrict src,
	float *restrict dst,
	uint32_t len
) {
	int i;

	/* Start at the final element of both buffers. */
	src += len - 1;
	dst += len - 1;

	/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
	for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
		*dst = ((float) *src) * DIVBY32768;
	}

	src -= 7; dst -= 7;  /* adjust to read NEON blocks from the start. */
	FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

	/* Make sure src is aligned too. */
	if ((((size_t) src) & 15) == 0) {
		/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
		const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
		while (i >= 8) {   /* 8 * 16-bit */
			const int16x8_t ints = vld1q_s16((int16_t const *) src);  /* get 8 sint16 into a NEON register. */
			/* split int16 to two int32, then convert to float, then multiply to normalize, store. */
			vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
			vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
			i -= 8; src -= 8; dst -= 8;
		}
	}

	src += 7; dst += 7;  /* adjust for any scalar finishing. */

	/* Finish off any leftovers with scalar operations. */
	while (i) {
		*dst = ((float) *src) * DIVBY32768;
		i--; src--; dst--;
	}
}
/* NEON conversion of signed 32-bit samples to float.
 *
 * src: len input samples
 * dst: receives len floats
 * len: number of samples to convert
 *
 * Walks forward: scalar code until dst is 16-byte aligned, then (only
 * if src is 16-byte aligned too) blocks of 4 samples, then a scalar
 * remainder. Samples are shifted right by 8 before scaling by
 * DIVBY8388607.
 */
void FAudio_INTERNAL_Convert_S32_To_F32_NEON(
	const int32_t *restrict src,
	float *restrict dst,
	uint32_t len
) {
	int i;

	/* Get dst aligned to 16 bytes */
	for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
		*dst = ((float) (*src>>8)) * DIVBY8388607;
	}

	FAudio_assert(!i || ((((size_t) dst) & 15) == 0));

	/* Make sure src is aligned too. */
	if ((((size_t) src) & 15) == 0) {
		/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
		const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
		const int32_t *mmsrc = (const int32_t *) src;
		while (i >= 4) {   /* 4 * sint32 */
			/* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
			vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
			i -= 4; mmsrc += 4; dst += 4;
		}
		src = (const int32_t *) mmsrc;
	}

	/* Finish off any leftovers with scalar operations. */
	while (i) {
		*dst = ((float) (*src>>8)) * DIVBY8388607;
		i--; src++; dst++;
	}
}
379 #endif /* HAVE_NEON_INTRINSICS */
381 /* SECTION 2: Linear Resamplers */
383 void FAudio_INTERNAL_ResampleGeneric(
384 float *restrict dCache
,
385 float *restrict resampleCache
,
386 uint64_t *resampleOffset
,
387 uint64_t resampleStep
,
392 uint64_t cur
= *resampleOffset
& FIXED_FRACTION_MASK
;
393 for (i
= 0; i
< toResample
; i
+= 1)
395 for (j
= 0; j
< channels
; j
+= 1)
397 /* lerp, then convert to float value */
398 *resampleCache
++ = (float) (
400 (dCache
[j
+ channels
] - dCache
[j
]) *
405 /* Increment fraction offset by the stepping value */
406 *resampleOffset
+= resampleStep
;
409 /* Only increment the sample offset by integer values.
410 * Sometimes this will be 0 until cur accumulates
411 * enough steps, especially for "slow" rates.
413 dCache
+= (cur
>> FIXED_PRECISION
) * channels
;
415 /* Now that any integer has been added, drop it.
416 * The offset pointer will preserve the total.
418 cur
&= FIXED_FRACTION_MASK
;
422 #if NEED_SCALAR_CONVERTER_FALLBACKS
423 void FAudio_INTERNAL_ResampleMono_Scalar(
424 float *restrict dCache
,
425 float *restrict resampleCache
,
426 uint64_t *resampleOffset
,
427 uint64_t resampleStep
,
432 uint64_t cur
= *resampleOffset
& FIXED_FRACTION_MASK
;
433 for (i
= 0; i
< toResample
; i
+= 1)
435 /* lerp, then convert to float value */
436 *resampleCache
++ = (float) (
438 (dCache
[1] - dCache
[0]) *
442 /* Increment fraction offset by the stepping value */
443 *resampleOffset
+= resampleStep
;
446 /* Only increment the sample offset by integer values.
447 * Sometimes this will be 0 until cur accumulates
448 * enough steps, especially for "slow" rates.
450 dCache
+= (cur
>> FIXED_PRECISION
);
452 /* Now that any integer has been added, drop it.
453 * The offset pointer will preserve the total.
455 cur
&= FIXED_FRACTION_MASK
;
459 void FAudio_INTERNAL_ResampleStereo_Scalar(
460 float *restrict dCache
,
461 float *restrict resampleCache
,
462 uint64_t *resampleOffset
,
463 uint64_t resampleStep
,
468 uint64_t cur
= *resampleOffset
& FIXED_FRACTION_MASK
;
469 for (i
= 0; i
< toResample
; i
+= 1)
471 /* lerp, then convert to float value */
472 *resampleCache
++ = (float) (
474 (dCache
[2] - dCache
[0]) *
477 *resampleCache
++ = (float) (
479 (dCache
[3] - dCache
[1]) *
483 /* Increment fraction offset by the stepping value */
484 *resampleOffset
+= resampleStep
;
487 /* Only increment the sample offset by integer values.
488 * Sometimes this will be 0 until cur accumulates
489 * enough steps, especially for "slow" rates.
491 dCache
+= (cur
>> FIXED_PRECISION
) * 2;
493 /* Now that any integer has been added, drop it.
494 * The offset pointer will preserve the total.
496 cur
&= FIXED_FRACTION_MASK
;
499 #endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
501 /* The SSE2 versions of the resamplers come from @8thMage! */
503 #if HAVE_SSE2_INTRINSICS
504 void FAudio_INTERNAL_ResampleMono_SSE2(
505 float *restrict dCache
,
506 float *restrict resampleCache
,
507 uint64_t *resampleOffset
,
508 uint64_t resampleStep
,
512 uint32_t i
, header
, tail
;
513 uint64_t cur_scalar_1
, cur_scalar_2
, cur_scalar_3
;
514 float *dCache_1
, *dCache_2
, *dCache_3
;
515 uint64_t cur_scalar
= *resampleOffset
& FIXED_FRACTION_MASK
;
516 __m128 one_over_fixed_one
, half
, current_next_0_1
, current_next_2_3
,
517 current
, next
, sub
, cur_fixed
, mul
, res
;
518 __m128i cur_frac
, adder_frac
, adder_frac_loop
;
520 /* This is the header, the Dest needs to be aligned to 16B */
521 header
= (16 - ((size_t) resampleCache
) % 16) / 4;
526 for (i
= 0; i
< header
; i
+= 1)
528 /* lerp, then convert to float value */
529 *resampleCache
++ = (float) (
531 (dCache
[1] - dCache
[0]) *
532 FIXED_TO_FLOAT(cur_scalar
)
535 /* Increment fraction offset by the stepping value */
536 *resampleOffset
+= resampleStep
;
537 cur_scalar
+= resampleStep
;
539 /* Only increment the sample offset by integer values.
540 * Sometimes this will be 0 until cur accumulates
541 * enough steps, especially for "slow" rates.
543 dCache
+= (cur_scalar
>> FIXED_PRECISION
);
545 /* Now that any integer has been added, drop it.
546 * The offset pointer will preserve the total.
548 cur_scalar
&= FIXED_FRACTION_MASK
;
551 toResample
-= header
;
553 /* initialising the varius cur
554 * cur_frac is the fractional part of cur with 4 samples. as the
555 * fractional part is 32 bit unsigned value, it can be just added
556 * and the modulu operation for keeping the fractional part will be implicit.
557 * the 0.5 is for converting signed values to float (no unsigned convert),
558 * the 0.5 is added later.
560 cur_frac
= _mm_set1_epi32(
561 (uint32_t) (cur_scalar
& FIXED_FRACTION_MASK
) - DOUBLE_TO_FIXED(0.5)
563 adder_frac
= _mm_setr_epi32(
565 (uint32_t) (resampleStep
& FIXED_FRACTION_MASK
),
566 (uint32_t) ((resampleStep
* 2) & FIXED_FRACTION_MASK
),
567 (uint32_t) ((resampleStep
* 3) & FIXED_FRACTION_MASK
)
569 cur_frac
= _mm_add_epi32(cur_frac
, adder_frac
);
571 /* The various cur_scalar is for the different samples
572 * (1, 2, 3 compared to original cur_scalar = 0)
574 cur_scalar_1
= cur_scalar
+ resampleStep
;
575 cur_scalar_2
= cur_scalar
+ resampleStep
* 2;
576 cur_scalar_3
= cur_scalar
+ resampleStep
* 3;
577 dCache_1
= dCache
+ (cur_scalar_1
>> FIXED_PRECISION
);
578 dCache_2
= dCache
+ (cur_scalar_2
>> FIXED_PRECISION
);
579 dCache_3
= dCache
+ (cur_scalar_3
>> FIXED_PRECISION
);
580 cur_scalar
&= FIXED_FRACTION_MASK
;
581 cur_scalar_1
&= FIXED_FRACTION_MASK
;
582 cur_scalar_2
&= FIXED_FRACTION_MASK
;
583 cur_scalar_3
&= FIXED_FRACTION_MASK
;
585 /* FIXME: These should be _mm_undefined_ps! */
586 current_next_0_1
= _mm_setzero_ps();
587 current_next_2_3
= _mm_setzero_ps();
590 one_over_fixed_one
= _mm_set1_ps(1.0f
/ FIXED_ONE
);
591 half
= _mm_set1_ps(0.5f
);
592 adder_frac_loop
= _mm_set1_epi32(
593 (uint32_t) ((resampleStep
* 4) & FIXED_FRACTION_MASK
)
596 tail
= toResample
% 4;
597 for (i
= 0; i
< toResample
- tail
; i
+= 4, resampleCache
+= 4)
599 /* current next holds 2 pairs of the sample and the sample + 1
600 * after that need to seperate them.
603 current_next_0_1
= _mm_loadl_pi(current_next_0_1
, (__m64
*) dCache
);
604 current_next_0_1
= _mm_loadh_pi(current_next_0_1
, (__m64
*) dCache_1
);
605 current_next_2_3
= _mm_loadl_pi(current_next_2_3
, (__m64
*) dCache_2
);
606 current_next_2_3
= _mm_loadh_pi(current_next_2_3
, (__m64
*) dCache_3
);
608 /* Unpack them to have seperate current and next in 2 vectors. */
609 current
= _mm_shuffle_ps(current_next_0_1
, current_next_2_3
, 0x88); /* 0b1000 */
610 next
= _mm_shuffle_ps(current_next_0_1
, current_next_2_3
, 0xdd); /* 0b1101 */
612 sub
= _mm_sub_ps(next
, current
);
614 /* Convert the fractional part to float and then mul to get the fractions out.
615 * then add back the 0.5 we subtracted before.
617 cur_fixed
= _mm_add_ps(
619 _mm_cvtepi32_ps(cur_frac
),
624 mul
= _mm_mul_ps(sub
, cur_fixed
);
625 res
= _mm_add_ps(current
, mul
);
628 _mm_store_ps(resampleCache
, res
);
630 /* Update dCaches for next iteration */
631 cur_scalar
+= resampleStep
* 4;
632 cur_scalar_1
+= resampleStep
* 4;
633 cur_scalar_2
+= resampleStep
* 4;
634 cur_scalar_3
+= resampleStep
* 4;
635 dCache
= dCache
+ (cur_scalar
>> FIXED_PRECISION
);
636 dCache_1
= dCache_1
+ (cur_scalar_1
>> FIXED_PRECISION
);
637 dCache_2
= dCache_2
+ (cur_scalar_2
>> FIXED_PRECISION
);
638 dCache_3
= dCache_3
+ (cur_scalar_3
>> FIXED_PRECISION
);
639 cur_scalar
&= FIXED_FRACTION_MASK
;
640 cur_scalar_1
&= FIXED_FRACTION_MASK
;
641 cur_scalar_2
&= FIXED_FRACTION_MASK
;
642 cur_scalar_3
&= FIXED_FRACTION_MASK
;
644 cur_frac
= _mm_add_epi32(cur_frac
, adder_frac_loop
);
646 *resampleOffset
+= resampleStep
* (toResample
- tail
);
648 /* This is the tail. */
649 for (i
= 0; i
< tail
; i
+= 1)
651 /* lerp, then convert to float value */
652 *resampleCache
++ = (float) (
654 (dCache
[1] - dCache
[0]) *
655 FIXED_TO_FLOAT(cur_scalar
)
658 /* Increment fraction offset by the stepping value */
659 *resampleOffset
+= resampleStep
;
660 cur_scalar
+= resampleStep
;
662 /* Only increment the sample offset by integer values.
663 * Sometimes this will be 0 until cur accumulates
664 * enough steps, especially for "slow" rates.
666 dCache
+= (cur_scalar
>> FIXED_PRECISION
);
668 /* Now that any integer has been added, drop it.
669 * The offset pointer will preserve the total.
671 cur_scalar
&= FIXED_FRACTION_MASK
;
675 void FAudio_INTERNAL_ResampleStereo_SSE2(
676 float *restrict dCache
,
677 float *restrict resampleCache
,
678 uint64_t *resampleOffset
,
679 uint64_t resampleStep
,
683 uint32_t i
, header
, tail
;
684 uint64_t cur_scalar
, cur_scalar_1
;
686 __m128 one_over_fixed_one
, half
, current_next_1
, current_next_2
,
687 current
, next
, sub
, cur_fixed
, mul
, res
;
688 __m128i cur_frac
, adder_frac
, adder_frac_loop
;
690 /* This is the header, the Dest needs to be aligned to 16B */
691 header
= (16 - ((size_t) resampleCache
) % 16) / 8;
696 cur_scalar
= *resampleOffset
& FIXED_FRACTION_MASK
;
697 for (i
= 0; i
< header
; i
+= 2)
699 /* lerp, then convert to float value */
700 *resampleCache
++ = (float) (
702 (dCache
[2] - dCache
[0]) *
703 FIXED_TO_FLOAT(cur_scalar
)
705 *resampleCache
++ = (float) (
707 (dCache
[3] - dCache
[1]) *
708 FIXED_TO_FLOAT(cur_scalar
)
711 /* Increment fraction offset by the stepping value */
712 *resampleOffset
+= resampleStep
;
713 cur_scalar
+= resampleStep
;
715 /* Only increment the sample offset by integer values.
716 * Sometimes this will be 0 until cur accumulates
717 * enough steps, especially for "slow" rates.
719 dCache
+= (cur_scalar
>> FIXED_PRECISION
) * 2;
721 /* Now that any integer has been added, drop it.
722 * The offset pointer will preserve the total.
724 cur_scalar
&= FIXED_FRACTION_MASK
;
727 toResample
-= header
;
729 /* initialising the varius cur.
730 * cur_frac holds the fractional part of cur.
731 * to avoid duplication please see the mono part for a thorough
734 cur_frac
= _mm_set1_epi32(
735 (uint32_t) (cur_scalar
& FIXED_FRACTION_MASK
) - DOUBLE_TO_FIXED(0.5)
737 adder_frac
= _mm_setr_epi32(
740 (uint32_t) (resampleStep
& FIXED_FRACTION_MASK
),
741 (uint32_t) (resampleStep
& FIXED_FRACTION_MASK
)
743 cur_frac
= _mm_add_epi32(cur_frac
, adder_frac
);
745 /* dCache_1 is the pointer for dcache in the next resample pos. */
746 cur_scalar_1
= cur_scalar
+ resampleStep
;
747 dCache_1
= dCache
+ (cur_scalar_1
>> FIXED_PRECISION
) * 2;
748 cur_scalar_1
&= FIXED_FRACTION_MASK
;
750 one_over_fixed_one
= _mm_set1_ps(1.0f
/ FIXED_ONE
);
751 half
= _mm_set1_ps(0.5f
);
752 adder_frac_loop
= _mm_set1_epi32(
753 (uint32_t) ((resampleStep
* 2) & FIXED_FRACTION_MASK
)
756 tail
= toResample
% 2;
757 for (i
= 0; i
< toResample
- tail
; i
+= 2, resampleCache
+= 4)
759 /* Current_next_1 and current_next_2 each holds 4 src
760 * sample points for getting 4 dest resample point at the end.
761 * current_next_1 holds:
762 * (current_ch_1, current_ch_2, next_ch_1, next_ch_2)
763 * for the first resample position, while current_next_2 holds
764 * the same for the 2nd resample position
766 current_next_1
= _mm_loadu_ps(dCache
); /* A1B1A2B2 */
767 current_next_2
= _mm_loadu_ps(dCache_1
); /* A3B3A4B4 */
769 /* Unpack them to get the current and the next in seperate vectors. */
770 current
= _mm_castpd_ps(
772 _mm_castps_pd(current_next_1
),
773 _mm_castps_pd(current_next_2
)
776 next
= _mm_castpd_ps(
778 _mm_castps_pd(current_next_1
),
779 _mm_castps_pd(current_next_2
)
783 sub
= _mm_sub_ps(next
, current
);
785 /* Adding the 0.5 back.
786 * See mono explanation for more elaborate explanation.
788 cur_fixed
= _mm_add_ps(
790 _mm_cvtepi32_ps(cur_frac
),
795 mul
= _mm_mul_ps(sub
, cur_fixed
);
796 res
= _mm_add_ps(current
, mul
);
798 /* Store the results */
799 _mm_store_ps(resampleCache
, res
);
801 /* Update dCaches for next iteration */
802 cur_scalar
+= resampleStep
* 2;
803 cur_scalar_1
+= resampleStep
* 2;
804 dCache
= dCache
+ (cur_scalar
>> FIXED_PRECISION
) * 2;
805 dCache_1
= dCache_1
+ (cur_scalar_1
>> FIXED_PRECISION
) * 2;
806 cur_scalar
&= FIXED_FRACTION_MASK
;
807 cur_scalar_1
&= FIXED_FRACTION_MASK
;
809 cur_frac
= _mm_add_epi32(cur_frac
, adder_frac_loop
);
811 *resampleOffset
+= resampleStep
* (toResample
- tail
);
813 /* This is the tail. */
814 for (i
= 0; i
< tail
; i
+= 1)
816 /* lerp, then convert to float value */
817 *resampleCache
++ = (float) (
819 (dCache
[2] - dCache
[0]) *
820 FIXED_TO_FLOAT(cur_scalar
)
822 *resampleCache
++ = (float) (
824 (dCache
[3] - dCache
[1]) *
825 FIXED_TO_FLOAT(cur_scalar
)
828 /* Increment fraction offset by the stepping value */
829 *resampleOffset
+= resampleStep
;
830 cur_scalar
+= resampleStep
;
832 /* Only increment the sample offset by integer values.
833 * Sometimes this will be 0 until cur accumulates
834 * enough steps, especially for "slow" rates.
836 dCache
+= (cur_scalar
>> FIXED_PRECISION
) * 2;
838 /* Now that any integer has been added, drop it.
839 * The offset pointer will preserve the total.
841 cur_scalar
&= FIXED_FRACTION_MASK
;
844 #endif /* HAVE_SSE2_INTRINSICS */
846 #if HAVE_NEON_INTRINSICS
/* NEON linear-interpolating resampler for one channel.
 *
 * dCache:         input samples; sample k+1 is read while producing
 *                 output from sample k
 * resampleCache:  receives toResample output samples
 * resampleOffset: running fixed-point position, advanced by
 *                 resampleStep for every output sample
 * resampleStep:   fixed-point input advance per output sample
 * toResample:     number of output samples to produce
 * channels:       unused; present so the signature matches the other
 *                 resamplers
 *
 * Work is split into three phases: a scalar header that runs until
 * resampleCache is 16-byte aligned, a vector loop producing 4 samples
 * per iteration, and a scalar tail for the last (toResample % 4)
 * samples.
 */
void FAudio_INTERNAL_ResampleMono_NEON(
	float *restrict dCache,
	float *restrict resampleCache,
	uint64_t *resampleOffset,
	uint64_t resampleStep,
	uint64_t toResample,
	uint8_t channels
) {
	uint32_t i, header, tail;
	uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
	float *dCache_1, *dCache_2, *dCache_3;
	uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
	float32x4_t one_over_fixed_one, half, current_next_0_1, current_next_2_3,
		current, next, sub, cur_fixed, mul, res;
	int32x4_t cur_frac, adder_frac, adder_frac_loop;

	(void) channels;

	/* This is the header, the Dest needs to be aligned to 16B */
	header = (16 - ((size_t) resampleCache) % 16) / 4;
	if (header == 4)
	{
		header = 0;
	}
	/* Never produce more header samples than were requested, otherwise
	 * the unsigned subtraction below wraps around and the vector loop
	 * runs far past the end of both buffers.
	 */
	if (header > toResample)
	{
		header = (uint32_t) toResample;
	}
	for (i = 0; i < header; i += 1)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[1] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION);

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}

	toResample -= header;

	/* Initialising the various cur values.
	 * cur_frac is the fractional part of cur for 4 samples. As the
	 * fractional part is a 32 bit unsigned value, it can be just added
	 * and the modulo operation for keeping the fractional part will be
	 * implicit. The 0.5 is for converting signed values to float (there
	 * is no unsigned convert); the 0.5 is added back later.
	 */
	cur_frac = vdupq_n_s32(
		(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
	);
	int32_t __attribute__((aligned(16))) data[4] =
	{
		0,
		(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
		(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
		(uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
	};
	adder_frac = vld1q_s32(data);
	cur_frac = vaddq_s32(cur_frac, adder_frac);

	/* The various cur_scalar values are for the different samples
	 * (1, 2, 3 compared to original cur_scalar = 0)
	 */
	cur_scalar_1 = cur_scalar + resampleStep;
	cur_scalar_2 = cur_scalar + resampleStep * 2;
	cur_scalar_3 = cur_scalar + resampleStep * 3;
	dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
	dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
	dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
	cur_scalar &= FIXED_FRACTION_MASK;
	cur_scalar_1 &= FIXED_FRACTION_MASK;
	cur_scalar_2 &= FIXED_FRACTION_MASK;
	cur_scalar_3 &= FIXED_FRACTION_MASK;

	/* Loop-invariant constants */
	one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
	half = vdupq_n_f32(0.5f);
	adder_frac_loop = vdupq_n_s32(
		(uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
	);

	tail = toResample % 4;
	for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
	{
		/* current_next holds 2 pairs of the sample and the sample + 1;
		 * after that we need to separate them.
		 */
		current_next_0_1 = vcombine_f32(
			vld1_f32(dCache),
			vld1_f32(dCache_1)
		);
		current_next_2_3 = vcombine_f32(
			vld1_f32(dCache_2),
			vld1_f32(dCache_3)
		);

		/* Unpack them to have separate current and next in 2 vectors. */
		current = vuzp1q_f32(current_next_0_1, current_next_2_3);
		next = vuzp2q_f32(current_next_0_1, current_next_2_3);

		sub = vsubq_f32(next, current);

		/* Convert the fractional part to float and then mul to get the fractions out.
		 * then add back the 0.5 we subtracted before.
		 */
		cur_fixed = vaddq_f32(
			vmulq_f32(
				vcvtq_f32_s32(cur_frac),
				one_over_fixed_one
			),
			half
		);
		mul = vmulq_f32(sub, cur_fixed);
		res = vaddq_f32(current, mul);

		/* Store the results */
		vst1q_f32(resampleCache, res);

		/* Update dCaches for next iteration */
		cur_scalar += resampleStep * 4;
		cur_scalar_1 += resampleStep * 4;
		cur_scalar_2 += resampleStep * 4;
		cur_scalar_3 += resampleStep * 4;
		dCache = dCache + (cur_scalar >> FIXED_PRECISION);
		dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
		dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
		dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
		cur_scalar &= FIXED_FRACTION_MASK;
		cur_scalar_1 &= FIXED_FRACTION_MASK;
		cur_scalar_2 &= FIXED_FRACTION_MASK;
		cur_scalar_3 &= FIXED_FRACTION_MASK;

		cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
	}
	*resampleOffset += resampleStep * (toResample - tail);

	/* This is the tail. */
	for (i = 0; i < tail; i += 1)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[1] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION);

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}
}
/* NEON linear resampler for interleaved stereo float data.
 *
 * dCache:         source frames, two floats per frame; every output frame
 *                 is a lerp between the frame at the read position and
 *                 the frame after it
 * resampleCache:  destination, receives toResample interleaved frames
 * resampleOffset: fixed-point read position, advanced by resampleStep for
 *                 every frame produced
 * resampleStep:   fixed-point increment per output frame
 * toResample:     number of output frames to produce
 * UNUSED:         ignored; presumably only present so the signature
 *                 matches the resampler callback type
 *
 * FIXED_PRECISION, FIXED_FRACTION_MASK, FIXED_ONE, FIXED_TO_FLOAT and
 * DOUBLE_TO_FIXED are fixed-point helpers defined outside this function.
 */
void FAudio_INTERNAL_ResampleStereo_NEON(
	float *restrict dCache,
	float *restrict resampleCache,
	uint64_t *resampleOffset,
	uint64_t resampleStep,
	uint64_t toResample,
	uint8_t UNUSED
) {
	uint32_t i, header, tail;
	uint64_t cur_scalar, cur_scalar_1;
	float *dCache_1;
	float32x4_t one_over_fixed_one, half, current, next, sub, cur_fixed, mul, res;
	int32x4_t cur_frac, adder_frac, adder_frac_loop;

	/* This is the header, the Dest needs to be aligned to 16B */
	header = (16 - ((size_t) resampleCache) % 16) / 8;
	if (header == 2)
	{
		header = 0;
	}

	/* Never emit more header frames than were requested, otherwise the
	 * subtraction from toResample below wraps around.
	 */
	if (header > toResample)
	{
		header = (uint32_t) toResample;
	}

	cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
	for (i = 0; i < header; i += 2)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[2] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);
		*resampleCache++ = (float) (
			dCache[1] +
			(dCache[3] - dCache[1]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION) * 2;

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}

	toResample -= header;

	/* Initialising the various cur values.
	 * cur_frac holds the fractional part of cur, biased by -0.5; the
	 * bias is added back as a float inside the loop. The mono
	 * resampler is said to carry the full explanation.
	 */
	cur_frac = vdupq_n_s32(
		(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
	);

	/* Lanes 0-1 belong to the first output frame (no extra step),
	 * lanes 2-3 to the second one (one step further along).
	 */
	int32_t __attribute__((aligned(16))) data[4] =
	{
		0,
		0,
		(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
		(uint32_t) (resampleStep & FIXED_FRACTION_MASK)
	};
	adder_frac = vld1q_s32(data);
	cur_frac = vaddq_s32(cur_frac, adder_frac);

	/* dCache_1 is the pointer for dCache in the next resample pos. */
	cur_scalar_1 = cur_scalar + resampleStep;
	dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
	cur_scalar_1 &= FIXED_FRACTION_MASK;

	one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
	half = vdupq_n_f32(0.5f);
	adder_frac_loop = vdupq_n_s32(
		(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
	);

	/* Two output frames (four floats) are produced per iteration */
	tail = toResample % 2;
	for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
	{
		/* current and next each hold 4 source samples used to build
		 * 4 destination samples:
		 * current = (cur_ch1, cur_ch2) of the 1st and 2nd position,
		 * next    = (next_ch1, next_ch2) of the 1st and 2nd position.
		 */
		current = vcombine_f32(
			vld1_f32(dCache), /* A1B1 */
			vld1_f32(dCache_1) /* A3B3 */
		);
		next = vcombine_f32(
			vld1_f32(dCache + 2), /* A2B2 */
			vld1_f32(dCache_1 + 2) /* A4B4 */
		);

		sub = vsubq_f32(next, current);

		/* Convert the biased fraction to float, scale it by
		 * 1 / FIXED_ONE and add the 0.5 back.
		 */
		cur_fixed = vaddq_f32(
			vmulq_f32(
				vcvtq_f32_s32(cur_frac),
				one_over_fixed_one
			),
			half
		);
		mul = vmulq_f32(sub, cur_fixed);
		res = vaddq_f32(current, mul);

		/* Store the results */
		vst1q_f32(resampleCache, res);

		/* Update dCaches for next iteration */
		cur_scalar += resampleStep * 2;
		cur_scalar_1 += resampleStep * 2;
		dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
		dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
		cur_scalar &= FIXED_FRACTION_MASK;
		cur_scalar_1 &= FIXED_FRACTION_MASK;

		cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
	}
	*resampleOffset += resampleStep * (toResample - tail);

	/* This is the tail. */
	for (i = 0; i < tail; i += 1)
	{
		/* lerp, then convert to float value */
		*resampleCache++ = (float) (
			dCache[0] +
			(dCache[2] - dCache[0]) *
			FIXED_TO_FLOAT(cur_scalar)
		);
		*resampleCache++ = (float) (
			dCache[1] +
			(dCache[3] - dCache[1]) *
			FIXED_TO_FLOAT(cur_scalar)
		);

		/* Increment fraction offset by the stepping value */
		*resampleOffset += resampleStep;
		cur_scalar += resampleStep;

		/* Only increment the sample offset by integer values.
		 * Sometimes this will be 0 until cur accumulates
		 * enough steps, especially for "slow" rates.
		 */
		dCache += (cur_scalar >> FIXED_PRECISION) * 2;

		/* Now that any integer has been added, drop it.
		 * The offset pointer will preserve the total.
		 */
		cur_scalar &= FIXED_FRACTION_MASK;
	}
}
1181 #endif /* HAVE_NEON_INTRINSICS */
/* SECTION 3: Amplifiers */
1185 #if NEED_SCALAR_CONVERTER_FALLBACKS
/* Plain C amplifier: scales the first totalSamples floats of output by
 * volume, in place.
 */
void FAudio_INTERNAL_Amplify_Scalar(
	float* output,
	uint32_t totalSamples,
	float volume
) {
	uint32_t i;
	for (i = 0; i < totalSamples; i += 1)
	{
		output[i] *= volume;
	}
}
1197 #endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
/* The SSE2 version of the amplifier comes from @8thMage! */
1201 #if HAVE_SSE2_INTRINSICS
/* SSE2 amplifier: scales the first totalSamples floats of output by
 * volume, in place.
 *
 * The buffer is processed in three runs:
 *   header - scalar, up to the first 16-byte aligned element
 *   body   - aligned loads/stores, four floats at a time
 *   tail   - scalar, whatever is left after the last full vector
 */
void FAudio_INTERNAL_Amplify_SSE2(
	float* output,
	uint32_t totalSamples,
	float volume
) {
	uint32_t i;
	uint32_t header = (16 - (((size_t) output) % 16)) / 4;
	uint32_t tail;
	__m128 volumeVec, outVec;

	/* Already aligned, no scalar lead-in needed */
	if (header == 4)
	{
		header = 0;
	}

	/* A short buffer can end before the alignment boundary. Clamp so
	 * the header loop stays in bounds and the unsigned subtraction
	 * below cannot wrap.
	 */
	if (header > totalSamples)
	{
		header = totalSamples;
	}
	tail = (totalSamples - header) % 4;

	for (i = 0; i < header; i += 1)
	{
		output[i] *= volume;
	}

	volumeVec = _mm_set1_ps(volume);
	for (i = header; i < totalSamples - tail; i += 4)
	{
		outVec = _mm_load_ps(output + i);
		outVec = _mm_mul_ps(outVec, volumeVec);
		_mm_store_ps(output + i, outVec);
	}

	for (i = totalSamples - tail; i < totalSamples; i += 1)
	{
		output[i] *= volume;
	}
}
1238 #endif /* HAVE_SSE2_INTRINSICS */
#if HAVE_NEON_INTRINSICS
/* NEON amplifier: scales the first totalSamples floats of output by
 * volume, in place.
 *
 * The buffer is processed in three runs:
 *   header - scalar, up to the first 16-byte aligned element
 *   body   - four floats per vector multiply
 *   tail   - scalar, whatever is left after the last full vector
 */
void FAudio_INTERNAL_Amplify_NEON(
	float* output,
	uint32_t totalSamples,
	float volume
) {
	uint32_t i;
	uint32_t header = (16 - (((size_t) output) % 16)) / 4;
	uint32_t tail;
	float32x4_t volumeVec, outVec;

	/* Already aligned, no scalar lead-in needed */
	if (header == 4)
	{
		header = 0;
	}

	/* A short buffer can end before the alignment boundary. Clamp so
	 * the header loop stays in bounds and the unsigned subtraction
	 * below cannot wrap.
	 */
	if (header > totalSamples)
	{
		header = totalSamples;
	}
	tail = (totalSamples - header) % 4;

	for (i = 0; i < header; i += 1)
	{
		output[i] *= volume;
	}

	volumeVec = vdupq_n_f32(volume);
	for (i = header; i < totalSamples - tail; i += 4)
	{
		outVec = vld1q_f32(output + i);
		outVec = vmulq_f32(outVec, volumeVec);
		vst1q_f32(output + i, outVec);
	}

	for (i = totalSamples - tail; i < totalSamples; i += 1)
	{
		output[i] *= volume;
	}
}
#endif /* HAVE_NEON_INTRINSICS */
/* SECTION 4: Mixer Functions */
/* Generic matrix mixer, plain C.
 *
 * For each of the toMix frames, every destination channel accumulates the
 * sum of all source channels weighted by the mix matrix:
 *   dst[co] += src[ci] * coefficients[co * srcChans + ci]
 *
 * src holds toMix frames of srcChans floats, dst holds toMix frames of
 * dstChans floats and is added to, not overwritten.
 */
void FAudio_INTERNAL_Mix_Generic_Scalar(
	uint32_t toMix,
	uint32_t srcChans,
	uint32_t dstChans,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t i, co, ci;
	for (i = 0; i < toMix; i += 1, src += srcChans, dst += dstChans)
	{
		for (co = 0; co < dstChans; co += 1)
		{
			for (ci = 0; ci < srcChans; ci += 1)
			{
				dst[co] += (
					src[ci] *
					coefficients[co * srcChans + ci]
				);
			}
		}
	}
}
1303 #if HAVE_SSE2_INTRINSICS
/* SSE horizontal add by Peter Cordes, CC-BY-SA.
 * From https://stackoverflow.com/a/35270026
 *
 * Returns the sum of the four float lanes of v.
 */
static inline float FAudio_simd_hadd(__m128 v)
{
	/* Swap neighbouring lanes so the add yields pairwise sums */
	__m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 sums = _mm_add_ps(v, shuf);

	/* Bring the upper pair sum down to lane 0 and add it in */
	shuf = _mm_movehl_ps(shuf, sums);
	sums = _mm_add_ss(sums, shuf);
	return _mm_cvtss_f32(sums);
}

/* Generic matrix mixer, SSE2.
 *
 * Same contract as the scalar generic mixer: for each of the toMix frames
 *   dst[co] += src[ci] * coefficients[co * srcChans + ci]
 * for every destination/source channel pair. Source channels are handled
 * four at a time with unaligned loads; any remainder is done in scalar.
 */
void FAudio_INTERNAL_Mix_Generic_SSE2(
	uint32_t toMix,
	uint32_t srcChans,
	uint32_t dstChans,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t i, co, ci;
	for (i = 0; i < toMix; i += 1, src += srcChans, dst += dstChans)
	{
		for (co = 0; co < dstChans; co += 1)
		{
			/* Four source channels per step */
			for (ci = 0; srcChans - ci >= 4; ci += 4)
			{
				const __m128 vols = _mm_loadu_ps(&coefficients[co * srcChans + ci]);
				const __m128 dat = _mm_loadu_ps(&src[ci]);
				dst[co] += FAudio_simd_hadd(_mm_mul_ps(dat, vols));
			}

			/* Leftover source channels, one at a time */
			for (; ci < srcChans; ci += 1)
			{
				dst[co] += (
					src[ci] *
					coefficients[co * srcChans + ci]
				);
			}
		}
	}
}
1345 #endif /* HAVE_SSE2_INTRINSICS */
/* Mixes toMix mono frames into a mono destination:
 *   dst[0] += src[0] * coefficients[0]
 * The two channel-count arguments are ignored; the counts are fixed by
 * the function itself.
 */
void FAudio_INTERNAL_Mix_1in_1out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t i;
	for (i = 0; i < toMix; i += 1, src += 1, dst += 1)
	{
		/* Base source data, combined with the coefficients */
		dst[0] += src[0] * coefficients[0];
	}
}
/* Mixes toMix mono frames into a 2-channel destination:
 *   dst[co] += src[0] * coefficients[co]   for co in 0..1
 * The two channel-count arguments are ignored; the counts are fixed by
 * the function itself.
 */
void FAudio_INTERNAL_Mix_1in_2out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t i;
	for (i = 0; i < toMix; i += 1, src += 1, dst += 2)
	{
		const float in0 = src[0];

		dst[0] += in0 * coefficients[0];
		dst[1] += in0 * coefficients[1];
	}
}
/* Mixes toMix mono frames into a 6-channel destination:
 *   dst[co] += src[0] * coefficients[co]   for co in 0..5
 * The two channel-count arguments are ignored; the counts are fixed by
 * the function itself.
 */
void FAudio_INTERNAL_Mix_1in_6out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t i;
	for (i = 0; i < toMix; i += 1, src += 1, dst += 6)
	{
		const float in0 = src[0];

		dst[0] += in0 * coefficients[0];
		dst[1] += in0 * coefficients[1];
		dst[2] += in0 * coefficients[2];
		dst[3] += in0 * coefficients[3];
		dst[4] += in0 * coefficients[4];
		dst[5] += in0 * coefficients[5];
	}
}
/* Mixes toMix mono frames into an 8-channel destination:
 *   dst[co] += src[0] * coefficients[co]   for co in 0..7
 * The two channel-count arguments are ignored; the counts are fixed by
 * the function itself.
 */
void FAudio_INTERNAL_Mix_1in_8out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t i;
	for (i = 0; i < toMix; i += 1, src += 1, dst += 8)
	{
		const float in0 = src[0];

		dst[0] += in0 * coefficients[0];
		dst[1] += in0 * coefficients[1];
		dst[2] += in0 * coefficients[2];
		dst[3] += in0 * coefficients[3];
		dst[4] += in0 * coefficients[4];
		dst[5] += in0 * coefficients[5];
		dst[6] += in0 * coefficients[6];
		dst[7] += in0 * coefficients[7];
	}
}
/* Mixes toMix 2-channel frames down into a mono destination:
 *   dst[0] += src[0] * coefficients[0] + src[1] * coefficients[1]
 * The two channel-count arguments are ignored; the counts are fixed by
 * the function itself.
 */
void FAudio_INTERNAL_Mix_2in_1out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t i;
	for (i = 0; i < toMix; i += 1, src += 2, dst += 1)
	{
		/* Base source data, combined with the coefficients */
		dst[0] += (
			(src[0] * coefficients[0]) +
			(src[1] * coefficients[1])
		);
	}
}
/* Mixes toMix 2-channel frames into a 2-channel destination:
 *   dst[co] += src[0] * coefficients[co * 2] +
 *              src[1] * coefficients[co * 2 + 1]   for co in 0..1
 * The two channel-count arguments are ignored; the counts are fixed by
 * the function itself.
 */
void FAudio_INTERNAL_Mix_2in_2out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t i;
	for (i = 0; i < toMix; i += 1, src += 2, dst += 2)
	{
		const float in0 = src[0];
		const float in1 = src[1];

		dst[0] += (in0 * coefficients[0]) + (in1 * coefficients[1]);
		dst[1] += (in0 * coefficients[2]) + (in1 * coefficients[3]);
	}
}
/* Mixes toMix 2-channel frames into a 6-channel destination:
 *   dst[co] += src[0] * coefficients[co * 2] +
 *              src[1] * coefficients[co * 2 + 1]   for co in 0..5
 * The two channel-count arguments are ignored; the counts are fixed by
 * the function itself.
 */
void FAudio_INTERNAL_Mix_2in_6out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t i;
	for (i = 0; i < toMix; i += 1, src += 2, dst += 6)
	{
		const float in0 = src[0];
		const float in1 = src[1];

		dst[0] += (in0 * coefficients[0]) + (in1 * coefficients[1]);
		dst[1] += (in0 * coefficients[2]) + (in1 * coefficients[3]);
		dst[2] += (in0 * coefficients[4]) + (in1 * coefficients[5]);
		dst[3] += (in0 * coefficients[6]) + (in1 * coefficients[7]);
		dst[4] += (in0 * coefficients[8]) + (in1 * coefficients[9]);
		dst[5] += (in0 * coefficients[10]) + (in1 * coefficients[11]);
	}
}
/* Mixes toMix 2-channel frames into an 8-channel destination:
 *   dst[co] += src[0] * coefficients[co * 2] +
 *              src[1] * coefficients[co * 2 + 1]   for co in 0..7
 * The two channel-count arguments are ignored; the counts are fixed by
 * the function itself.
 */
void FAudio_INTERNAL_Mix_2in_8out_Scalar(
	uint32_t toMix,
	uint32_t UNUSED1,
	uint32_t UNUSED2,
	float *restrict src,
	float *restrict dst,
	float *restrict coefficients
) {
	uint32_t i;
	for (i = 0; i < toMix; i += 1, src += 2, dst += 8)
	{
		const float in0 = src[0];
		const float in1 = src[1];

		dst[0] += (in0 * coefficients[0]) + (in1 * coefficients[1]);
		dst[1] += (in0 * coefficients[2]) + (in1 * coefficients[3]);
		dst[2] += (in0 * coefficients[4]) + (in1 * coefficients[5]);
		dst[3] += (in0 * coefficients[6]) + (in1 * coefficients[7]);
		dst[4] += (in0 * coefficients[8]) + (in1 * coefficients[9]);
		dst[5] += (in0 * coefficients[10]) + (in1 * coefficients[11]);
		dst[6] += (in0 * coefficients[12]) + (in1 * coefficients[13]);
		dst[7] += (in0 * coefficients[14]) + (in1 * coefficients[15]);
	}
}
/* SECTION 5: InitSIMDFunctions. Assigns based on SSE2/NEON support. */
1548 void (*FAudio_INTERNAL_Convert_U8_To_F32
)(
1549 const uint8_t *restrict src
,
1550 float *restrict dst
,
1553 void (*FAudio_INTERNAL_Convert_S16_To_F32
)(
1554 const int16_t *restrict src
,
1555 float *restrict dst
,
1558 void (*FAudio_INTERNAL_Convert_S32_To_F32
)(
1559 const int32_t *restrict src
,
1560 float *restrict dst
,
1564 FAudioResampleCallback FAudio_INTERNAL_ResampleMono
;
1565 FAudioResampleCallback FAudio_INTERNAL_ResampleStereo
;
1567 void (*FAudio_INTERNAL_Amplify
)(
1569 uint32_t totalSamples
,
1573 FAudioMixCallback FAudio_INTERNAL_Mix_Generic
;
/* Points the dispatch pointers at one implementation set.
 *
 * hasSSE2: nonzero if SSE2 may be used at runtime
 * hasNEON: nonzero if NEON may be used at runtime
 *
 * The first branch that is both compiled in and enabled at runtime wins:
 * SSE2, then NEON, then the scalar fallbacks. If nothing applies and the
 * scalar fallbacks were not compiled in, the assertion fires.
 */
void FAudio_INTERNAL_InitSIMDFunctions(uint8_t hasSSE2, uint8_t hasNEON)
{
#if HAVE_SSE2_INTRINSICS
	if (hasSSE2)
	{
		FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_SSE2;
		FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_SSE2;
		FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_SSE2;
		FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_SSE2;
		FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_SSE2;
		FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_SSE2;
		FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_SSE2;
		return;
	}
#endif
#if HAVE_NEON_INTRINSICS
	if (hasNEON)
	{
		FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_NEON;
		FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_NEON;
		FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_NEON;
		FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_NEON;
		FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_NEON;
		FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_NEON;
		/* The generic mixer stays on the scalar version here */
		FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_Scalar;
		return;
	}
#endif
#if NEED_SCALAR_CONVERTER_FALLBACKS
	FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_Scalar;
	FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_Scalar;
	FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_Scalar;
	FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_Scalar;
	FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_Scalar;
	FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_Scalar;
	FAudio_INTERNAL_Mix_Generic = FAudio_INTERNAL_Mix_Generic_Scalar;
#else
	FAudio_assert(0 && "Need converter functions!");
#endif
}
/* vim: set noexpandtab shiftwidth=8 tabstop=8: */