2 Copyright (C) 2000 Paul Davis
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 #define _ISOC9X_SOURCE 1
21 #define _ISOC99_SOURCE 1
23 #define __USE_ISOC9X 1
24 #define __USE_ISOC99 1
38 #if defined (__SSE2__) && !defined (__sun__)
39 #include <emmintrin.h>
41 #include <smmintrin.h>
45 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
49 /* Notes about these *_SCALING values.
51 the MAX_<N>BIT values are floating point. when multiplied by
52 a full-scale normalized floating point sample value (-1.0..+1.0)
53 they should give the maximum value representable with an integer
54 sample type of N bits. Note that this is asymmetric. Sample ranges
55 for signed integer, 2's complement values are -(2^(N-1) to +(2^(N-1)-1)
59 If we use +2^(N-1) for the scaling factors, we run into a problem:
61 if we start with a normalized float value of -1.0, scaling
62 to 24 bits would give -8388608 (-2^23), which is ideal.
63 But with +1.0, we get +8388608, which is technically out of range.
65 We never multiply a full range normalized value by this constant,
66 but we could multiply it by a positive value that is close enough to +1.0
67 to produce a value > +(2^(N-1)-1.
69 There is no way around this paradox without wasting CPU cycles to determine
70 which scaling factor to use (i.e. determine if its negative or not,
71 use the right factor).
73 So, for now (October 2008) we use 2^(N-1)-1 as the scaling factor.
76 #define SAMPLE_24BIT_SCALING 8388607.0f
77 #define SAMPLE_16BIT_SCALING 32767.0f
79 /* these are just values to use if the floating point value was out of range
81 advice from Fons Adriaensen: make the limits symmetrical
84 #define SAMPLE_24BIT_MAX 8388607
85 #define SAMPLE_24BIT_MIN -8388607
86 #define SAMPLE_24BIT_MAX_F 8388607.0f
87 #define SAMPLE_24BIT_MIN_F -8388607.0f
89 #define SAMPLE_16BIT_MAX 32767
90 #define SAMPLE_16BIT_MIN -32767
91 #define SAMPLE_16BIT_MAX_F 32767.0f
92 #define SAMPLE_16BIT_MIN_F -32767.0f
94 /* these mark the outer edges of the range considered "within" range
95 for a floating point sample value. values outside (and on the boundaries)
96 of this range will be clipped before conversion; values within this
97 range will be scaled to appropriate values for the target sample
101 #define NORMALIZED_FLOAT_MIN -1.0f
102 #define NORMALIZED_FLOAT_MAX 1.0f
104 /* define this in case we end up on a platform that is missing
105 the real lrintf functions
108 #define f_round(f) lrintf(f)
110 #define float_16(s, d)\
111 if ((s) <= NORMALIZED_FLOAT_MIN) {\
112 (d) = SAMPLE_16BIT_MIN;\
113 } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
114 (d) = SAMPLE_16BIT_MAX;\
116 (d) = f_round ((s) * SAMPLE_16BIT_SCALING);\
119 /* call this when "s" has already been scaled (e.g. when dithering)
122 #define float_16_scaled(s, d)\
123 if ((s) <= SAMPLE_16BIT_MIN_F) {\
124 (d) = SAMPLE_16BIT_MIN_F;\
125 } else if ((s) >= SAMPLE_16BIT_MAX_F) { \
126 (d) = SAMPLE_16BIT_MAX;\
128 (d) = f_round ((s));\
131 #define float_24u32(s, d) \
132 if ((s) <= NORMALIZED_FLOAT_MIN) {\
133 (d) = SAMPLE_24BIT_MIN << 8;\
134 } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
135 (d) = SAMPLE_24BIT_MAX << 8;\
137 (d) = f_round ((s) * SAMPLE_24BIT_SCALING) << 8;\
140 /* call this when "s" has already been scaled (e.g. when dithering)
143 #define float_24u32_scaled(s, d)\
144 if ((s) <= SAMPLE_24BIT_MIN_F) {\
145 (d) = SAMPLE_24BIT_MIN << 8;\
146 } else if ((s) >= SAMPLE_24BIT_MAX_F) { \
147 (d) = SAMPLE_24BIT_MAX << 8; \
149 (d) = f_round ((s)) << 8; \
152 #define float_24(s, d) \
153 if ((s) <= NORMALIZED_FLOAT_MIN) {\
154 (d) = SAMPLE_24BIT_MIN;\
155 } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
156 (d) = SAMPLE_24BIT_MAX;\
158 (d) = f_round ((s) * SAMPLE_24BIT_SCALING);\
161 /* call this when "s" has already been scaled (e.g. when dithering)
164 #define float_24_scaled(s, d)\
165 if ((s) <= SAMPLE_24BIT_MIN_F) {\
166 (d) = SAMPLE_24BIT_MIN;\
167 } else if ((s) >= SAMPLE_24BIT_MAX_F) { \
168 (d) = SAMPLE_24BIT_MAX; \
170 (d) = f_round ((s)); \
174 #if defined (__SSE2__) && !defined (__sun__)
176 /* generates same as _mm_set_ps(1.f, 1.f, 1f., 1f) but faster */
177 static inline __m128
gen_one(void)
179 volatile __m128i x
= { 0 }; /* shut up, GCC */
180 __m128i ones
= _mm_cmpeq_epi32(x
, x
);
181 return (__m128
)_mm_slli_epi32 (_mm_srli_epi32(ones
, 25), 23);
184 static inline __m128
clip(__m128 s
, __m128 min
, __m128 max
)
186 return _mm_min_ps(max
, _mm_max_ps(s
, min
));
189 static inline __m128i
float_24_sse(__m128 s
)
191 const __m128 upper_bound
= gen_one(); /* NORMALIZED_FLOAT_MAX */
192 const __m128 lower_bound
= _mm_sub_ps(_mm_setzero_ps(), upper_bound
);
194 __m128 clipped
= clip(s
, lower_bound
, upper_bound
);
195 __m128 scaled
= _mm_mul_ps(clipped
, _mm_set1_ps(SAMPLE_24BIT_SCALING
));
196 return _mm_cvtps_epi32(scaled
);
#if defined (__ARM_NEON__) || defined (__ARM_NEON)

/* element-wise clamp of s to [min, max] */
static inline float32x4_t clip(float32x4_t s, float32x4_t min, float32x4_t max)
{
	return vminq_f32(max, vmaxq_f32(s, min));
}

/* clip four normalized floats to [-1, +1], scale to 24-bit integer
   range and convert (truncating conversion) */
static inline int32x4_t float_24_neon(float32x4_t s)
{
	const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
	const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);

	float32x4_t clipped = clip(s, lower_bound, upper_bound);
	float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_24BIT_SCALING));
	return vcvtq_s32_f32(scaled);
}

/* clip four normalized floats to [-1, +1], scale to 16-bit integer
   range, convert and narrow to 16-bit lanes */
static inline int16x4_t float_16_neon(float32x4_t s)
{
	const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
	const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);

	float32x4_t clipped = clip(s, lower_bound, upper_bound);
	float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_16BIT_SCALING));
	return vmovn_s32(vcvtq_s32_f32(scaled));
}
#endif
/* Linear Congruential noise generator. From the music-dsp list
 * less random than rand(), but good enough and 10x faster */
static unsigned int seed = 22222;

/* advance the LCG state and return it; NOTE: the visible original text
   was missing the return statement (non-void function) — restored */
static inline unsigned int fast_rand() {
	seed = (seed * 196314165) + 907633515;
	return seed;
}
239 /* functions for native float sample data */
241 void sample_move_floatLE_sSs (jack_default_audio_sample_t
*dst
, char *src
, unsigned long nsamples
, unsigned long src_skip
) {
243 *dst
= *((float *) src
);
249 void sample_move_dS_floatLE (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
) {
251 *((float *) dst
) = *src
;
257 /* NOTES on function naming:
259 foo_bar_d<TYPE>_s<TYPE>
261 the "d<TYPE>" component defines the destination type for the operation
262 the "s<TYPE>" component defines the source type for the operation
266 S - sample is a jack_default_audio_sample_t, currently (October 2008) a 32 bit floating point value
267 Ss - like S but reverse endian from the host CPU
268 32u24 - sample is a signed 32 bit integer value, but data is in upper 24 bits only
269 32u24s - like 32u24 but reverse endian from the host CPU
270 24 - sample is a signed 24 bit integer value
271 24s - like 24 but reverse endian from the host CPU
272 16 - sample is a signed 16 bit integer value
273 16s - like 16 but reverse endian from the host CPU
275 For obvious reasons, the reverse endian versions only show as source types.
277 This covers all known sample formats at 16 bits or larger.
280 /* functions for native integer sample data */
282 void sample_move_d32u24_sSs (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
284 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
285 unsigned long unrolled
= nsamples
/ 4;
286 nsamples
= nsamples
& 3;
289 float32x4_t samples
= vld1q_f32(src
);
290 int32x4_t converted
= float_24_neon(samples
);
291 int32x4_t shifted
= vshlq_n_s32(converted
, 8);
292 shifted
= vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(shifted
)));
296 vst1q_s32((int32_t*)dst
, shifted
);
299 vst1q_lane_s32((int32_t*)(dst
), shifted
, 0);
300 vst1q_lane_s32((int32_t*)(dst
+dst_skip
), shifted
, 1);
301 vst1q_lane_s32((int32_t*)(dst
+2*dst_skip
), shifted
, 2);
302 vst1q_lane_s32((int32_t*)(dst
+3*dst_skip
), shifted
, 3);
314 float_24u32 (*src
, z
);
316 #if __BYTE_ORDER == __LITTLE_ENDIAN
317 dst
[0]=(char)(z
>>24);
318 dst
[1]=(char)(z
>>16);
321 #elif __BYTE_ORDER == __BIG_ENDIAN
324 dst
[2]=(char)(z
>>16);
325 dst
[3]=(char)(z
>>24);
332 void sample_move_d32u24_sS (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
334 #if defined (__SSE2__) && !defined (__sun__)
335 __m128 int_max
= _mm_set1_ps(SAMPLE_24BIT_MAX_F
);
336 __m128 int_min
= _mm_sub_ps(_mm_setzero_ps(), int_max
);
337 __m128 factor
= int_max
;
339 unsigned long unrolled
= nsamples
/ 4;
340 nsamples
= nsamples
& 3;
343 __m128 in
= _mm_load_ps(src
);
344 __m128 scaled
= _mm_mul_ps(in
, factor
);
345 __m128 clipped
= clip(scaled
, int_min
, int_max
);
347 __m128i y
= _mm_cvttps_epi32(clipped
);
348 __m128i shifted
= _mm_slli_epi32(y
, 8);
351 *(int32_t*)dst
= _mm_extract_epi32(shifted
, 0);
352 *(int32_t*)(dst
+dst_skip
) = _mm_extract_epi32(shifted
, 1);
353 *(int32_t*)(dst
+2*dst_skip
) = _mm_extract_epi32(shifted
, 2);
354 *(int32_t*)(dst
+3*dst_skip
) = _mm_extract_epi32(shifted
, 3);
356 __m128i shuffled1
= _mm_shuffle_epi32(shifted
, _MM_SHUFFLE(0, 3, 2, 1));
357 __m128i shuffled2
= _mm_shuffle_epi32(shifted
, _MM_SHUFFLE(1, 0, 3, 2));
358 __m128i shuffled3
= _mm_shuffle_epi32(shifted
, _MM_SHUFFLE(2, 1, 0, 3));
360 _mm_store_ss((float*)dst
, (__m128
)shifted
);
362 _mm_store_ss((float*)(dst
+dst_skip
), (__m128
)shuffled1
);
363 _mm_store_ss((float*)(dst
+2*dst_skip
), (__m128
)shuffled2
);
364 _mm_store_ss((float*)(dst
+3*dst_skip
), (__m128
)shuffled3
);
372 __m128 in
= _mm_load_ss(src
);
373 __m128 scaled
= _mm_mul_ss(in
, factor
);
374 __m128 clipped
= _mm_min_ss(int_max
, _mm_max_ss(scaled
, int_min
));
376 int y
= _mm_cvttss_si32(clipped
);
377 *((int *) dst
) = y
<<8;
383 #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
384 unsigned long unrolled
= nsamples
/ 4;
385 nsamples
= nsamples
& 3;
388 float32x4_t samples
= vld1q_f32(src
);
389 int32x4_t converted
= float_24_neon(samples
);
390 int32x4_t shifted
= vshlq_n_s32(converted
, 8);
394 vst1q_s32((int32_t*)dst
, shifted
);
397 vst1q_lane_s32((int32_t*)(dst
), shifted
, 0);
398 vst1q_lane_s32((int32_t*)(dst
+dst_skip
), shifted
, 1);
399 vst1q_lane_s32((int32_t*)(dst
+2*dst_skip
), shifted
, 2);
400 vst1q_lane_s32((int32_t*)(dst
+3*dst_skip
), shifted
, 3);
409 #if !defined (__SSE2__)
411 float_24u32 (*src
, *((int32_t*) dst
));
418 void sample_move_dS_s32u24s (jack_default_audio_sample_t
*dst
, char *src
, unsigned long nsamples
, unsigned long src_skip
)
420 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
421 float32x4_t factor
= vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING
);
422 unsigned long unrolled
= nsamples
/ 4;
428 src128
= vld1q_s32((int32_t*)src
);
431 src128
= vld2q_s32((int32_t*)src
).val
[0];
434 src128
= vld1q_lane_s32((int32_t*)src
, src128
, 0);
435 src128
= vld1q_lane_s32((int32_t*)(src
+src_skip
), src128
, 1);
436 src128
= vld1q_lane_s32((int32_t*)(src
+2*src_skip
), src128
, 2);
437 src128
= vld1q_lane_s32((int32_t*)(src
+3*src_skip
), src128
, 3);
440 src128
= vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(src128
)));
441 int32x4_t shifted
= vshrq_n_s32(src128
, 8);
442 float32x4_t as_float
= vcvtq_f32_s32(shifted
);
443 float32x4_t divided
= vmulq_f32(as_float
, factor
);
444 vst1q_f32(dst
, divided
);
449 nsamples
= nsamples
& 3;
452 /* ALERT: signed sign-extension portability !!! */
454 const jack_default_audio_sample_t scaling
= 1.0/SAMPLE_24BIT_SCALING
;
458 #if __BYTE_ORDER == __LITTLE_ENDIAN
459 x
= (unsigned char)(src
[0]);
461 x
|= (unsigned char)(src
[1]);
463 x
|= (unsigned char)(src
[2]);
465 x
|= (unsigned char)(src
[3]);
466 #elif __BYTE_ORDER == __BIG_ENDIAN
467 x
= (unsigned char)(src
[3]);
469 x
|= (unsigned char)(src
[2]);
471 x
|= (unsigned char)(src
[1]);
473 x
|= (unsigned char)(src
[0]);
475 *dst
= (x
>> 8) * scaling
;
481 void sample_move_dS_s32u24 (jack_default_audio_sample_t
*dst
, char *src
, unsigned long nsamples
, unsigned long src_skip
)
483 #if defined (__SSE2__) && !defined (__sun__)
484 unsigned long unrolled
= nsamples
/ 4;
485 static float inv_sample_max_24bit
= 1.0 / SAMPLE_24BIT_SCALING
;
486 __m128 factor
= _mm_set1_ps(inv_sample_max_24bit
);
489 int i1
= *((int *) src
);
491 int i2
= *((int *) src
);
493 int i3
= *((int *) src
);
495 int i4
= *((int *) src
);
498 __m128i src
= _mm_set_epi32(i4
, i3
, i2
, i1
);
499 __m128i shifted
= _mm_srai_epi32(src
, 8);
501 __m128 as_float
= _mm_cvtepi32_ps(shifted
);
502 __m128 divided
= _mm_mul_ps(as_float
, factor
);
504 _mm_storeu_ps(dst
, divided
);
508 nsamples
= nsamples
& 3;
509 #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
510 unsigned long unrolled
= nsamples
/ 4;
511 float32x4_t factor
= vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING
);
516 src128
= vld1q_s32((int32_t*)src
);
519 src128
= vld2q_s32((int32_t*)src
).val
[0];
522 src128
= vld1q_lane_s32((int32_t*)src
, src128
, 0);
523 src128
= vld1q_lane_s32((int32_t*)(src
+src_skip
), src128
, 1);
524 src128
= vld1q_lane_s32((int32_t*)(src
+2*src_skip
), src128
, 2);
525 src128
= vld1q_lane_s32((int32_t*)(src
+3*src_skip
), src128
, 3);
528 int32x4_t shifted
= vshrq_n_s32(src128
, 8);
529 float32x4_t as_float
= vcvtq_f32_s32(shifted
);
530 float32x4_t divided
= vmulq_f32(as_float
, factor
);
531 vst1q_f32(dst
, divided
);
536 nsamples
= nsamples
& 3;
539 /* ALERT: signed sign-extension portability !!! */
541 const jack_default_audio_sample_t scaling
= 1.0/SAMPLE_24BIT_SCALING
;
543 *dst
= (*((int *) src
) >> 8) * scaling
;
549 void sample_move_d24_sSs (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
551 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
552 unsigned long unrolled
= nsamples
/ 4;
556 float32x4_t samples
= vld1q_f32(src
);
557 int32x4_t converted
= float_24_neon(samples
);
558 converted
= vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted
)));
559 vst1q_s32(z
, converted
);
561 for (i
= 0; i
!= 4; ++i
) {
562 memcpy (dst
, ((char*)(z
+i
))+1, 3);
567 nsamples
= nsamples
& 3;
574 #if __BYTE_ORDER == __LITTLE_ENDIAN
575 dst
[0]=(char)(z
>>16);
578 #elif __BYTE_ORDER == __BIG_ENDIAN
581 dst
[2]=(char)(z
>>16);
588 void sample_move_d24_sS (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
590 #if defined (__SSE2__) && !defined (__sun__)
591 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST
);
592 while (nsamples
>= 4) {
595 __m128 samples
= _mm_loadu_ps(src
);
596 __m128i converted
= float_24_sse(samples
);
599 z
[0] = _mm_extract_epi32(converted
, 0);
600 z
[1] = _mm_extract_epi32(converted
, 1);
601 z
[2] = _mm_extract_epi32(converted
, 2);
602 z
[3] = _mm_extract_epi32(converted
, 3);
604 __m128i shuffled1
= _mm_shuffle_epi32(converted
, _MM_SHUFFLE(0, 3, 2, 1));
605 __m128i shuffled2
= _mm_shuffle_epi32(converted
, _MM_SHUFFLE(1, 0, 3, 2));
606 __m128i shuffled3
= _mm_shuffle_epi32(converted
, _MM_SHUFFLE(2, 1, 0, 3));
608 _mm_store_ss((float*)z
, (__m128
)converted
);
609 _mm_store_ss((float*)z
+1, (__m128
)shuffled1
);
610 _mm_store_ss((float*)z
+2, (__m128
)shuffled2
);
611 _mm_store_ss((float*)z
+3, (__m128
)shuffled3
);
614 for (i
= 0; i
!= 4; ++i
) {
615 memcpy (dst
, z
+i
, 3);
622 #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
623 unsigned long unrolled
= nsamples
/ 4;
627 float32x4_t samples
= vld1q_f32(src
);
628 int32x4_t converted
= float_24_neon(samples
);
629 vst1q_s32(z
, converted
);
631 for (i
= 0; i
!= 4; ++i
) {
632 memcpy (dst
, z
+i
, 3);
637 nsamples
= nsamples
& 3;
644 #if __BYTE_ORDER == __LITTLE_ENDIAN
646 #elif __BYTE_ORDER == __BIG_ENDIAN
647 memcpy (dst
, (char *)&z
+ 1, 3);
654 void sample_move_dS_s24s (jack_default_audio_sample_t
*dst
, char *src
, unsigned long nsamples
, unsigned long src_skip
)
656 const jack_default_audio_sample_t scaling
= 1.0/SAMPLE_24BIT_SCALING
;
658 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
659 // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
660 const float32x4_t vscaling
= vdupq_n_f32(scaling
/256.0);
662 memset(x
, 0, sizeof(x
));
663 unsigned long unrolled
= nsamples
/ 4;
665 #if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
666 // right aligned / inverse sequence below -> *256
667 memcpy(((char*)&x
[0])+1, src
, 3);
668 memcpy(((char*)&x
[1])+1, src
+src_skip
, 3);
669 memcpy(((char*)&x
[2])+1, src
+2*src_skip
, 3);
670 memcpy(((char*)&x
[3])+1, src
+3*src_skip
, 3);
672 memcpy(&x
[0], src
, 3);
673 memcpy(&x
[1], src
+src_skip
, 3);
674 memcpy(&x
[2], src
+2*src_skip
, 3);
675 memcpy(&x
[3], src
+3*src_skip
, 3);
679 int32x4_t source
= vld1q_s32(x
);
680 source
= vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(source
)));
681 float32x4_t converted
= vcvtq_f32_s32(source
);
682 float32x4_t scaled
= vmulq_f32(converted
, vscaling
);
683 vst1q_f32(dst
, scaled
);
686 nsamples
= nsamples
& 3;
689 /* ALERT: signed sign-extension portability !!! */
693 #if __BYTE_ORDER == __LITTLE_ENDIAN
694 x
= (unsigned char)(src
[0]);
696 x
|= (unsigned char)(src
[1]);
698 x
|= (unsigned char)(src
[2]);
699 /* correct sign bit and the rest of the top byte */
703 #elif __BYTE_ORDER == __BIG_ENDIAN
704 x
= (unsigned char)(src
[2]);
706 x
|= (unsigned char)(src
[1]);
708 x
|= (unsigned char)(src
[0]);
709 /* correct sign bit and the rest of the top byte */
720 void sample_move_dS_s24 (jack_default_audio_sample_t
*dst
, char *src
, unsigned long nsamples
, unsigned long src_skip
)
722 const jack_default_audio_sample_t scaling
= 1.f
/SAMPLE_24BIT_SCALING
;
724 #if defined (__SSE2__) && !defined (__sun__)
725 const __m128 scaling_block
= _mm_set_ps1(scaling
);
726 while (nsamples
>= 4) {
729 memcpy((char*)&x0
+ 1, src
, 3);
730 memcpy((char*)&x1
+ 1, src
+src_skip
, 3);
731 memcpy((char*)&x2
+ 1, src
+2*src_skip
, 3);
732 memcpy((char*)&x3
+ 1, src
+3*src_skip
, 3);
735 const __m128i block_i
= _mm_set_epi32(x3
, x2
, x1
, x0
);
736 const __m128i shifted
= _mm_srai_epi32(block_i
, 8);
737 const __m128 converted
= _mm_cvtepi32_ps (shifted
);
738 const __m128 scaled
= _mm_mul_ps(converted
, scaling_block
);
739 _mm_storeu_ps(dst
, scaled
);
743 #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
744 // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
745 const float32x4_t vscaling
= vdupq_n_f32(scaling
/256.0);
747 memset(x
, 0, sizeof(x
));
748 unsigned long unrolled
= nsamples
/ 4;
750 #if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
751 // left aligned -> *256
752 memcpy(&x
[0], src
, 3);
753 memcpy(&x
[1], src
+src_skip
, 3);
754 memcpy(&x
[2], src
+2*src_skip
, 3);
755 memcpy(&x
[3], src
+3*src_skip
, 3);
757 memcpy(((char*)&x
[0])+1, src
, 3);
758 memcpy(((char*)&x
[1])+1, src
+src_skip
, 3);
759 memcpy(((char*)&x
[2])+1, src
+2*src_skip
, 3);
760 memcpy(((char*)&x
[3])+1, src
+3*src_skip
, 3);
764 int32x4_t source
= vld1q_s32(x
);
765 float32x4_t converted
= vcvtq_f32_s32(source
);
766 float32x4_t scaled
= vmulq_f32(converted
, vscaling
);
767 vst1q_f32(dst
, scaled
);
770 nsamples
= nsamples
& 3;
775 #if __BYTE_ORDER == __LITTLE_ENDIAN
776 memcpy((char*)&x
+ 1, src
, 3);
777 #elif __BYTE_ORDER == __BIG_ENDIAN
788 void sample_move_d16_sSs (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
790 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
791 unsigned long unrolled
= nsamples
/ 4;
792 nsamples
= nsamples
& 3;
795 float32x4_t samples
= vld1q_f32(src
);
796 int16x4_t converted
= float_16_neon(samples
);
797 converted
= vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(converted
)));
801 vst1_s16((int16_t*)dst
, converted
);
804 vst1_lane_s16((int16_t*)(dst
), converted
, 0);
805 vst1_lane_s16((int16_t*)(dst
+dst_skip
), converted
, 1);
806 vst1_lane_s16((int16_t*)(dst
+2*dst_skip
), converted
, 2);
807 vst1_lane_s16((int16_t*)(dst
+3*dst_skip
), converted
, 3);
817 // float_16 (*src, tmp);
819 if (*src
<= NORMALIZED_FLOAT_MIN
) {
820 tmp
= SAMPLE_16BIT_MIN
;
821 } else if (*src
>= NORMALIZED_FLOAT_MAX
) {
822 tmp
= SAMPLE_16BIT_MAX
;
824 tmp
= (int16_t) f_round (*src
* SAMPLE_16BIT_SCALING
);
827 #if __BYTE_ORDER == __LITTLE_ENDIAN
828 dst
[0]=(char)(tmp
>>8);
830 #elif __BYTE_ORDER == __BIG_ENDIAN
832 dst
[1]=(char)(tmp
>>8);
839 void sample_move_d16_sS (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
841 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
842 unsigned long unrolled
= nsamples
/ 4;
843 nsamples
= nsamples
& 3;
846 float32x4_t samples
= vld1q_f32(src
);
847 int16x4_t converted
= float_16_neon(samples
);
851 vst1_s16((int16_t*)dst
, converted
);
854 vst1_lane_s16((int16_t*)(dst
), converted
, 0);
855 vst1_lane_s16((int16_t*)(dst
+dst_skip
), converted
, 1);
856 vst1_lane_s16((int16_t*)(dst
+2*dst_skip
), converted
, 2);
857 vst1_lane_s16((int16_t*)(dst
+3*dst_skip
), converted
, 3);
865 float_16 (*src
, *((int16_t*) dst
));
871 void sample_move_dither_rect_d16_sSs (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
873 jack_default_audio_sample_t val
;
877 val
= (*src
* SAMPLE_16BIT_SCALING
) + fast_rand() / (float) UINT_MAX
- 0.5f
;
878 float_16_scaled (val
, tmp
);
879 #if __BYTE_ORDER == __LITTLE_ENDIAN
880 dst
[0]=(char)(tmp
>>8);
882 #elif __BYTE_ORDER == __BIG_ENDIAN
884 dst
[1]=(char)(tmp
>>8);
891 void sample_move_dither_rect_d16_sS (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
893 jack_default_audio_sample_t val
;
896 val
= (*src
* SAMPLE_16BIT_SCALING
) + fast_rand() / (float)UINT_MAX
- 0.5f
;
897 float_16_scaled (val
, *((int16_t*) dst
));
903 void sample_move_dither_tri_d16_sSs (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
905 jack_default_audio_sample_t val
;
909 val
= (*src
* SAMPLE_16BIT_SCALING
) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX
- 1.0f
;
910 float_16_scaled (val
, tmp
);
912 #if __BYTE_ORDER == __LITTLE_ENDIAN
913 dst
[0]=(char)(tmp
>>8);
915 #elif __BYTE_ORDER == __BIG_ENDIAN
917 dst
[1]=(char)(tmp
>>8);
924 void sample_move_dither_tri_d16_sS (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
926 jack_default_audio_sample_t val
;
929 val
= (*src
* SAMPLE_16BIT_SCALING
) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX
- 1.0f
;
930 float_16_scaled (val
, *((int16_t*) dst
));
936 void sample_move_dither_shaped_d16_sSs (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
938 jack_default_audio_sample_t x
;
939 jack_default_audio_sample_t xe
; /* the innput sample - filtered error */
940 jack_default_audio_sample_t xp
; /* x' */
942 float rm1
= state
->rm1
;
943 unsigned int idx
= state
->idx
;
947 x
= *src
* SAMPLE_16BIT_SCALING
;
948 r
= ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX
- 1.0f
;
949 /* Filter the error with Lipshitz's minimally audible FIR:
950 [2.033 -2.165 1.959 -1.590 0.6149] */
952 - state
->e
[idx
] * 2.033f
953 + state
->e
[(idx
- 1) & DITHER_BUF_MASK
] * 2.165f
954 - state
->e
[(idx
- 2) & DITHER_BUF_MASK
] * 1.959f
955 + state
->e
[(idx
- 3) & DITHER_BUF_MASK
] * 1.590f
956 - state
->e
[(idx
- 4) & DITHER_BUF_MASK
] * 0.6149f
;
960 float_16_scaled (xp
, tmp
);
962 /* Intrinsic z^-1 delay */
963 idx
= (idx
+ 1) & DITHER_BUF_MASK
;
964 state
->e
[idx
] = xp
- xe
;
966 #if __BYTE_ORDER == __LITTLE_ENDIAN
967 dst
[0]=(char)(tmp
>>8);
969 #elif __BYTE_ORDER == __BIG_ENDIAN
971 dst
[1]=(char)(tmp
>>8);
980 void sample_move_dither_shaped_d16_sS (char *dst
, jack_default_audio_sample_t
*src
, unsigned long nsamples
, unsigned long dst_skip
, dither_state_t
*state
)
982 jack_default_audio_sample_t x
;
983 jack_default_audio_sample_t xe
; /* the innput sample - filtered error */
984 jack_default_audio_sample_t xp
; /* x' */
986 float rm1
= state
->rm1
;
987 unsigned int idx
= state
->idx
;
990 x
= *src
* SAMPLE_16BIT_SCALING
;
991 r
= ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX
- 1.0f
;
992 /* Filter the error with Lipshitz's minimally audible FIR:
993 [2.033 -2.165 1.959 -1.590 0.6149] */
995 - state
->e
[idx
] * 2.033f
996 + state
->e
[(idx
- 1) & DITHER_BUF_MASK
] * 2.165f
997 - state
->e
[(idx
- 2) & DITHER_BUF_MASK
] * 1.959f
998 + state
->e
[(idx
- 3) & DITHER_BUF_MASK
] * 1.590f
999 - state
->e
[(idx
- 4) & DITHER_BUF_MASK
] * 0.6149f
;
1003 float_16_scaled (xp
, *((int16_t*) dst
));
1005 /* Intrinsic z^-1 delay */
1006 idx
= (idx
+ 1) & DITHER_BUF_MASK
;
1007 state
->e
[idx
] = *((int16_t*) dst
) - xe
;
1016 void sample_move_dS_s16s (jack_default_audio_sample_t
*dst
, char *src
, unsigned long nsamples
, unsigned long src_skip
)
1019 const jack_default_audio_sample_t scaling
= 1.0/SAMPLE_16BIT_SCALING
;
1020 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
1021 const float32x4_t vscaling
= vdupq_n_f32(scaling
);
1022 unsigned long unrolled
= nsamples
/ 4;
1023 while (unrolled
--) {
1024 int16x4_t source16x4
;
1027 source16x4
= vld1_s16((int16_t*)src
);
1030 source16x4
= vld2_s16((int16_t*)src
).val
[0];
1033 source16x4
= vld1_lane_s16((int16_t*)src
, source16x4
, 0);
1034 source16x4
= vld1_lane_s16((int16_t*)(src
+src_skip
), source16x4
, 1);
1035 source16x4
= vld1_lane_s16((int16_t*)(src
+2*src_skip
), source16x4
, 2);
1036 source16x4
= vld1_lane_s16((int16_t*)(src
+3*src_skip
), source16x4
, 3);
1039 source16x4
= vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(source16x4
)));
1040 int32x4_t source32x4
= vmovl_s16(source16x4
);
1041 src
+= 4 * src_skip
;
1043 float32x4_t converted
= vcvtq_f32_s32(source32x4
);
1044 float32x4_t scaled
= vmulq_f32(converted
, vscaling
);
1045 vst1q_f32(dst
, scaled
);
1048 nsamples
= nsamples
& 3;
1051 /* ALERT: signed sign-extension portability !!! */
1052 while (nsamples
--) {
1053 #if __BYTE_ORDER == __LITTLE_ENDIAN
1054 z
= (unsigned char)(src
[0]);
1056 z
|= (unsigned char)(src
[1]);
1057 #elif __BYTE_ORDER == __BIG_ENDIAN
1058 z
= (unsigned char)(src
[1]);
1060 z
|= (unsigned char)(src
[0]);
1068 void sample_move_dS_s16 (jack_default_audio_sample_t
*dst
, char *src
, unsigned long nsamples
, unsigned long src_skip
)
1070 /* ALERT: signed sign-extension portability !!! */
1071 const jack_default_audio_sample_t scaling
= 1.0/SAMPLE_16BIT_SCALING
;
1072 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
1073 const float32x4_t vscaling
= vdupq_n_f32(scaling
);
1074 unsigned long unrolled
= nsamples
/ 4;
1075 while (unrolled
--) {
1076 int16x4_t source16x4
;
1079 source16x4
= vld1_s16((int16_t*)src
);
1082 source16x4
= vld2_s16((int16_t*)src
).val
[0];
1085 source16x4
= vld1_lane_s16((int16_t*)src
, source16x4
, 0);
1086 source16x4
= vld1_lane_s16((int16_t*)(src
+src_skip
), source16x4
, 1);
1087 source16x4
= vld1_lane_s16((int16_t*)(src
+2*src_skip
), source16x4
, 2);
1088 source16x4
= vld1_lane_s16((int16_t*)(src
+3*src_skip
), source16x4
, 3);
1091 int32x4_t source32x4
= vmovl_s16(source16x4
);
1092 src
+= 4 * src_skip
;
1094 float32x4_t converted
= vcvtq_f32_s32(source32x4
);
1095 float32x4_t scaled
= vmulq_f32(converted
, vscaling
);
1096 vst1q_f32(dst
, scaled
);
1099 nsamples
= nsamples
& 3;
1102 while (nsamples
--) {
1103 *dst
= (*((short *) src
)) * scaling
;
/* Fill an interleaved channel with "val": write "unit_bytes" at a time,
 * advancing by "skip_bytes" between units, for "bytes" total bytes.
 * NOTE: for the 2- and 4-byte cases only the low byte of each unit is
 * val after the integer cast — fine for the common val == 0 case. */
void memset_interleave (char *dst, char val, unsigned long bytes,
			unsigned long unit_bytes,
			unsigned long skip_bytes)
{
	switch (unit_bytes) {
	case 1:
		while (bytes--) {
			*dst = val;
			dst += skip_bytes;
		}
		break;
	case 2:
		while (bytes) {
			*((short *) dst) = (short) val;
			dst += skip_bytes;
			bytes -= 2;
		}
		break;
	case 4:
		while (bytes) {
			*((int *) dst) = (int) val;
			dst += skip_bytes;
			bytes -= 4;
		}
		break;
	default:
		while (bytes) {
			memset(dst, val, unit_bytes);
			dst += skip_bytes;
			bytes -= unit_bytes;
		}
		break;
	}
}
/* COPY FUNCTIONS: used to move data from an input channel to an
   output channel. Note that we assume that the skip distance
   is the same for both channels. This is completely fine
   unless the input and output were on different audio interfaces that
   were interleaved differently. We don't try to handle that.
*/

/* Plain copy; the trailing two arguments exist only to match the
   interleaved-copy signature and are ignored. */
void
memcpy_fake (char *dst, char *src, unsigned long src_bytes, unsigned long foo, unsigned long bar)
{
	memcpy (dst, src, src_bytes);
}
/* Copy 16-bit samples between interleaved buffers: one short per step,
   advancing each pointer by its own skip distance, for src_bytes total
   source bytes. */
void
memcpy_interleave_d16_s16 (char *dst, char *src, unsigned long src_bytes,
			   unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
	while (src_bytes) {
		*((short *) dst) = *((short *) src);
		dst += dst_skip_bytes;
		src += src_skip_bytes;
		src_bytes -= 2;
	}
}
/* Copy packed 24-bit samples between interleaved buffers: 3 bytes per
   step, advancing each pointer by its own skip distance, for src_bytes
   total source bytes. */
void
memcpy_interleave_d24_s24 (char *dst, char *src, unsigned long src_bytes,
			   unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
	while (src_bytes) {
		memcpy(dst, src, 3);
		dst += dst_skip_bytes;
		src += src_skip_bytes;
		src_bytes -= 3;
	}
}
/* Copy 32-bit samples between interleaved buffers: one int per step,
   advancing each pointer by its own skip distance, for src_bytes total
   source bytes. */
void
memcpy_interleave_d32_s32 (char *dst, char *src, unsigned long src_bytes,
			   unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
	while (src_bytes) {
		*((int *) dst) = *((int *) src);
		dst += dst_skip_bytes;
		src += src_skip_bytes;
		src_bytes -= 4;
	}
}