Merge branch 'master' into develop
[jack2.git] / common / memops.c
blob6c5ad2f9b76baff76033a896e7fba1f8a5c293c4
1 /*
2 Copyright (C) 2000 Paul Davis
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 #define _ISOC9X_SOURCE 1
21 #define _ISOC99_SOURCE 1
23 #define __USE_ISOC9X 1
24 #define __USE_ISOC99 1
26 #include <stdio.h>
27 #include <string.h>
28 #include <math.h>
29 #include <memory.h>
30 #include <stdlib.h>
31 #include <stdint.h>
32 #include <limits.h>
33 #ifdef __linux__
34 #include <endian.h>
35 #endif
36 #include "memops.h"
38 #if defined (__SSE2__) && !defined (__sun__)
39 #include <emmintrin.h>
40 #ifdef __SSE4_1__
41 #include <smmintrin.h>
42 #endif
43 #endif
45 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
46 #include <arm_neon.h>
47 #endif
49 /* Notes about these *_SCALING values.
51 the MAX_<N>BIT values are floating point. when multiplied by
52 a full-scale normalized floating point sample value (-1.0..+1.0)
53 they should give the maximum value representable with an integer
54 sample type of N bits. Note that this is asymmetric. Sample ranges
55 for signed integer, 2's complement values are -(2^(N-1) to +(2^(N-1)-1)
57 Complications
58 -------------
59 If we use +2^(N-1) for the scaling factors, we run into a problem:
61 if we start with a normalized float value of -1.0, scaling
62 to 24 bits would give -8388608 (-2^23), which is ideal.
63 But with +1.0, we get +8388608, which is technically out of range.
65 We never multiply a full range normalized value by this constant,
66 but we could multiply it by a positive value that is close enough to +1.0
67 to produce a value > +(2^(N-1)-1.
69 There is no way around this paradox without wasting CPU cycles to determine
70 which scaling factor to use (i.e. determine if its negative or not,
71 use the right factor).
73 So, for now (October 2008) we use 2^(N-1)-1 as the scaling factor.
76 #define SAMPLE_24BIT_SCALING 8388607.0f
77 #define SAMPLE_16BIT_SCALING 32767.0f
79 /* these are just values to use if the floating point value was out of range
81 advice from Fons Adriaensen: make the limits symmetrical
84 #define SAMPLE_24BIT_MAX 8388607
85 #define SAMPLE_24BIT_MIN -8388607
86 #define SAMPLE_24BIT_MAX_F 8388607.0f
87 #define SAMPLE_24BIT_MIN_F -8388607.0f
89 #define SAMPLE_16BIT_MAX 32767
90 #define SAMPLE_16BIT_MIN -32767
91 #define SAMPLE_16BIT_MAX_F 32767.0f
92 #define SAMPLE_16BIT_MIN_F -32767.0f
94 /* these mark the outer edges of the range considered "within" range
95 for a floating point sample value. values outside (and on the boundaries)
96 of this range will be clipped before conversion; values within this
97 range will be scaled to appropriate values for the target sample
98 type.
101 #define NORMALIZED_FLOAT_MIN -1.0f
102 #define NORMALIZED_FLOAT_MAX 1.0f
104 /* define this in case we end up on a platform that is missing
105 the real lrintf functions
108 #define f_round(f) lrintf(f)
110 #define float_16(s, d)\
111 if ((s) <= NORMALIZED_FLOAT_MIN) {\
112 (d) = SAMPLE_16BIT_MIN;\
113 } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
114 (d) = SAMPLE_16BIT_MAX;\
115 } else {\
116 (d) = f_round ((s) * SAMPLE_16BIT_SCALING);\
/* call this when "s" has already been scaled (e.g. when dithering) */

#define float_16_scaled(s, d)\
	if ((s) <= SAMPLE_16BIT_MIN_F) {\
		(d) = SAMPLE_16BIT_MIN;\
	} else if ((s) >= SAMPLE_16BIT_MAX_F) { \
		(d) = SAMPLE_16BIT_MAX;\
	} else {\
		(d) = f_round ((s));\
	}
131 #define float_24u32(s, d) \
132 if ((s) <= NORMALIZED_FLOAT_MIN) {\
133 (d) = SAMPLE_24BIT_MIN << 8;\
134 } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
135 (d) = SAMPLE_24BIT_MAX << 8;\
136 } else {\
137 (d) = f_round ((s) * SAMPLE_24BIT_SCALING) << 8;\
140 /* call this when "s" has already been scaled (e.g. when dithering)
143 #define float_24u32_scaled(s, d)\
144 if ((s) <= SAMPLE_24BIT_MIN_F) {\
145 (d) = SAMPLE_24BIT_MIN << 8;\
146 } else if ((s) >= SAMPLE_24BIT_MAX_F) { \
147 (d) = SAMPLE_24BIT_MAX << 8; \
148 } else {\
149 (d) = f_round ((s)) << 8; \
152 #define float_24(s, d) \
153 if ((s) <= NORMALIZED_FLOAT_MIN) {\
154 (d) = SAMPLE_24BIT_MIN;\
155 } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
156 (d) = SAMPLE_24BIT_MAX;\
157 } else {\
158 (d) = f_round ((s) * SAMPLE_24BIT_SCALING);\
161 /* call this when "s" has already been scaled (e.g. when dithering)
164 #define float_24_scaled(s, d)\
165 if ((s) <= SAMPLE_24BIT_MIN_F) {\
166 (d) = SAMPLE_24BIT_MIN;\
167 } else if ((s) >= SAMPLE_24BIT_MAX_F) { \
168 (d) = SAMPLE_24BIT_MAX; \
169 } else {\
170 (d) = f_round ((s)); \
174 #if defined (__SSE2__) && !defined (__sun__)
/* generates same as _mm_set_ps(1.f, 1.f, 1f., 1f) but faster */
static inline __m128 gen_one(void)
{
	volatile __m128i x = { 0 }; /* shut up, GCC */
	/* x == x in every lane -> all bits set (0xFFFFFFFF per lane) */
	__m128i ones = _mm_cmpeq_epi32(x, x);
	/* >>25 leaves 0x7F, <<23 moves it into the float exponent field,
	   giving the bit pattern of 1.0f in each lane */
	return (__m128)_mm_slli_epi32 (_mm_srli_epi32(ones, 25), 23);
}
/* clamp every lane of s into [min, max] (ordering matters for NaN propagation) */
static inline __m128 clip(__m128 s, __m128 min, __m128 max)
{
	return _mm_min_ps(max, _mm_max_ps(s, min));
}
/* convert 4 normalized floats to 24-bit signed ints (one per 32-bit lane),
   clipping input to [-1.0, +1.0] first */
static inline __m128i float_24_sse(__m128 s)
{
	const __m128 upper_bound = gen_one(); /* NORMALIZED_FLOAT_MAX */
	const __m128 lower_bound = _mm_sub_ps(_mm_setzero_ps(), upper_bound);

	__m128 clipped = clip(s, lower_bound, upper_bound);
	__m128 scaled = _mm_mul_ps(clipped, _mm_set1_ps(SAMPLE_24BIT_SCALING));
	/* cvtps (not cvttps): rounds per the current MXCSR rounding mode */
	return _mm_cvtps_epi32(scaled);
}
198 #endif
201 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
/* clamp every lane of s into [min, max] (NEON version) */
static inline float32x4_t clip(float32x4_t s, float32x4_t min, float32x4_t max)
{
	return vminq_f32(max, vmaxq_f32(s, min));
}
/* convert 4 normalized floats to 24-bit signed ints (one per 32-bit lane),
   clipping input to [-1.0, +1.0] first */
static inline int32x4_t float_24_neon(float32x4_t s)
{
	const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
	const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);

	float32x4_t clipped = clip(s, lower_bound, upper_bound);
	float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_24BIT_SCALING));
	/* vcvtq truncates toward zero */
	return vcvtq_s32_f32(scaled);
}
/* convert 4 normalized floats to 16-bit signed ints, clipping to [-1.0, +1.0];
   vmovn narrows the 32-bit conversion result to 16 bits */
static inline int16x4_t float_16_neon(float32x4_t s)
{
	const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
	const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);

	float32x4_t clipped = clip(s, lower_bound, upper_bound);
	float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_16BIT_SCALING));
	return vmovn_s32(vcvtq_s32_f32(scaled));
}
227 #endif
/* Linear Congruential noise generator. From the music-dsp list
 * less random than rand(), but good enough and 10x faster */
static unsigned int seed = 22222;

/* Returns the next 32-bit LCG value. NOT thread-safe: mutates the
 * file-scope seed. "(void)" gives a proper C prototype; the old "()"
 * form is an obsolescent non-prototype declaration. */
static inline unsigned int fast_rand (void) {
	seed = (seed * 196314165) + 907633515;
	return seed;
}
239 /* functions for native float sample data */
241 void sample_move_floatLE_sSs (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) {
242 while (nsamples--) {
243 *dst = *((float *) src);
244 dst++;
245 src += src_skip;
249 void sample_move_dS_floatLE (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) {
250 while (nsamples--) {
251 *((float *) dst) = *src;
252 dst += dst_skip;
253 src++;
257 /* NOTES on function naming:
259 foo_bar_d<TYPE>_s<TYPE>
261 the "d<TYPE>" component defines the destination type for the operation
262 the "s<TYPE>" component defines the source type for the operation
264 TYPE can be one of:
266 S - sample is a jack_default_audio_sample_t, currently (October 2008) a 32 bit floating point value
267 Ss - like S but reverse endian from the host CPU
268 32u24 - sample is an signed 32 bit integer value, but data is in upper 24 bits only
269 32u24s - like 32u24 but reverse endian from the host CPU
270 24 - sample is an signed 24 bit integer value
271 24s - like 24 but reverse endian from the host CPU
272 16 - sample is an signed 16 bit integer value
273 16s - like 16 but reverse endian from the host CPU
275 For obvious reasons, the reverse endian versions only show as source types.
277 This covers all known sample formats at 16 bits or larger.
280 /* functions for native integer sample data */
/* Convert native floats to signed 32-bit ints (sample in the upper 24 bits),
 * written with the OPPOSITE byte order from the host CPU.
 * dst_skip is in bytes; the dither state is unused.
 */
void sample_move_d32u24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	/* vector path: 4 samples per iteration, remainder handled by the
	   scalar loop below */
	unsigned long unrolled = nsamples / 4;
	nsamples = nsamples & 3;

	while (unrolled--) {
		float32x4_t samples = vld1q_f32(src);
		int32x4_t converted = float_24_neon(samples);
		int32x4_t shifted = vshlq_n_s32(converted, 8);
		/* byte-swap each 32-bit lane to produce reverse-endian output */
		shifted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(shifted)));

		switch(dst_skip) {
			case 4:
				/* packed destination: single full-vector store */
				vst1q_s32((int32_t*)dst, shifted);
				break;
			default:
				/* interleaved destination: store lane by lane */
				vst1q_lane_s32((int32_t*)(dst), shifted, 0);
				vst1q_lane_s32((int32_t*)(dst+dst_skip), shifted, 1);
				vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2);
				vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3);
				break;
		}
		dst += 4*dst_skip;
		src+= 4;
	}
#endif

	int32_t z;

	while (nsamples--) {
		float_24u32 (*src, z);
		/* emit the 4 bytes in reverse of host order */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(z>>24);
		dst[1]=(char)(z>>16);
		dst[2]=(char)(z>>8);
		dst[3]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(z);
		dst[1]=(char)(z>>8);
		dst[2]=(char)(z>>16);
		dst[3]=(char)(z>>24);
#endif
		dst += dst_skip;
		src++;
	}
}
332 void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
334 #if defined (__SSE2__) && !defined (__sun__)
335 __m128 int_max = _mm_set1_ps(SAMPLE_24BIT_MAX_F);
336 __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
337 __m128 factor = int_max;
339 unsigned long unrolled = nsamples / 4;
340 nsamples = nsamples & 3;
342 while (unrolled--) {
343 __m128 in = _mm_load_ps(src);
344 __m128 scaled = _mm_mul_ps(in, factor);
345 __m128 clipped = clip(scaled, int_min, int_max);
347 __m128i y = _mm_cvttps_epi32(clipped);
348 __m128i shifted = _mm_slli_epi32(y, 8);
350 #ifdef __SSE4_1__
351 *(int32_t*)dst = _mm_extract_epi32(shifted, 0);
352 *(int32_t*)(dst+dst_skip) = _mm_extract_epi32(shifted, 1);
353 *(int32_t*)(dst+2*dst_skip) = _mm_extract_epi32(shifted, 2);
354 *(int32_t*)(dst+3*dst_skip) = _mm_extract_epi32(shifted, 3);
355 #else
356 __m128i shuffled1 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(0, 3, 2, 1));
357 __m128i shuffled2 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(1, 0, 3, 2));
358 __m128i shuffled3 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(2, 1, 0, 3));
360 _mm_store_ss((float*)dst, (__m128)shifted);
362 _mm_store_ss((float*)(dst+dst_skip), (__m128)shuffled1);
363 _mm_store_ss((float*)(dst+2*dst_skip), (__m128)shuffled2);
364 _mm_store_ss((float*)(dst+3*dst_skip), (__m128)shuffled3);
365 #endif
366 dst += 4*dst_skip;
368 src+= 4;
371 while (nsamples--) {
372 __m128 in = _mm_load_ss(src);
373 __m128 scaled = _mm_mul_ss(in, factor);
374 __m128 clipped = _mm_min_ss(int_max, _mm_max_ss(scaled, int_min));
376 int y = _mm_cvttss_si32(clipped);
377 *((int *) dst) = y<<8;
379 dst += dst_skip;
380 src++;
383 #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
384 unsigned long unrolled = nsamples / 4;
385 nsamples = nsamples & 3;
387 while (unrolled--) {
388 float32x4_t samples = vld1q_f32(src);
389 int32x4_t converted = float_24_neon(samples);
390 int32x4_t shifted = vshlq_n_s32(converted, 8);
392 switch(dst_skip) {
393 case 4:
394 vst1q_s32((int32_t*)dst, shifted);
395 break;
396 default:
397 vst1q_lane_s32((int32_t*)(dst), shifted, 0);
398 vst1q_lane_s32((int32_t*)(dst+dst_skip), shifted, 1);
399 vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2);
400 vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3);
401 break;
403 dst += 4*dst_skip;
405 src+= 4;
407 #endif
409 #if !defined (__SSE2__)
410 while (nsamples--) {
411 float_24u32 (*src, *((int32_t*) dst));
412 dst += dst_skip;
413 src++;
415 #endif
418 void sample_move_dS_s32u24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
420 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
421 float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
422 unsigned long unrolled = nsamples / 4;
423 while (unrolled--) {
424 int32x4_t src128;
425 switch(src_skip)
427 case 4:
428 src128 = vld1q_s32((int32_t*)src);
429 break;
430 case 8:
431 src128 = vld2q_s32((int32_t*)src).val[0];
432 break;
433 default:
434 src128 = vld1q_lane_s32((int32_t*)src, src128, 0);
435 src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1);
436 src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
437 src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
438 break;
440 src128 = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(src128)));
441 int32x4_t shifted = vshrq_n_s32(src128, 8);
442 float32x4_t as_float = vcvtq_f32_s32(shifted);
443 float32x4_t divided = vmulq_f32(as_float, factor);
444 vst1q_f32(dst, divided);
446 src += 4*src_skip;
447 dst += 4;
449 nsamples = nsamples & 3;
450 #endif
452 /* ALERT: signed sign-extension portability !!! */
454 const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
456 while (nsamples--) {
457 int x;
458 #if __BYTE_ORDER == __LITTLE_ENDIAN
459 x = (unsigned char)(src[0]);
460 x <<= 8;
461 x |= (unsigned char)(src[1]);
462 x <<= 8;
463 x |= (unsigned char)(src[2]);
464 x <<= 8;
465 x |= (unsigned char)(src[3]);
466 #elif __BYTE_ORDER == __BIG_ENDIAN
467 x = (unsigned char)(src[3]);
468 x <<= 8;
469 x |= (unsigned char)(src[2]);
470 x <<= 8;
471 x |= (unsigned char)(src[1]);
472 x <<= 8;
473 x |= (unsigned char)(src[0]);
474 #endif
475 *dst = (x >> 8) * scaling;
476 dst++;
477 src += src_skip;
/* Convert host-endian 32-bit ints (sample in the upper 24 bits) to native
 * floats. src_skip is in bytes.
 */
void sample_move_dS_s32u24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
#if defined (__SSE2__) && !defined (__sun__)
	unsigned long unrolled = nsamples / 4;
	static float inv_sample_max_24bit = 1.0 / SAMPLE_24BIT_SCALING;
	__m128 factor = _mm_set1_ps(inv_sample_max_24bit);
	while (unrolled--)
	{
		int i1 = *((int *) src);
		src+= src_skip;
		int i2 = *((int *) src);
		src+= src_skip;
		int i3 = *((int *) src);
		src+= src_skip;
		int i4 = *((int *) src);
		src+= src_skip;

		/* NOTE: this __m128i deliberately shadows the char* parameter
		   "src" for the remainder of the iteration */
		__m128i src = _mm_set_epi32(i4, i3, i2, i1);
		/* arithmetic shift: drop the padding byte, keep the sign */
		__m128i shifted = _mm_srai_epi32(src, 8);

		__m128 as_float = _mm_cvtepi32_ps(shifted);
		__m128 divided = _mm_mul_ps(as_float, factor);

		_mm_storeu_ps(dst, divided);

		dst += 4;
	}
	nsamples = nsamples & 3;
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
	while (unrolled--) {
		int32x4_t src128;	/* NOTE(review): lane loads below read this
					   before full initialization — confirm */
		switch(src_skip) {
			case 4:
				src128 = vld1q_s32((int32_t*)src);
				break;
			case 8:
				src128 = vld2q_s32((int32_t*)src).val[0];
				break;
			default:
				src128 = vld1q_lane_s32((int32_t*)src, src128, 0);
				src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1);
				src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
				src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
				break;
		}
		int32x4_t shifted = vshrq_n_s32(src128, 8);
		float32x4_t as_float = vcvtq_f32_s32(shifted);
		float32x4_t divided = vmulq_f32(as_float, factor);
		vst1q_f32(dst, divided);

		src += 4*src_skip;
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	/* ALERT: signed sign-extension portability !!! */

	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
	while (nsamples--) {
		*dst = (*((int *) src) >> 8) * scaling;
		dst++;
		src += src_skip;
	}
}
/* Convert native floats to packed 3-byte (24-bit) samples, written with the
 * OPPOSITE byte order from the host CPU. dst_skip is in bytes; the dither
 * state is unused.
 */
void sample_move_d24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
		int i;
		int32_t z[4];
		float32x4_t samples = vld1q_f32(src);
		int32x4_t converted = float_24_neon(samples);
		/* byte-swap each 32-bit lane, then copy 3 of its 4 bytes */
		converted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted)));
		vst1q_s32(z, converted);

		for (i = 0; i != 4; ++i) {
			/* skip the (now leading) padding byte */
			memcpy (dst, ((char*)(z+i))+1, 3);
			dst += dst_skip;
		}
		src += 4;
	}
	nsamples = nsamples & 3;
#endif

	int32_t z;

	while (nsamples--) {
		float_24 (*src, z);
		/* emit 3 bytes in reverse of host order */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(z>>16);
		dst[1]=(char)(z>>8);
		dst[2]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(z);
		dst[1]=(char)(z>>8);
		dst[2]=(char)(z>>16);
#endif
		dst += dst_skip;
		src++;
	}
}
/* Convert native floats to packed 3-byte (24-bit) samples, host byte order.
 * dst_skip is in bytes; the dither state is unused.
 */
void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__SSE2__) && !defined (__sun__)
	/* float_24_sse uses cvtps, which honors the MXCSR rounding mode */
	_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
	while (nsamples >= 4) {
		int i;
		int32_t z[4];
		__m128 samples = _mm_loadu_ps(src);
		__m128i converted = float_24_sse(samples);

#ifdef __SSE4_1__
		z[0] = _mm_extract_epi32(converted, 0);
		z[1] = _mm_extract_epi32(converted, 1);
		z[2] = _mm_extract_epi32(converted, 2);
		z[3] = _mm_extract_epi32(converted, 3);
#else
		/* no lane extract before SSE4.1: rotate lanes into slot 0
		   and store that slot each time */
		__m128i shuffled1 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(0, 3, 2, 1));
		__m128i shuffled2 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(1, 0, 3, 2));
		__m128i shuffled3 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(2, 1, 0, 3));

		_mm_store_ss((float*)z, (__m128)converted);
		_mm_store_ss((float*)z+1, (__m128)shuffled1);
		_mm_store_ss((float*)z+2, (__m128)shuffled2);
		_mm_store_ss((float*)z+3, (__m128)shuffled3);
#endif
		for (i = 0; i != 4; ++i) {
			/* low 3 bytes of each int carry the 24-bit sample
			   (SSE implies a little-endian host) */
			memcpy (dst, z+i, 3);
			dst += dst_skip;
		}
		nsamples -= 4;
		src += 4;
	}
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
		int i;
		int32_t z[4];
		float32x4_t samples = vld1q_f32(src);
		int32x4_t converted = float_24_neon(samples);
		vst1q_s32(z, converted);

		for (i = 0; i != 4; ++i) {
			memcpy (dst, z+i, 3);
			dst += dst_skip;
		}
		src += 4;
	}
	nsamples = nsamples & 3;
#endif

	int32_t z;

	while (nsamples--) {
		float_24 (*src, z);
		/* copy the 3 significant bytes for the host's endianness */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		memcpy (dst, &z, 3);
#elif __BYTE_ORDER == __BIG_ENDIAN
		memcpy (dst, (char *)&z + 1, 3);
#endif
		dst += dst_skip;
		src++;
	}
}
654 void sample_move_dS_s24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
656 const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
658 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
659 // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
660 const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
661 int32_t x[4];
662 memset(x, 0, sizeof(x));
663 unsigned long unrolled = nsamples / 4;
664 while (unrolled--) {
665 #if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
666 // right aligned / inverse sequence below -> *256
667 memcpy(((char*)&x[0])+1, src, 3);
668 memcpy(((char*)&x[1])+1, src+src_skip, 3);
669 memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
670 memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
671 #else
672 memcpy(&x[0], src, 3);
673 memcpy(&x[1], src+src_skip, 3);
674 memcpy(&x[2], src+2*src_skip, 3);
675 memcpy(&x[3], src+3*src_skip, 3);
676 #endif
677 src += 4 * src_skip;
679 int32x4_t source = vld1q_s32(x);
680 source = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(source)));
681 float32x4_t converted = vcvtq_f32_s32(source);
682 float32x4_t scaled = vmulq_f32(converted, vscaling);
683 vst1q_f32(dst, scaled);
684 dst += 4;
686 nsamples = nsamples & 3;
687 #endif
689 /* ALERT: signed sign-extension portability !!! */
691 while (nsamples--) {
692 int x;
693 #if __BYTE_ORDER == __LITTLE_ENDIAN
694 x = (unsigned char)(src[0]);
695 x <<= 8;
696 x |= (unsigned char)(src[1]);
697 x <<= 8;
698 x |= (unsigned char)(src[2]);
699 /* correct sign bit and the rest of the top byte */
700 if (src[0] & 0x80) {
701 x |= 0xff << 24;
703 #elif __BYTE_ORDER == __BIG_ENDIAN
704 x = (unsigned char)(src[2]);
705 x <<= 8;
706 x |= (unsigned char)(src[1]);
707 x <<= 8;
708 x |= (unsigned char)(src[0]);
709 /* correct sign bit and the rest of the top byte */
710 if (src[2] & 0x80) {
711 x |= 0xff << 24;
713 #endif
714 *dst = x * scaling;
715 dst++;
716 src += src_skip;
720 void sample_move_dS_s24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
722 const jack_default_audio_sample_t scaling = 1.f/SAMPLE_24BIT_SCALING;
724 #if defined (__SSE2__) && !defined (__sun__)
725 const __m128 scaling_block = _mm_set_ps1(scaling);
726 while (nsamples >= 4) {
727 int x0, x1, x2, x3;
729 memcpy((char*)&x0 + 1, src, 3);
730 memcpy((char*)&x1 + 1, src+src_skip, 3);
731 memcpy((char*)&x2 + 1, src+2*src_skip, 3);
732 memcpy((char*)&x3 + 1, src+3*src_skip, 3);
733 src += 4 * src_skip;
735 const __m128i block_i = _mm_set_epi32(x3, x2, x1, x0);
736 const __m128i shifted = _mm_srai_epi32(block_i, 8);
737 const __m128 converted = _mm_cvtepi32_ps (shifted);
738 const __m128 scaled = _mm_mul_ps(converted, scaling_block);
739 _mm_storeu_ps(dst, scaled);
740 dst += 4;
741 nsamples -= 4;
743 #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
744 // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
745 const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
746 int32_t x[4];
747 memset(x, 0, sizeof(x));
748 unsigned long unrolled = nsamples / 4;
749 while (unrolled--) {
750 #if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
751 // left aligned -> *256
752 memcpy(&x[0], src, 3);
753 memcpy(&x[1], src+src_skip, 3);
754 memcpy(&x[2], src+2*src_skip, 3);
755 memcpy(&x[3], src+3*src_skip, 3);
756 #else
757 memcpy(((char*)&x[0])+1, src, 3);
758 memcpy(((char*)&x[1])+1, src+src_skip, 3);
759 memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
760 memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
761 #endif
762 src += 4 * src_skip;
764 int32x4_t source = vld1q_s32(x);
765 float32x4_t converted = vcvtq_f32_s32(source);
766 float32x4_t scaled = vmulq_f32(converted, vscaling);
767 vst1q_f32(dst, scaled);
768 dst += 4;
770 nsamples = nsamples & 3;
771 #endif
773 while (nsamples--) {
774 int x;
775 #if __BYTE_ORDER == __LITTLE_ENDIAN
776 memcpy((char*)&x + 1, src, 3);
777 #elif __BYTE_ORDER == __BIG_ENDIAN
778 memcpy(&x, src, 3);
779 #endif
780 x >>= 8;
781 *dst = x * scaling;
782 dst++;
783 src += src_skip;
/* Convert native floats to signed 16-bit samples, written with the OPPOSITE
 * byte order from the host CPU. dst_skip is in bytes; the dither state is
 * unused (no dithering in this variant).
 */
void sample_move_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	nsamples = nsamples & 3;

	while (unrolled--) {
		float32x4_t samples = vld1q_f32(src);
		int16x4_t converted = float_16_neon(samples);
		/* swap the two bytes of every 16-bit lane -> reverse endian */
		converted = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(converted)));

		switch(dst_skip) {
			case 2:
				vst1_s16((int16_t*)dst, converted);
				break;
			default:
				vst1_lane_s16((int16_t*)(dst), converted, 0);
				vst1_lane_s16((int16_t*)(dst+dst_skip), converted, 1);
				vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2);
				vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3);
				break;
		}
		dst += 4*dst_skip;
		src+= 4;
	}
#endif
	int16_t tmp;

	while (nsamples--) {
		// float_16 (*src, tmp);
		/* open-coded equivalent of the float_16() macro */
		if (*src <= NORMALIZED_FLOAT_MIN) {
			tmp = SAMPLE_16BIT_MIN;
		} else if (*src >= NORMALIZED_FLOAT_MAX) {
			tmp = SAMPLE_16BIT_MAX;
		} else {
			tmp = (int16_t) f_round (*src * SAMPLE_16BIT_SCALING);
		}
		/* store the two bytes in reverse of host order */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(tmp>>8);
		dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(tmp);
		dst[1]=(char)(tmp>>8);
#endif
		dst += dst_skip;
		src++;
	}
}
/* Convert native floats to signed 16-bit samples, host byte order.
 * dst_skip is in bytes; the dither state is unused (no dithering here).
 */
void sample_move_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	nsamples = nsamples & 3;

	while (unrolled--) {
		float32x4_t samples = vld1q_f32(src);
		int16x4_t converted = float_16_neon(samples);

		switch(dst_skip) {
			case 2:
				/* packed destination: single vector store */
				vst1_s16((int16_t*)dst, converted);
				break;
			default:
				vst1_lane_s16((int16_t*)(dst), converted, 0);
				vst1_lane_s16((int16_t*)(dst+dst_skip), converted, 1);
				vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2);
				vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3);
				break;
		}
		dst += 4*dst_skip;
		src+= 4;
	}
#endif
	/* scalar path (also handles the NEON remainder) */
	while (nsamples--) {
		float_16 (*src, *((int16_t*) dst));
		dst += dst_skip;
		src++;
	}
}
/* Convert native floats to reverse-endian 16-bit samples with rectangular
 * PDF dither. The dither state is unused (rectangular dither is stateless).
 */
void sample_move_dither_rect_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t val;
	int16_t tmp;

	while (nsamples--) {
		/* add uniform noise in [-0.5, +0.5) at 16-bit scale */
		val = (*src * SAMPLE_16BIT_SCALING) + fast_rand() / (float) UINT_MAX - 0.5f;
		float_16_scaled (val, tmp);
		/* store the two bytes in reverse of host order */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(tmp>>8);
		dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(tmp);
		dst[1]=(char)(tmp>>8);
#endif
		dst += dst_skip;
		src++;
	}
}
/* Convert native floats to host-endian 16-bit samples with rectangular
 * PDF dither. The dither state is unused (rectangular dither is stateless).
 */
void sample_move_dither_rect_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t val;

	while (nsamples--) {
		/* add uniform noise in [-0.5, +0.5) at 16-bit scale */
		val = (*src * SAMPLE_16BIT_SCALING) + fast_rand() / (float)UINT_MAX - 0.5f;
		float_16_scaled (val, *((int16_t*) dst));
		dst += dst_skip;
		src++;
	}
}
/* Convert native floats to reverse-endian 16-bit samples with triangular
 * PDF dither (sum of two uniform variables). The dither state is unused.
 */
void sample_move_dither_tri_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t val;
	int16_t tmp;

	while (nsamples--) {
		/* triangular noise in [-1, +1) at 16-bit scale */
		val = (*src * SAMPLE_16BIT_SCALING) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
		float_16_scaled (val, tmp);

		/* store the two bytes in reverse of host order */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(tmp>>8);
		dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(tmp);
		dst[1]=(char)(tmp>>8);
#endif
		dst += dst_skip;
		src++;
	}
}
/* Convert native floats to host-endian 16-bit samples with triangular
 * PDF dither (sum of two uniform variables). The dither state is unused.
 */
void sample_move_dither_tri_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t val;

	while (nsamples--) {
		/* triangular noise in [-1, +1) at 16-bit scale */
		val = (*src * SAMPLE_16BIT_SCALING) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
		float_16_scaled (val, *((int16_t*) dst));
		dst += dst_skip;
		src++;
	}
}
/* Convert native floats to reverse-endian 16-bit samples with noise-shaped
 * triangular dither. Uses and updates the per-channel error history in
 * "state" (e[], rm1, idx).
 */
void sample_move_dither_shaped_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t x;
	jack_default_audio_sample_t xe; /* the input sample - filtered error */
	jack_default_audio_sample_t xp; /* x' */
	float r;
	float rm1 = state->rm1;
	unsigned int idx = state->idx;
	int16_t tmp;

	while (nsamples--) {
		x = *src * SAMPLE_16BIT_SCALING;
		/* triangular noise in [-1, +1) */
		r = ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
		/* Filter the error with Lipshitz's minimally audible FIR:
		   [2.033 -2.165 1.959 -1.590 0.6149] */
		xe = x
		     - state->e[idx] * 2.033f
		     + state->e[(idx - 1) & DITHER_BUF_MASK] * 2.165f
		     - state->e[(idx - 2) & DITHER_BUF_MASK] * 1.959f
		     + state->e[(idx - 3) & DITHER_BUF_MASK] * 1.590f
		     - state->e[(idx - 4) & DITHER_BUF_MASK] * 0.6149f;
		xp = xe + r - rm1; /* high-passed dither */
		rm1 = r;

		float_16_scaled (xp, tmp);

		/* Intrinsic z^-1 delay */
		idx = (idx + 1) & DITHER_BUF_MASK;
		/* NOTE(review): the stored error uses the pre-quantization value
		   xp here, while the _sS variant uses the quantized output
		   sample — confirm which is intended */
		state->e[idx] = xp - xe;

		/* store the two bytes in reverse of host order */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(tmp>>8);
		dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(tmp);
		dst[1]=(char)(tmp>>8);
#endif
		dst += dst_skip;
		src++;
	}
	state->rm1 = rm1;
	state->idx = idx;
}
/* Convert native floats to host-endian 16-bit samples with noise-shaped
 * triangular dither. Uses and updates the per-channel error history in
 * "state" (e[], rm1, idx).
 */
void sample_move_dither_shaped_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t x;
	jack_default_audio_sample_t xe; /* the input sample - filtered error */
	jack_default_audio_sample_t xp; /* x' */
	float r;
	float rm1 = state->rm1;
	unsigned int idx = state->idx;

	while (nsamples--) {
		x = *src * SAMPLE_16BIT_SCALING;
		/* triangular noise in [-1, +1) */
		r = ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
		/* Filter the error with Lipshitz's minimally audible FIR:
		   [2.033 -2.165 1.959 -1.590 0.6149] */
		xe = x
		     - state->e[idx] * 2.033f
		     + state->e[(idx - 1) & DITHER_BUF_MASK] * 2.165f
		     - state->e[(idx - 2) & DITHER_BUF_MASK] * 1.959f
		     + state->e[(idx - 3) & DITHER_BUF_MASK] * 1.590f
		     - state->e[(idx - 4) & DITHER_BUF_MASK] * 0.6149f;
		xp = xe + r - rm1; /* high-passed dither */
		rm1 = r;

		float_16_scaled (xp, *((int16_t*) dst));

		/* Intrinsic z^-1 delay */
		idx = (idx + 1) & DITHER_BUF_MASK;
		/* error fed back = quantized output minus pre-quantization input */
		state->e[idx] = *((int16_t*) dst) - xe;

		dst += dst_skip;
		src++;
	}
	state->rm1 = rm1;
	state->idx = idx;
}
/* Convert reverse-endian signed 16-bit samples to native floats.
 * src_skip is in bytes.
 */
void sample_move_dS_s16s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
	short z;
	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	const float32x4_t vscaling = vdupq_n_f32(scaling);
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
		int16x4_t source16x4;	/* NOTE(review): lane loads in the default
					   case read this before full
					   initialization — confirm */
		switch(src_skip) {
			case 2:
				source16x4 = vld1_s16((int16_t*)src);
				break;
			case 4:
				/* de-interleave stereo: keep every other sample */
				source16x4 = vld2_s16((int16_t*)src).val[0];
				break;
			default:
				source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0);
				source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1);
				source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
				source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
				break;
		}
		/* byte-swap each 16-bit lane (reverse-endian source) */
		source16x4 = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(source16x4)));
		int32x4_t source32x4 = vmovl_s16(source16x4);
		src += 4 * src_skip;

		float32x4_t converted = vcvtq_f32_s32(source32x4);
		float32x4_t scaled = vmulq_f32(converted, vscaling);
		vst1q_f32(dst, scaled);
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	/* ALERT: signed sign-extension portability !!! */
	while (nsamples--) {
		/* assemble the 16-bit value from swapped bytes */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		z = (unsigned char)(src[0]);
		z <<= 8;
		z |= (unsigned char)(src[1]);
#elif __BYTE_ORDER == __BIG_ENDIAN
		z = (unsigned char)(src[1]);
		z <<= 8;
		z |= (unsigned char)(src[0]);
#endif
		*dst = z * scaling;
		dst++;
		src += src_skip;
	}
}
/* Convert host-endian signed 16-bit samples to native floats.
 * src_skip is in bytes.
 */
void sample_move_dS_s16 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
	/* ALERT: signed sign-extension portability !!! */
	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	const float32x4_t vscaling = vdupq_n_f32(scaling);
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
		int16x4_t source16x4;	/* NOTE(review): lane loads in the default
					   case read this before full
					   initialization — confirm */
		switch(src_skip) {
			case 2:
				source16x4 = vld1_s16((int16_t*)src);
				break;
			case 4:
				/* de-interleave stereo: keep every other sample */
				source16x4 = vld2_s16((int16_t*)src).val[0];
				break;
			default:
				source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0);
				source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1);
				source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
				source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
				break;
		}
		/* widen to 32 bits (sign-extending) before the float convert */
		int32x4_t source32x4 = vmovl_s16(source16x4);
		src += 4 * src_skip;

		float32x4_t converted = vcvtq_f32_s32(source32x4);
		float32x4_t scaled = vmulq_f32(converted, vscaling);
		vst1q_f32(dst, scaled);
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	while (nsamples--) {
		*dst = (*((short *) src)) * scaling;
		dst++;
		src += src_skip;
	}
}
/* Fill interleaved destination cells with val. "bytes" counts payload
 * bytes (not the skipped span); unit_bytes is the cell width and
 * skip_bytes the distance between consecutive cells.
 */
void memset_interleave (char *dst, char val, unsigned long bytes,
			unsigned long unit_bytes,
			unsigned long skip_bytes)
{
	if (unit_bytes == 1) {
		for (; bytes > 0; --bytes, dst += skip_bytes)
			*dst = val;
	} else if (unit_bytes == 2) {
		for (; bytes > 0; bytes -= 2, dst += skip_bytes)
			*((short *) dst) = (short) val;
	} else if (unit_bytes == 4) {
		for (; bytes > 0; bytes -= 4, dst += skip_bytes)
			*((int *) dst) = (int) val;
	} else {
		/* arbitrary cell width: fall back to memset per cell */
		for (; bytes > 0; bytes -= unit_bytes, dst += skip_bytes)
			memset (dst, val, unit_bytes);
	}
}
1144 /* COPY FUNCTIONS: used to move data from an input channel to an
1145 output channel. Note that we assume that the skip distance
1146 is the same for both channels. This is completely fine
1147 unless the input and output were on different audio interfaces that
1148 were interleaved differently. We don't try to handle that.
/* Signature-compatible stand-in for the interleaving copy functions:
 * performs a plain contiguous copy and ignores the skip arguments.
 */
void
memcpy_fake (char *dst, char *src, unsigned long src_bytes, unsigned long foo, unsigned long bar)
{
	(void) foo;	/* skip distances are irrelevant for a straight copy */
	(void) bar;
	memcpy (dst, src, src_bytes);
}
1157 void
1158 memcpy_interleave_d16_s16 (char *dst, char *src, unsigned long src_bytes,
1159 unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
1161 while (src_bytes) {
1162 *((short *) dst) = *((short *) src);
1163 dst += dst_skip_bytes;
1164 src += src_skip_bytes;
1165 src_bytes -= 2;
1169 void
1170 memcpy_interleave_d24_s24 (char *dst, char *src, unsigned long src_bytes,
1171 unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
1173 while (src_bytes) {
1174 memcpy(dst, src, 3);
1175 dst += dst_skip_bytes;
1176 src += src_skip_bytes;
1177 src_bytes -= 3;
1181 void
1182 memcpy_interleave_d32_s32 (char *dst, char *src, unsigned long src_bytes,
1183 unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
1185 while (src_bytes) {
1186 *((int *) dst) = *((int *) src);
1187 dst += dst_skip_bytes;
1188 src += src_skip_bytes;
1189 src_bytes -= 4;