Fix looping... to some extent.
[calfbox.git] / sampler_gen.c
blob5259743dad3f29a11396f187978c320e05d1d23f
/*
    Calf Box, an open source musical instrument.
    Copyright (C) 2010-2013 Krzysztof Foltman

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
19 #include "config-api.h"
20 #include "dspmath.h"
21 #include "errors.h"
22 #include "midi.h"
23 #include "module.h"
24 #include "rt.h"
25 #include "sampler.h"
26 #include "sfzloader.h"
27 #include <assert.h>
28 #include <errno.h>
29 #include <math.h>
30 #include <memory.h>
31 #include <stdio.h>
// When non-zero, the scalar resampling kernels use two-point linear
// interpolation instead of the four-point cubic polynomial.
#define LOW_QUALITY_INTERPOLATION 0

// Rendering state handed to the resampling kernels in this file.
struct resampler_state
{
    // Interleaved output: frame i is written to [2 * i] (left) and [2 * i + 1] (right).
    float *leftright;
    // Index of the next output frame to be written.
    int offset;
    // Gains applied to the left/right output of the next frame.
    float lgain;
    float rgain;
    // Increments added to lgain/rgain after every rendered frame.
    float lgain_delta;
    float rgain_delta;
};
42 #if USE_NEON
44 #include <arm_neon.h>
// NEON kernel for mono 16-bit sources.
// Renders output frames rs->offset .. endpos - 1 into rs->leftright by
// four-point polynomial interpolation of srcdata, starting at the 32.32
// fixed-point position v->bigpos and stepping by v->bigdelta per frame.
// On return v->bigpos, rs->lgain, rs->rgain and rs->offset reflect the
// state after the last rendered frame. No bounds or loop handling is done
// here; the caller chooses endpos.
static inline void process_voice_mono_noloop(struct sampler_gen *v, struct resampler_state *rs, const int16_t *srcdata, int endpos)
{
    // Per-tap constants of the interpolation polynomial, two taps per
    // vector: the "_lo" vectors cover taps 0 and 1, "_hi" taps 2 and 3.
    static const float32x2_t add1_lo = {0.f, 1.f};
    static const float32x2_t add1_hi = {1.f, 1.f};
    static const float32x2_t add2_lo = {-1.f, -1.f};
    static const float32x2_t add2_hi = {0.f, 0.f};
    static const float32x2_t add3_lo = {-2.f, -2.f};
    static const float32x2_t add3_hi = {-2.f, -1.f};
    static const float32x2_t scale_lo = {-1 / 6.0, 3 / 6.0};
    static const float32x2_t scale_hi = {-3 / 6.0, 1 / 6.0};

    uint64x1_t pos = v->bigpos;
    uint64x1_t delta = v->bigdelta;
    float32x2_t gains = {rs->lgain, rs->rgain};
    const float32x2_t gaindeltas = {rs->lgain_delta, rs->rgain_delta};

    for (uint32_t i = rs->offset; i < endpos; i++)
    {
        // Both 32-bit halves of the position converted to float with 32
        // fractional bits; element 0 is used below as the interpolation
        // parameter.
        float32x2_t pos_halves = vcvt_n_f32_u32(vreinterpret_u32_u64(pos), 32);

        // Four consecutive source samples, widened to 32 bits.
        int32x4_t taps = vmovl_s16(vld1_s16(&srcdata[pos >> 32]));
        pos = vadd_u64(pos, delta);

        float32x2_t t = vdup_n_f32(pos_halves[0]);
        float32x2_t taps_lo = vcvt_f32_s32(vget_low_s32(taps));
        float32x2_t taps_hi = vcvt_f32_s32(vget_high_s32(taps));

        // Tap weights: (t + add1) * (t + add2) * ((t + add3) * scale).
        float32x2_t prod12_lo = vmul_f32(vadd_f32(t, add1_lo), vadd_f32(t, add2_lo));
        float32x2_t prod3s_lo = vmul_f32(vadd_f32(t, add3_lo), scale_lo);
        float32x2_t weights_lo = vmul_f32(prod12_lo, prod3s_lo);

        float32x2_t prod12_hi = vmul_f32(vadd_f32(t, add1_hi), vadd_f32(t, add2_hi));
        float32x2_t prod3s_hi = vmul_f32(vadd_f32(t, add3_hi), scale_hi);
        float32x2_t weights_hi = vmul_f32(prod12_hi, prod3s_hi);

        // Two partial sums; adding the vector to its element-swapped copy
        // puts the full four-tap sum in both elements.
        float32x2_t partial = vmla_f32(vmul_f32(taps_lo, weights_lo), taps_hi, weights_hi);
        float32x2_t frame = vmul_f32(gains, vadd_f32(partial, vrev64_f32(partial)));
        gains = vadd_f32(gains, gaindeltas);

        rs->leftright[2 * i] = frame[0];
        rs->leftright[2 * i + 1] = frame[1];
    }

    rs->lgain = gains[0];
    rs->rgain = gains[1];
    v->bigpos = pos;
    rs->offset = endpos;
}
// NEON kernel for interleaved stereo 16-bit sources.
// Renders output frames rs->offset .. endpos - 1 into rs->leftright by
// four-point polynomial interpolation of each channel of srcdata, starting
// at the 32.32 fixed-point position v->bigpos and stepping by v->bigdelta
// per frame. On return v->bigpos, rs->lgain, rs->rgain and rs->offset
// reflect the state after the last rendered frame. No bounds or loop
// handling is done here; the caller chooses endpos.
static inline void process_voice_stereo_noloop(struct sampler_gen *v, struct resampler_state *rs, const int16_t *srcdata, int endpos)
{
    // Per-tap constants of the interpolation polynomial, two taps per
    // vector: the "_lo" vectors cover taps 0 and 1, "_hi" taps 2 and 3.
    static const float32x2_t add1_lo = {0.f, 1.f};
    static const float32x2_t add1_hi = {1.f, 1.f};
    static const float32x2_t add2_lo = {-1.f, -1.f};
    static const float32x2_t add2_hi = {0.f, 0.f};
    static const float32x2_t add3_lo = {-2.f, -2.f};
    static const float32x2_t add3_hi = {-2.f, -1.f};
    static const float32x2_t scale_lo = {-1 / 6.0, 3 / 6.0};
    static const float32x2_t scale_hi = {-3 / 6.0, 1 / 6.0};

    uint64x1_t pos = v->bigpos;
    uint64x1_t delta = v->bigdelta;
    float32x2_t gains = {rs->lgain, rs->rgain};
    const float32x2_t gaindeltas = {rs->lgain_delta, rs->rgain_delta};

    for (uint32_t i = rs->offset; i < endpos; i++)
    {
        // Both 32-bit halves of the position converted to float with 32
        // fractional bits; element 0 is used below as the interpolation
        // parameter.
        float32x2_t pos_halves = vcvt_n_f32_u32(vreinterpret_u32_u64(pos), 32);

        // De-interleaving load of four frames: val[0] holds the even
        // (left) samples, val[1] the odd (right) ones. The index is the
        // frame number times two.
        int16x4x2_t frames = vld2_s16(&srcdata[(pos >> 31) &~ 1]);
        pos = vadd_u64(pos, delta);
        int32x4_t left_taps = vmovl_s16(frames.val[0]);
        int32x4_t right_taps = vmovl_s16(frames.val[1]);

        float32x2_t t = vdup_n_f32(pos_halves[0]);
        float32x2_t left_lo = vcvt_f32_s32(vget_low_s32(left_taps));
        float32x2_t left_hi = vcvt_f32_s32(vget_high_s32(left_taps));
        float32x2_t right_lo = vcvt_f32_s32(vget_low_s32(right_taps));
        float32x2_t right_hi = vcvt_f32_s32(vget_high_s32(right_taps));

        // Tap weights: (t + add1) * (t + add2) * ((t + add3) * scale).
        float32x2_t prod12_lo = vmul_f32(vadd_f32(t, add1_lo), vadd_f32(t, add2_lo));
        float32x2_t prod3s_lo = vmul_f32(vadd_f32(t, add3_lo), scale_lo);
        float32x2_t weights_lo = vmul_f32(prod12_lo, prod3s_lo);

        float32x2_t prod12_hi = vmul_f32(vadd_f32(t, add1_hi), vadd_f32(t, add2_hi));
        float32x2_t prod3s_hi = vmul_f32(vadd_f32(t, add3_hi), scale_hi);
        float32x2_t weights_hi = vmul_f32(prod12_hi, prod3s_hi);

        // Two partial sums per channel; the transpose pairs them up so one
        // addition yields {left sum, right sum}.
        float32x2_t left_partial = vmla_f32(vmul_f32(left_lo, weights_lo), left_hi, weights_hi);
        float32x2_t right_partial = vmla_f32(vmul_f32(right_lo, weights_lo), right_hi, weights_hi);
        float32x2x2_t paired = vtrn_f32(left_partial, right_partial);
        float32x2_t frame = vmul_f32(gains, vadd_f32(paired.val[0], paired.val[1]));
        gains = vadd_f32(gains, gaindeltas);

        rs->leftright[2 * i] = frame[0];
        rs->leftright[2 * i + 1] = frame[1];
    }

    rs->lgain = gains[0];
    rs->rgain = gains[1];
    v->bigpos = pos;
    rs->offset = endpos;
}
118 #elif USE_SSE
120 #include <xmmintrin.h>
// Four-lane single-precision vector used by the SSE kernels.
typedef __m128 V4SF;

// Per-tap constants of the four-point interpolation polynomial: with the
// interpolation parameter t broadcast to all lanes, the SSE kernels compute
// the tap weights as (t + shift1) * (t + shift2) * (t + shift3) * scaling
// and apply the common 1/6 factor afterwards.
static const V4SF shift1 = {0.f, 1.f, 1.f, 1.f};
static const V4SF shift2 = {-1.f, -1.f, 0.f, 0.f};
static const V4SF shift3 = {-2.f, -2.f, -2.f, -1.f};
static const V4SF scaling = {-1.f, 3.f, -3.f, 1.f};
// All-zero vector, used as the pass-through operand of the int-to-float conversion.
static const V4SF zero = {0.f, 0.f, 0.f, 0.f};
131 static inline void process_voice_mono_noloop(struct sampler_gen *v, struct resampler_state *rs, const int16_t *srcdata, int endpos)
133 uint64_t pos = v->bigpos;
134 const float ffrac = 1.0f / 6.0f;
135 const float _scaler = 1.f / (128.f * 16777216.f);
136 for (int i = rs->offset; i < endpos; i++)
138 //float t = ((pos >> 8) & 0x00FFFFFF) * scaler;
139 const int16_t *p = &srcdata[pos >> 32];
141 V4SF t2 = __builtin_ia32_cvtsi2ss(zero, (pos & 0xFFFFFFFF) >> 1) * _scaler;
142 pos += v->bigdelta;
144 V4SF t4 = __builtin_ia32_shufps(t2, t2, 0);
145 V4SF v4mul = (t4 + shift1) * (t4 + shift2) * (t4 + shift3) * scaling;
146 V4SF samples = {p[0], p[1], p[2], p[3]};
147 v4mul = __builtin_ia32_mulps(samples, v4mul);
149 float c = (v4mul[0] + v4mul[1] + v4mul[2] + v4mul[3]) * ffrac;
151 rs->leftright[2 * i] = rs->lgain * c;
152 rs->leftright[2 * i + 1] = rs->rgain * c;
153 rs->lgain += rs->lgain_delta;
154 rs->rgain += rs->rgain_delta;
156 v->bigpos = pos;
157 rs->offset = endpos;
160 static inline void process_voice_stereo_noloop(struct sampler_gen *v, struct resampler_state *rs, const int16_t *srcdata, int endpos)
162 uint64_t pos = v->bigpos;
163 const float ffrac = 1.0f / 6.0f;
164 const float _scaler = 1.f / (128.f * 16777216.f);
165 for (int i = rs->offset; i < endpos; i++)
167 //float t = ((pos >> 8) & 0x00FFFFFF) * scaler;
168 const int16_t *p = &srcdata[(pos >> 31) & ~1];
170 V4SF t2 = __builtin_ia32_cvtsi2ss(zero, (pos & 0xFFFFFFFF) >> 1) * _scaler;
171 pos += v->bigdelta;
173 V4SF t4 = __builtin_ia32_shufps(t2, t2, 0);
174 V4SF v4mul = (t4 + shift1) * (t4 + shift2) * (t4 + shift3) * scaling;
175 V4SF samples_left = {p[0], p[2], p[4], p[6]};
176 samples_left = __builtin_ia32_mulps(samples_left, v4mul);
177 V4SF samples_right = {p[1], p[3], p[5], p[7]};
178 samples_right = __builtin_ia32_mulps(samples_right, v4mul);
180 float cl = (samples_left[0] + samples_left[1] + samples_left[2] + samples_left[3]) * ffrac;
181 float cr = (samples_right[0] + samples_right[1] + samples_right[2] + samples_right[3]) * ffrac;
183 rs->leftright[2 * i] = rs->lgain * cl;
184 rs->leftright[2 * i + 1] = rs->rgain * cr;
185 rs->lgain += rs->lgain_delta;
186 rs->rgain += rs->rgain_delta;
188 v->bigpos = pos;
189 rs->offset = endpos;
192 #else
194 static inline void process_voice_mono_noloop(struct sampler_gen *v, struct resampler_state *rs, const int16_t *srcdata, int endpos)
196 const float ffrac = 1.0f / 6.0f;
197 const float scaler = 1.f / 16777216.f;
199 for (int i = rs->offset; i < endpos; i++)
201 float t = ((v->bigpos >> 8) & 0x00FFFFFF) * scaler;
202 const int16_t *p = &srcdata[v->bigpos >> 32];
203 #if LOW_QUALITY_INTERPOLATION
204 float c = (1.f - t) * p[1] + t * p[2];
205 #else
206 float b0 = -t*(t-1.f)*(t-2.f);
207 float b1 = 3.f*(t+1.f)*(t-1.f)*(t-2.f);
208 float c = (b0 * p[0] + b1 * p[1] - 3.f*(t+1.f)*t*(t-2.f) * p[2] + (t+1.f)*t*(t-1.f) * p[3]) * ffrac;
209 #endif
210 rs->leftright[2 * i] = rs->lgain * c;
211 rs->leftright[2 * i + 1] = rs->rgain * c;
212 rs->lgain += rs->lgain_delta;
213 rs->rgain += rs->rgain_delta;
214 v->bigpos += v->bigdelta;
216 rs->offset = endpos;
219 static inline void process_voice_stereo_noloop(struct sampler_gen *v, struct resampler_state *rs, const int16_t *srcdata, int endpos)
221 const float ffrac = 1.0f / 6.0f;
222 const float scaler = 1.f / 16777216.f;
224 for (int i = rs->offset; i < endpos; i++)
226 float t = ((v->bigpos >> 8) & 0x00FFFFFF) * scaler;
227 const int16_t *p = &srcdata[(v->bigpos >> 31) & ~1];
228 #if LOW_QUALITY_INTERPOLATION
229 float c0 = (1.f - t) * p[2] + t * p[4];
230 float c1 = (1.f - t) * p[3] + t * p[5];
231 #else
232 float b0 = -t*(t-1.f)*(t-2.f);
233 float b1 = 3.f*(t+1.f)*(t-1.f)*(t-2.f);
234 float c0 = (b0 * p[0] + b1 * p[2] - 3.f*(t+1.f)*t*(t-2.f) * p[4] + (t+1.f)*t*(t-1.f) * p[6]) * ffrac;
235 float c1 = (b0 * p[1] + b1 * p[3] - 3.f*(t+1.f)*t*(t-2.f) * p[5] + (t+1.f)*t*(t-1.f) * p[7]) * ffrac;
236 #endif
237 rs->leftright[2 * i] = rs->lgain * c0;
238 rs->leftright[2 * i + 1] = rs->rgain * c1;
239 rs->lgain += rs->lgain_delta;
240 rs->rgain += rs->rgain_delta;
241 v->bigpos += v->bigdelta;
243 rs->offset = endpos;
246 #endif
248 static inline uint32_t process_voice_noloop(struct sampler_gen *v, struct resampler_state *rs, const int16_t *srcdata, uint32_t pos_offset, uint32_t usable_sample_end)
250 uint32_t out_frames = CBOX_BLOCK_SIZE - rs->offset;
252 uint64_t sample_end64 = ((uint64_t)usable_sample_end) << 32;
253 // Check how many frames can be written to output buffer without going
254 // past usable_sample_end.
255 if (__builtin_expect(v->bigpos + (out_frames - 1) * v->bigdelta >= sample_end64, 0))
256 out_frames = (sample_end64 - v->bigpos) / v->bigdelta + 1;
258 assert(out_frames > 0 && out_frames <= CBOX_BLOCK_SIZE - rs->offset);
259 uint32_t oldpos = v->bigpos >> 32;
260 if (v->mode == spt_stereo16)
261 process_voice_stereo_noloop(v, rs, srcdata - (pos_offset << 1), rs->offset + out_frames);
262 else
263 process_voice_mono_noloop(v, rs, srcdata - pos_offset, rs->offset + out_frames);
264 return (v->bigpos >> 32) - oldpos;
267 static void process_voice_withloop(struct sampler_gen *v, struct resampler_state *rs)
269 // This is the first frame where interpolation will cross the loop boundary
270 uint32_t loop_end = v->loop_end;
271 uint32_t loop_edge = loop_end - MAX_INTERPOLATION_ORDER;
273 while ( rs->offset < CBOX_BLOCK_SIZE ) {
274 uint64_t startframe = v->bigpos >> 32;
276 int16_t *source_data = v->sample_data;
277 uint32_t source_offset = 0;
278 uint32_t usable_sample_end = loop_edge;
279 // if the first frame to play is already within 3 frames of loop end
280 // (we need consecutive 4 frames for cubic interpolation) then
281 // "straighten out" the area around the loop, and play that
282 if (__builtin_expect(startframe >= loop_edge, 0))
284 // if fully past the loop end, then it's normal wraparound
285 // (or end of the sample if not looping)
286 if (startframe >= loop_end)
288 if (v->loop_start == (uint32_t)-1)
290 v->mode = spt_inactive;
291 return;
293 v->play_count++;
294 if (v->loop_count && v->play_count >= v->loop_count)
296 v->mode = spt_inactive;
297 return;
299 v->bigpos -= (uint64_t)(loop_end - v->loop_start) << 32;
300 continue;
303 usable_sample_end = loop_end;
304 source_data = v->scratch;
305 source_offset = loop_edge;
308 process_voice_noloop(v, rs, source_data, source_offset, usable_sample_end);
312 static void process_voice_streaming(struct sampler_gen *v, struct resampler_state *rs, uint32_t limit)
314 if (v->consumed_credit > 0)
316 if (v->consumed_credit >= limit)
318 v->consumed_credit -= limit;
319 return;
321 limit -= v->consumed_credit;
322 v->consumed_credit = 0;
324 // This is the first frame where interpolation will cross the loop boundary
325 int16_t scratch[2 * MAX_INTERPOLATION_ORDER * 2];
327 while ( limit && rs->offset < CBOX_BLOCK_SIZE ) {
328 uint64_t startframe = v->bigpos >> 32;
330 int16_t *source_data = v->in_streaming_buffer ? v->streaming_buffer : v->sample_data;
331 uint32_t loop_start = v->in_streaming_buffer ? 0 : v->loop_start;
332 uint32_t loop_end = v->in_streaming_buffer ? v->streaming_buffer_frames : v->loop_end;
333 uint32_t loop_edge = loop_end - MAX_INTERPOLATION_ORDER;
334 uint32_t source_offset = 0;
335 uint32_t usable_sample_end = loop_edge;
336 // if the first frame to play is already within 3 frames of loop end
337 // (we need consecutive 4 frames for cubic interpolation) then
338 // "straighten out" the area around the loop, and play that
339 if (startframe >= loop_edge)
341 // if fully past the loop end, then it's normal wraparound
342 // (or end of the sample if not looping)
343 if (startframe >= loop_end)
345 if (v->loop_start == (uint32_t)-1)
347 v->mode = spt_inactive;
348 return;
350 v->bigpos -= (uint64_t)(loop_end - loop_start) << 32;
351 if (v->prefetch_only_loop)
352 v->consumed -= (loop_end - loop_start);
353 else
354 v->in_streaming_buffer = TRUE;
355 continue;
358 int shift = (v->mode == spt_stereo16) ? 1 : 0;
360 // 'linearize' the virtual circular buffer - write 3 (or N) frames before end of the loop
361 // and 3 (N) frames at the start of the loop, and play it; in rare cases this will need to be
362 // repeated twice if output write pointer is close to CBOX_BLOCK_SIZE or playback rate is very low,
363 // but that's OK.
364 uint32_t halfscratch = MAX_INTERPOLATION_ORDER << shift;
365 memcpy(&scratch[0], &source_data[loop_edge << shift], halfscratch * sizeof(int16_t) );
366 if (v->loop_start == (uint32_t)-1)
367 memset(scratch + halfscratch, 0, halfscratch * sizeof(int16_t));
368 else
369 memcpy(scratch + halfscratch, &v->streaming_buffer[v->loop_start << shift], halfscratch * sizeof(int16_t));
371 usable_sample_end = loop_end;
372 source_data = scratch;
373 source_offset = loop_edge;
375 if (limit != (uint32_t)-1 && usable_sample_end - startframe > limit)
376 usable_sample_end = startframe + limit;
378 uint32_t consumed = process_voice_noloop(v, rs, source_data, source_offset, usable_sample_end);
379 if (consumed > limit)
381 // The number of frames 'consumed' may be greater than the amount
382 // available because of sample-skipping (at least that's the only
383 // *legitimate* reason). This should be accounted for in the,
384 // consumed sample counter (hence temporary storage of the
385 // 'buffer overconsumption' in the consumed_credit field), but is not
386 // actually causing any use of missing data, as the missing samples
387 // have been skipped.
388 assert(v->consumed_credit == 0);
389 v->consumed_credit = consumed - limit;
390 assert (v->consumed_credit <= 1 + (v->bigdelta >> 32));
391 consumed = limit;
393 v->consumed += consumed;
394 if (consumed < limit)
395 limit -= consumed;
396 else
397 break;
401 void sampler_gen_reset(struct sampler_gen *v)
403 v->mode = spt_inactive;
404 v->bigpos = 0;
405 v->last_lgain = 0.f;
406 v->last_rgain = 0.f;
407 v->play_count = 0;
408 v->consumed = 0;
409 v->consumed_credit = 0;
410 v->streaming_buffer = NULL;
411 v->in_streaming_buffer = FALSE;
412 v->prefetch_only_loop = FALSE;
413 v->fadein_counter = -1.f;
416 uint32_t sampler_gen_sample_playback(struct sampler_gen *v, float *leftright, uint32_t limit)
418 struct resampler_state rs;
419 rs.leftright = leftright;
420 rs.offset = 0;
421 rs.lgain = v->last_lgain;
422 rs.rgain = v->last_rgain;
423 rs.lgain_delta = (v->lgain - v->last_lgain) * (1.f / CBOX_BLOCK_SIZE);
424 rs.rgain_delta = (v->rgain - v->last_rgain) * (1.f / CBOX_BLOCK_SIZE);
425 if (v->streaming_buffer)
426 process_voice_streaming(v, &rs, limit);
427 else
429 process_voice_withloop(v, &rs);
431 uint32_t written = rs.offset;
433 if (!v->streaming_buffer)
435 v->virtpos += written * v->virtdelta;
436 if (v->virtpos != v->bigpos)
438 while ((v->virtpos >> 32) >= v->loop_end && v->loop_start != -1)
439 v->virtpos -= ((uint64_t)(v->loop_end - v->loop_start)) << 32;
441 // XXXKF looping
442 if (v->fadein_counter == -1 && fabs((v->bigpos - v->virtpos) / (65536.0 * 65536.0)) > v->stretching_jump)
444 int64_t jump = (int64_t)(v->stretching_jump * 65536.0 * 65536.0);
445 int64_t newpos = v->bigpos > v->virtpos ? v->bigpos - jump : v->bigpos + jump;
446 if (newpos < 0)
447 newpos = 0;
448 // XXXKF beware of extremely short loops
449 while ((newpos >> 32) >= v->loop_end && v->loop_start != -1)
450 newpos -= ((uint64_t)(v->loop_end - v->loop_start)) << 32;
451 if ((newpos >> 32) >= v->cur_sample_end - 4)
452 newpos = ((uint64_t)v->cur_sample_end - 4)<< 32;
453 v->fadein_pos = newpos;
454 v->fadein_counter = 0;
456 else if (v->fadein_counter != -1)
458 float leftright_fadein[2 * CBOX_BLOCK_SIZE];
460 rs.offset = 0;
461 rs.leftright = leftright_fadein;
462 rs.lgain = v->last_lgain;
463 rs.rgain = v->last_rgain;
465 uint64_t oldpos = v->bigpos;
466 v->bigpos = v->fadein_pos;
467 process_voice_withloop(v, &rs);
468 v->fadein_pos = v->bigpos;
469 v->bigpos = oldpos;
471 uint32_t written2 = rs.offset;
473 // XXXKF not the best set of special cases
474 int i;
475 if (written2 > written)
477 for (i = 2 * written; i < 2 * written2; i += 2)
478 leftright[i] = leftright[i + 1] = 0.f;
479 written = written2;
481 if (written2 < written)
483 for (i = 2 * written2; i < 2 * written; i += 2)
484 leftright_fadein[i] = leftright_fadein[i + 1] = 0.f;
485 written2 = written;
487 float cnt = v->fadein_counter;
488 float scl = v->bigdelta / (v->stretching_crossfade * v->virtdelta);
489 for (i = 0; i < 2 * written2; i += 2)
491 leftright[i] += (leftright_fadein[i] - leftright[i]) * cnt;
492 leftright[i + 1] += (leftright_fadein[i + 1] - leftright[i + 1]) * cnt;
493 cnt += scl;
494 if (cnt > 1.f)
495 cnt = 1.f;
497 if (cnt >= 1.f)
499 cnt = -1.f;
500 v->bigpos = v->fadein_pos;
502 v->fadein_counter = cnt;
505 v->last_lgain = v->lgain;
506 v->last_rgain = v->rgain;
507 return written;